1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
 20 /* Currently, the first mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
 43    First of all, experiments showed that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
 53    contain one.  (If divisions don't trap, we can instead insert
 54    them elsewhere, in blocks that are common dominators of those
 55    that contain a division.)
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
 59    that has fewer than N divisions *post-dominating* it.
61 The algorithm constructs a subset of the dominator tree, holding the
 62    blocks containing the divisions and their common dominators,
 63    and walks it twice.  The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
69 In the best case, the cost of the pass is O(n_statements). In the
 70    worst case, the cost is due to creating the dominator tree subset,
 71    with a cost of O(n_basic_blocks ^ 2); however, this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "params.h"
113 #include "internal-fn.h"
114 #include "case-cfn-macros.h"
115 #include "optabs-libfuncs.h"
116 #include "tree-eh.h"
117 #include "targhooks.h"
119 /* This structure represents one basic block that either computes a
 120    division, or is a common dominator for basic blocks that compute a
121 division. */
122 struct occurrence {
123 /* The basic block represented by this structure. */
124 basic_block bb;
126 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
127 inserted in BB. */
128 tree recip_def;
130 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
131 was inserted in BB. */
132 gimple *recip_def_stmt;
134 /* Pointer to a list of "struct occurrence"s for blocks dominated
135 by BB. */
136 struct occurrence *children;
 138   /* Pointer to the next "struct occurrence" in the list of blocks
139 sharing a common dominator. */
140 struct occurrence *next;
142 /* The number of divisions that are in BB before compute_merit. The
143 number of divisions that are in BB or post-dominate it after
144 compute_merit. */
145 int num_divisions;
147 /* True if the basic block has a division, false if it is a common
148 dominator for basic blocks that do. If it is false and trapping
149 math is active, BB is not a candidate for inserting a reciprocal. */
150 bool bb_has_division;
153 static struct
155 /* Number of 1.0/X ops inserted. */
156 int rdivs_inserted;
158 /* Number of 1.0/FUNC ops inserted. */
159 int rfuncs_inserted;
160 } reciprocal_stats;
162 static struct
164 /* Number of cexpi calls inserted. */
165 int inserted;
166 } sincos_stats;
168 static struct
170 /* Number of hand-written 16-bit nop / bswaps found. */
171 int found_16bit;
173 /* Number of hand-written 32-bit nop / bswaps found. */
174 int found_32bit;
176 /* Number of hand-written 64-bit nop / bswaps found. */
177 int found_64bit;
178 } nop_stats, bswap_stats;
180 static struct
182 /* Number of widening multiplication ops inserted. */
183 int widen_mults_inserted;
185 /* Number of integer multiply-and-accumulate ops inserted. */
186 int maccs_inserted;
188 /* Number of fp fused multiply-add ops inserted. */
189 int fmas_inserted;
191 /* Number of divmod calls inserted. */
192 int divmod_calls_inserted;
193 } widen_mul_stats;
195 /* The instance of "struct occurrence" representing the highest
196 interesting block in the dominator tree. */
197 static struct occurrence *occ_head;
199 /* Allocation pool for getting instances of "struct occurrence". */
200 static object_allocator<occurrence> *occ_pool;
 204 /* Allocate and return a new struct occurrence for basic block BB,
205 whose children list is headed by CHILDREN. */
206 static struct occurrence *
207 occ_new (basic_block bb, struct occurrence *children)
209 struct occurrence *occ;
211 bb->aux = occ = occ_pool->allocate ();
212 memset (occ, 0, sizeof (struct occurrence));
214 occ->bb = bb;
215 occ->children = children;
216 return occ;
220 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
221 list of "struct occurrence"s, one per basic block, having IDOM as
222 their common dominator.
224 We try to insert NEW_OCC as deep as possible in the tree, and we also
225 insert any other block that is a common dominator for BB and one
226 block already in the tree. */
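/* A worked illustration (block names invented, not taken from the code
   below): suppose the list at *P_HEAD currently holds a single occurrence
   for block B1, and NEW_OCC is for block B2, where neither block dominates
   the other but both are dominated by some block D distinct from IDOM.
   The nearest common dominator computed below is then D, so the third
   branch fires: B1 is unlinked from the list, a fresh occurrence for D is
   created with children B2 and B1, and the scan continues with D, looking
   for further blocks that D dominates.  */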
228 static void
229 insert_bb (struct occurrence *new_occ, basic_block idom,
230 struct occurrence **p_head)
232 struct occurrence *occ, **p_occ;
234 for (p_occ = p_head; (occ = *p_occ) != NULL; )
236 basic_block bb = new_occ->bb, occ_bb = occ->bb;
237 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
238 if (dom == bb)
240 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
241 from its list. */
242 *p_occ = occ->next;
243 occ->next = new_occ->children;
244 new_occ->children = occ;
246 /* Try the next block (it may as well be dominated by BB). */
249 else if (dom == occ_bb)
251 /* OCC_BB dominates BB. Tail recurse to look deeper. */
252 insert_bb (new_occ, dom, &occ->children);
253 return;
256 else if (dom != idom)
258 gcc_assert (!dom->aux);
260 /* There is a dominator between IDOM and BB, add it and make
261 two children out of NEW_OCC and OCC. First, remove OCC from
262 its list. */
263 *p_occ = occ->next;
264 new_occ->next = occ;
265 occ->next = NULL;
267 /* None of the previous blocks has DOM as a dominator: if we tail
268 recursed, we would reexamine them uselessly. Just switch BB with
269 DOM, and go on looking for blocks dominated by DOM. */
270 new_occ = occ_new (dom, new_occ);
273 else
275 /* Nothing special, go on with the next element. */
276 p_occ = &occ->next;
280 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
281 new_occ->next = *p_head;
282 *p_head = new_occ;
285 /* Register that we found a division in BB. */
287 static inline void
288 register_division_in (basic_block bb)
290 struct occurrence *occ;
292 occ = (struct occurrence *) bb->aux;
293 if (!occ)
295 occ = occ_new (bb, NULL);
296 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
299 occ->bb_has_division = true;
300 occ->num_divisions++;
304 /* Compute the number of divisions that postdominate each block in OCC and
305 its children. */
307 static void
308 compute_merit (struct occurrence *occ)
310 struct occurrence *occ_child;
311 basic_block dom = occ->bb;
313 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
315 basic_block bb;
316 if (occ_child->children)
317 compute_merit (occ_child);
319 if (flag_exceptions)
320 bb = single_noncomplex_succ (dom);
321 else
322 bb = dom;
324 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
325 occ->num_divisions += occ_child->num_divisions;
330 /* Return whether USE_STMT is a floating-point division by DEF. */
331 static inline bool
332 is_division_by (gimple *use_stmt, tree def)
334 return is_gimple_assign (use_stmt)
335 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
336 && gimple_assign_rhs2 (use_stmt) == def
 337 	 /* Do not recognize x / x as a valid division, as we are getting
 338 	    confused later by replacing all immediate uses of x in such
339 a stmt. */
340 && gimple_assign_rhs1 (use_stmt) != def;
343 /* Walk the subset of the dominator tree rooted at OCC, setting the
344 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
345 the given basic block. The field may be left NULL, of course,
346 if it is not possible or profitable to do the optimization.
 348    DEF_GSI is an iterator pointing at the statement defining DEF.
349 If RECIP_DEF is set, a dominator already has a computation that can
350 be used. */
352 static void
353 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
354 tree def, tree recip_def, int threshold)
356 tree type;
357 gassign *new_stmt;
358 gimple_stmt_iterator gsi;
359 struct occurrence *occ_child;
361 if (!recip_def
362 && (occ->bb_has_division || !flag_trapping_math)
363 && occ->num_divisions >= threshold)
365 /* Make a variable with the replacement and substitute it. */
366 type = TREE_TYPE (def);
367 recip_def = create_tmp_reg (type, "reciptmp");
368 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
369 build_one_cst (type), def);
371 if (occ->bb_has_division)
373 /* Case 1: insert before an existing division. */
374 gsi = gsi_after_labels (occ->bb);
375 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
376 gsi_next (&gsi);
378 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
380 else if (def_gsi && occ->bb == def_gsi->bb)
382 /* Case 2: insert right after the definition. Note that this will
383 never happen if the definition statement can throw, because in
384 that case the sole successor of the statement's basic block will
385 dominate all the uses as well. */
386 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
388 else
390 /* Case 3: insert in a basic block not containing defs/uses. */
391 gsi = gsi_after_labels (occ->bb);
392 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
395 reciprocal_stats.rdivs_inserted++;
397 occ->recip_def_stmt = new_stmt;
400 occ->recip_def = recip_def;
401 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
402 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
406 /* Replace the division at USE_P with a multiplication by the reciprocal, if
407 possible. */
409 static inline void
410 replace_reciprocal (use_operand_p use_p)
412 gimple *use_stmt = USE_STMT (use_p);
413 basic_block bb = gimple_bb (use_stmt);
414 struct occurrence *occ = (struct occurrence *) bb->aux;
416 if (optimize_bb_for_speed_p (bb)
417 && occ->recip_def && use_stmt != occ->recip_def_stmt)
419 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
420 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
421 SET_USE (use_p, occ->recip_def);
422 fold_stmt_inplace (&gsi);
423 update_stmt (use_stmt);
428 /* Free OCC and return one more "struct occurrence" to be freed. */
430 static struct occurrence *
431 free_bb (struct occurrence *occ)
433 struct occurrence *child, *next;
435 /* First get the two pointers hanging off OCC. */
436 next = occ->next;
437 child = occ->children;
438 occ->bb->aux = NULL;
439 occ_pool->remove (occ);
441 /* Now ensure that we don't recurse unless it is necessary. */
442 if (!child)
443 return next;
444 else
446 while (next)
447 next = free_bb (next);
449 return child;
454 /* Look for floating-point divisions among DEF's uses, and try to
455 replace them by multiplications with the reciprocal. Add
456 as many statements computing the reciprocal as needed.
458 DEF must be a GIMPLE register of a floating-point type. */
460 static void
461 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
463 use_operand_p use_p;
464 imm_use_iterator use_iter;
465 struct occurrence *occ;
466 int count = 0, threshold;
468 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
470 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
472 gimple *use_stmt = USE_STMT (use_p);
473 if (is_division_by (use_stmt, def))
475 register_division_in (gimple_bb (use_stmt));
476 count++;
480 /* Do the expensive part only if we can hope to optimize something. */
481 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
482 if (count >= threshold)
484 gimple *use_stmt;
485 for (occ = occ_head; occ; occ = occ->next)
487 compute_merit (occ);
488 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
491 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
493 if (is_division_by (use_stmt, def))
495 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
496 replace_reciprocal (use_p);
501 for (occ = occ_head; occ; )
502 occ = free_bb (occ);
504 occ_head = NULL;
507 /* Return an internal function that implements the reciprocal of CALL,
508 or IFN_LAST if there is no such function that the target supports. */
510 internal_fn
511 internal_fn_reciprocal (gcall *call)
513 internal_fn ifn;
515 switch (gimple_call_combined_fn (call))
517 CASE_CFN_SQRT:
518 CASE_CFN_SQRT_FN:
519 ifn = IFN_RSQRT;
520 break;
522 default:
523 return IFN_LAST;
526 tree_pair types = direct_internal_fn_types (ifn, call);
527 if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
528 return IFN_LAST;
530 return ifn;
533 /* Go through all the floating-point SSA_NAMEs, and call
534 execute_cse_reciprocals_1 on each of them. */
535 namespace {
537 const pass_data pass_data_cse_reciprocals =
539 GIMPLE_PASS, /* type */
540 "recip", /* name */
541 OPTGROUP_NONE, /* optinfo_flags */
542 TV_NONE, /* tv_id */
543 PROP_ssa, /* properties_required */
544 0, /* properties_provided */
545 0, /* properties_destroyed */
546 0, /* todo_flags_start */
547 TODO_update_ssa, /* todo_flags_finish */
550 class pass_cse_reciprocals : public gimple_opt_pass
552 public:
553 pass_cse_reciprocals (gcc::context *ctxt)
554 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
557 /* opt_pass methods: */
558 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
559 virtual unsigned int execute (function *);
561 }; // class pass_cse_reciprocals
563 unsigned int
564 pass_cse_reciprocals::execute (function *fun)
566 basic_block bb;
567 tree arg;
569 occ_pool = new object_allocator<occurrence> ("dominators for recip");
571 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
572 calculate_dominance_info (CDI_DOMINATORS);
573 calculate_dominance_info (CDI_POST_DOMINATORS);
575 if (flag_checking)
576 FOR_EACH_BB_FN (bb, fun)
577 gcc_assert (!bb->aux);
579 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
580 if (FLOAT_TYPE_P (TREE_TYPE (arg))
581 && is_gimple_reg (arg))
583 tree name = ssa_default_def (fun, arg);
584 if (name)
585 execute_cse_reciprocals_1 (NULL, name);
588 FOR_EACH_BB_FN (bb, fun)
590 tree def;
592 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
593 gsi_next (&gsi))
595 gphi *phi = gsi.phi ();
596 def = PHI_RESULT (phi);
597 if (! virtual_operand_p (def)
598 && FLOAT_TYPE_P (TREE_TYPE (def)))
599 execute_cse_reciprocals_1 (NULL, def);
602 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
603 gsi_next (&gsi))
605 gimple *stmt = gsi_stmt (gsi);
607 if (gimple_has_lhs (stmt)
608 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
609 && FLOAT_TYPE_P (TREE_TYPE (def))
610 && TREE_CODE (def) == SSA_NAME)
611 execute_cse_reciprocals_1 (&gsi, def);
614 if (optimize_bb_for_size_p (bb))
615 continue;
617 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
618 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
619 gsi_next (&gsi))
621 gimple *stmt = gsi_stmt (gsi);
623 if (is_gimple_assign (stmt)
624 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
626 tree arg1 = gimple_assign_rhs2 (stmt);
627 gimple *stmt1;
629 if (TREE_CODE (arg1) != SSA_NAME)
630 continue;
632 stmt1 = SSA_NAME_DEF_STMT (arg1);
634 if (is_gimple_call (stmt1)
635 && gimple_call_lhs (stmt1))
637 bool fail;
638 imm_use_iterator ui;
639 use_operand_p use_p;
640 tree fndecl = NULL_TREE;
642 gcall *call = as_a <gcall *> (stmt1);
643 internal_fn ifn = internal_fn_reciprocal (call);
644 if (ifn == IFN_LAST)
646 fndecl = gimple_call_fndecl (call);
647 if (!fndecl
648 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_MD)
649 continue;
650 fndecl = targetm.builtin_reciprocal (fndecl);
651 if (!fndecl)
652 continue;
655 /* Check that all uses of the SSA name are divisions,
656 otherwise replacing the defining statement will do
657 the wrong thing. */
658 fail = false;
659 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
661 gimple *stmt2 = USE_STMT (use_p);
662 if (is_gimple_debug (stmt2))
663 continue;
664 if (!is_gimple_assign (stmt2)
665 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
666 || gimple_assign_rhs1 (stmt2) == arg1
667 || gimple_assign_rhs2 (stmt2) != arg1)
669 fail = true;
670 break;
673 if (fail)
674 continue;
676 gimple_replace_ssa_lhs (call, arg1);
677 if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
679 auto_vec<tree, 4> args;
680 for (unsigned int i = 0;
681 i < gimple_call_num_args (call); i++)
682 args.safe_push (gimple_call_arg (call, i));
683 gcall *stmt2;
684 if (ifn == IFN_LAST)
685 stmt2 = gimple_build_call_vec (fndecl, args);
686 else
687 stmt2 = gimple_build_call_internal_vec (ifn, args);
688 gimple_call_set_lhs (stmt2, arg1);
689 if (gimple_vdef (call))
691 gimple_set_vdef (stmt2, gimple_vdef (call));
692 SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2;
694 gimple_call_set_nothrow (stmt2,
695 gimple_call_nothrow_p (call));
696 gimple_set_vuse (stmt2, gimple_vuse (call));
697 gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
698 gsi_replace (&gsi2, stmt2, true);
700 else
702 if (ifn == IFN_LAST)
703 gimple_call_set_fndecl (call, fndecl);
704 else
705 gimple_call_set_internal_fn (call, ifn);
706 update_stmt (call);
708 reciprocal_stats.rfuncs_inserted++;
710 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
712 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
713 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
714 fold_stmt_inplace (&gsi);
715 update_stmt (stmt);
722 statistics_counter_event (fun, "reciprocal divs inserted",
723 reciprocal_stats.rdivs_inserted);
724 statistics_counter_event (fun, "reciprocal functions inserted",
725 reciprocal_stats.rfuncs_inserted);
727 free_dominance_info (CDI_DOMINATORS);
728 free_dominance_info (CDI_POST_DOMINATORS);
729 delete occ_pool;
730 return 0;
733 } // anon namespace
735 gimple_opt_pass *
736 make_pass_cse_reciprocals (gcc::context *ctxt)
738 return new pass_cse_reciprocals (ctxt);
 741 /* Records an occurrence at statement USE_STMT in the vector of statements
 742    STMTS if it is dominated by *TOP_BB or dominates it, or if *TOP_BB is
 743    not yet initialized.  Returns true if the occurrence was pushed on
744 the vector. Adjusts *TOP_BB to be the basic block dominating all
745 statements in the vector. */
747 static bool
748 maybe_record_sincos (vec<gimple *> *stmts,
749 basic_block *top_bb, gimple *use_stmt)
751 basic_block use_bb = gimple_bb (use_stmt);
752 if (*top_bb
753 && (*top_bb == use_bb
754 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
755 stmts->safe_push (use_stmt);
756 else if (!*top_bb
757 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
759 stmts->safe_push (use_stmt);
760 *top_bb = use_bb;
762 else
763 return false;
765 return true;
768 /* Look for sin, cos and cexpi calls with the same argument NAME and
769 create a single call to cexpi CSEing the result in this case.
770 We first walk over all immediate uses of the argument collecting
771 statements that we can CSE in a vector and in a second pass replace
772 the statement rhs with a REALPART or IMAGPART expression on the
773 result of the cexpi call we insert before the use statement that
774 dominates all other candidates. */
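/* For example (SSA names invented for illustration), given

       s_1 = sin (x_0);
       c_2 = cos (x_0);

   a single cexpi call is inserted before the dominating use and the old
   calls are turned into component extractions:

       t_3 = cexpi (x_0);
       s_1 = IMAGPART_EXPR <t_3>;
       c_2 = REALPART_EXPR <t_3>;  */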
776 static bool
777 execute_cse_sincos_1 (tree name)
779 gimple_stmt_iterator gsi;
780 imm_use_iterator use_iter;
781 tree fndecl, res, type;
782 gimple *def_stmt, *use_stmt, *stmt;
783 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
784 auto_vec<gimple *> stmts;
785 basic_block top_bb = NULL;
786 int i;
787 bool cfg_changed = false;
789 type = TREE_TYPE (name);
790 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
792 if (gimple_code (use_stmt) != GIMPLE_CALL
793 || !gimple_call_lhs (use_stmt))
794 continue;
796 switch (gimple_call_combined_fn (use_stmt))
798 CASE_CFN_COS:
799 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
800 break;
802 CASE_CFN_SIN:
803 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
804 break;
806 CASE_CFN_CEXPI:
807 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
808 break;
810 default:;
814 if (seen_cos + seen_sin + seen_cexpi <= 1)
815 return false;
817 /* Simply insert cexpi at the beginning of top_bb but not earlier than
818 the name def statement. */
819 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
820 if (!fndecl)
821 return false;
822 stmt = gimple_build_call (fndecl, 1, name);
823 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
824 gimple_call_set_lhs (stmt, res);
826 def_stmt = SSA_NAME_DEF_STMT (name);
827 if (!SSA_NAME_IS_DEFAULT_DEF (name)
828 && gimple_code (def_stmt) != GIMPLE_PHI
829 && gimple_bb (def_stmt) == top_bb)
831 gsi = gsi_for_stmt (def_stmt);
832 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
834 else
836 gsi = gsi_after_labels (top_bb);
837 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
839 sincos_stats.inserted++;
841 /* And adjust the recorded old call sites. */
842 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
844 tree rhs = NULL;
846 switch (gimple_call_combined_fn (use_stmt))
848 CASE_CFN_COS:
849 rhs = fold_build1 (REALPART_EXPR, type, res);
850 break;
852 CASE_CFN_SIN:
853 rhs = fold_build1 (IMAGPART_EXPR, type, res);
854 break;
856 CASE_CFN_CEXPI:
857 rhs = res;
858 break;
860 default:;
861 gcc_unreachable ();
864 /* Replace call with a copy. */
865 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
867 gsi = gsi_for_stmt (use_stmt);
868 gsi_replace (&gsi, stmt, true);
869 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
870 cfg_changed = true;
873 return cfg_changed;
876 /* To evaluate powi(x,n), the floating point value x raised to the
877 constant integer exponent n, we use a hybrid algorithm that
878 combines the "window method" with look-up tables. For an
879 introduction to exponentiation algorithms and "addition chains",
880 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
881 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
882 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
883 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
885 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
886 multiplications to inline before calling the system library's pow
887 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
888 so this default never requires calling pow, powf or powl. */
890 #ifndef POWI_MAX_MULTS
891 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
892 #endif
894 /* The size of the "optimal power tree" lookup table. All
895 exponents less than this value are simply looked up in the
896 powi_table below. This threshold is also used to size the
897 cache of pseudo registers that hold intermediate results. */
898 #define POWI_TABLE_SIZE 256
 900 /* The size, in bits, of the window used in the "window method"
901 exponentiation algorithm. This is equivalent to a radix of
902 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
903 #define POWI_WINDOW_SIZE 3
905 /* The following table is an efficient representation of an
906 "optimal power tree". For each value, i, the corresponding
 907    value, j, in the table states that an optimal evaluation
908 sequence for calculating pow(x,i) can be found by evaluating
909 pow(x,j)*pow(x,i-j). An optimal power tree for the first
910 100 integers is given in Knuth's "Seminumerical algorithms". */
912 static const unsigned char powi_table[POWI_TABLE_SIZE] =
914 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
915 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
916 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
917 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
918 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
919 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
920 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
921 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
922 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
923 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
924 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
925 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
926 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
927 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
928 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
929 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
930 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
931 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
932 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
933 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
934 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
935 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
936 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
937 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
938 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
939 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
940 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
941 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
942 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
943 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
944 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
945 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
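/* As an illustration of how powi_as_mults_1 below consumes this table
   (temporary names invented): for n = 13 the table gives
   powi_table[13] = 10, powi_table[10] = 5, powi_table[5] = 3,
   powi_table[3] = 2 and powi_table[2] = 1, so powi (x, 13) expands into
   five multiplications:

       t2  = x  * x;      (x**2)
       t3  = x  * t2;     (x**3)
       t5  = t2 * t3;     (x**5)
       t10 = t5 * t5;     (x**10)
       t13 = t3 * t10;    (x**13)

   matching powi_cost (13) == 5.  */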
949 /* Return the number of multiplications required to calculate
950 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
951 subroutine of powi_cost. CACHE is an array indicating
952 which exponents have already been calculated. */
954 static int
955 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
957 /* If we've already calculated this exponent, then this evaluation
958 doesn't require any additional multiplications. */
959 if (cache[n])
960 return 0;
962 cache[n] = true;
963 return powi_lookup_cost (n - powi_table[n], cache)
964 + powi_lookup_cost (powi_table[n], cache) + 1;
967 /* Return the number of multiplications required to calculate
968 powi(x,n) for an arbitrary x, given the exponent N. This
969 function needs to be kept in sync with powi_as_mults below. */
971 static int
972 powi_cost (HOST_WIDE_INT n)
974 bool cache[POWI_TABLE_SIZE];
975 unsigned HOST_WIDE_INT digit;
976 unsigned HOST_WIDE_INT val;
977 int result;
979 if (n == 0)
980 return 0;
982 /* Ignore the reciprocal when calculating the cost. */
983 val = (n < 0) ? -n : n;
985 /* Initialize the exponent cache. */
986 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
987 cache[1] = true;
989 result = 0;
991 while (val >= POWI_TABLE_SIZE)
993 if (val & 1)
995 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
996 result += powi_lookup_cost (digit, cache)
997 + POWI_WINDOW_SIZE + 1;
998 val >>= POWI_WINDOW_SIZE;
1000 else
1002 val >>= 1;
1003 result++;
1007 return result + powi_lookup_cost (val, cache);
1010 /* Recursive subroutine of powi_as_mults. This function takes the
1011 array, CACHE, of already calculated exponents and an exponent N and
1012 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
1014 static tree
1015 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
1016 HOST_WIDE_INT n, tree *cache)
1018 tree op0, op1, ssa_target;
1019 unsigned HOST_WIDE_INT digit;
1020 gassign *mult_stmt;
1022 if (n < POWI_TABLE_SIZE && cache[n])
1023 return cache[n];
1025 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1027 if (n < POWI_TABLE_SIZE)
1029 cache[n] = ssa_target;
1030 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1031 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1033 else if (n & 1)
1035 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1036 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1037 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1039 else
1041 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1042 op1 = op0;
1045 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1046 gimple_set_location (mult_stmt, loc);
1047 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1049 return ssa_target;
1052 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1053 This function needs to be kept in sync with powi_cost above. */
1055 static tree
1056 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1057 tree arg0, HOST_WIDE_INT n)
1059 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1060 gassign *div_stmt;
1061 tree target;
1063 if (n == 0)
1064 return build_real (type, dconst1);
1066 memset (cache, 0, sizeof (cache));
1067 cache[1] = arg0;
1069 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1070 if (n >= 0)
1071 return result;
1073 /* If the original exponent was negative, reciprocate the result. */
1074 target = make_temp_ssa_name (type, NULL, "powmult");
1075 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1076 build_real (type, dconst1), result);
1077 gimple_set_location (div_stmt, loc);
1078 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1080 return target;
1083 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1084 location info LOC. If the arguments are appropriate, create an
1085 equivalent sequence of statements prior to GSI using an optimal
 1086    number of multiplications, and return an expression holding the
1087 result. */
1089 static tree
1090 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1091 tree arg0, HOST_WIDE_INT n)
1093 /* Avoid largest negative number. */
1094 if (n != -n
1095 && ((n >= -1 && n <= 2)
1096 || (optimize_function_for_speed_p (cfun)
1097 && powi_cost (n) <= POWI_MAX_MULTS)))
1098 return powi_as_mults (gsi, loc, arg0, n);
1100 return NULL_TREE;
1103 /* Build a gimple call statement that calls FN with argument ARG.
1104 Set the lhs of the call statement to a fresh SSA name. Insert the
1105 statement prior to GSI's current position, and return the fresh
1106 SSA name. */
1108 static tree
1109 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1110 tree fn, tree arg)
1112 gcall *call_stmt;
1113 tree ssa_target;
1115 call_stmt = gimple_build_call (fn, 1, arg);
1116 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1117 gimple_set_lhs (call_stmt, ssa_target);
1118 gimple_set_location (call_stmt, loc);
1119 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1121 return ssa_target;
1124 /* Build a gimple binary operation with the given CODE and arguments
1125 ARG0, ARG1, assigning the result to a new SSA name for variable
1126 TARGET. Insert the statement prior to GSI's current position, and
 1127    return the fresh SSA name. */
1129 static tree
1130 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1131 const char *name, enum tree_code code,
1132 tree arg0, tree arg1)
1134 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1135 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1136 gimple_set_location (stmt, loc);
1137 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1138 return result;
1141 /* Build a gimple reference operation with the given CODE and argument
1142 ARG, assigning the result to a new SSA name of TYPE with NAME.
1143 Insert the statement prior to GSI's current position, and return
1144 the fresh SSA name. */
1146 static inline tree
1147 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1148 const char *name, enum tree_code code, tree arg0)
1150 tree result = make_temp_ssa_name (type, NULL, name);
1151 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1152 gimple_set_location (stmt, loc);
1153 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1154 return result;
1157 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1158 prior to GSI's current position, and return the fresh SSA name. */
1160 static tree
1161 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1162 tree type, tree val)
1164 tree result = make_ssa_name (type);
1165 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1166 gimple_set_location (stmt, loc);
1167 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1168 return result;
1171 struct pow_synth_sqrt_info
1173 bool *factors;
1174 unsigned int deepest;
1175 unsigned int num_mults;
1178 /* Return true iff the real value C can be represented as a
1179 sum of powers of 0.5 up to N. That is:
1180 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1181 Record in INFO the various parameters of the synthesis algorithm such
1182 as the factors a[i], the maximum 0.5 power and the number of
1183 multiplications that will be required. */
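/* For example, for C = 0.625 and N >= 3 the loop below finds
   0.625 = 0.5 + 0.125 = 0.5**1 + 0.5**3, so INFO ends up with
   factors = { 1, 0, 1 }, deepest = 3 and num_mults = 1: one multiply
   combines sqrt (x) with sqrt (sqrt (sqrt (x))).  */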
1185 bool
1186 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1187 struct pow_synth_sqrt_info *info)
1189 REAL_VALUE_TYPE factor = dconsthalf;
1190 REAL_VALUE_TYPE remainder = c;
1192 info->deepest = 0;
1193 info->num_mults = 0;
1194 memset (info->factors, 0, n * sizeof (bool));
1196 for (unsigned i = 0; i < n; i++)
1198 REAL_VALUE_TYPE res;
 1200       /* If something inexact happened, bail out now.  */
1201 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1202 return false;
1204 /* We have hit zero. The number is representable as a sum
1205 of powers of 0.5. */
1206 if (real_equal (&res, &dconst0))
1208 info->factors[i] = true;
1209 info->deepest = i + 1;
1210 return true;
1212 else if (!REAL_VALUE_NEGATIVE (res))
1214 remainder = res;
1215 info->factors[i] = true;
1216 info->num_mults++;
1218 else
1219 info->factors[i] = false;
1221 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1223 return false;
1226 /* Return the tree corresponding to FN being applied
1227 to ARG N times at GSI and LOC.
1228 Look up previous results from CACHE if need be.
1229 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1231 static tree
1232 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1233 tree fn, location_t loc, tree *cache)
1235 tree res = cache[n];
1236 if (!res)
1238 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1239 res = build_and_insert_call (gsi, loc, fn, prev);
1240 cache[n] = res;
1243 return res;
1246 /* Print to STREAM the repeated application of function FNAME to ARG
1247 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1248 "foo (foo (x))". */
1250 static void
1251 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1252 unsigned int n)
1254 if (n == 0)
1255 fprintf (stream, "%s", arg);
1256 else
1258 fprintf (stream, "%s (", fname);
1259 print_nested_fn (stream, fname, arg, n - 1);
1260 fprintf (stream, ")");
1264 /* Print to STREAM the fractional sequence of sqrt chains
1265 applied to ARG, described by INFO. Used for the dump file. */
1267 static void
1268 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1269 struct pow_synth_sqrt_info *info)
1271 for (unsigned int i = 0; i < info->deepest; i++)
1273 bool is_set = info->factors[i];
1274 if (is_set)
1276 print_nested_fn (stream, "sqrt", arg, i + 1);
1277 if (i != info->deepest - 1)
1278 fprintf (stream, " * ");
1283 /* Print to STREAM a representation of raising ARG to an integer
1284 power N. Used for the dump file. */
1286 static void
1287 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1289 if (n > 1)
1290 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1291 else if (n == 1)
1292 fprintf (stream, "%s", arg);
1295 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1296 square roots. Place at GSI and LOC. Limit the maximum depth
1297 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1298 result of the expanded sequence or NULL_TREE if the expansion failed.
1300 This routine assumes that ARG1 is a real number with a fractional part
1301 (the integer exponent case will have been handled earlier in
1302 gimple_expand_builtin_pow).
1304 For ARG1 > 0.0:
1305 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1306 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1307 FRAC_PART == ARG1 - WHOLE_PART:
1308 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1309 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1310 if it can be expressed as such, that is if FRAC_PART satisfies:
1311 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1312 where integer a[i] is either 0 or 1.
1314 Example:
1315 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1316 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1318 For ARG1 < 0.0 there are two approaches:
1319 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1320 is calculated as above.
1322 Example:
1323 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1324 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1326 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1327 FRAC_PART := ARG1 - WHOLE_PART
1328 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1329 Example:
1330 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1331 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1333 For ARG1 < 0.0 we choose between (A) and (B) depending on
1334 how many multiplications we'd have to do.
1335 So, for the example in (B): POW (x, -5.875), if we were to
1336 follow algorithm (A) we would produce:
1337 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1338 which contains more multiplications than approach (B).
1340 Hopefully, this approach will eliminate potentially expensive POW library
1341 calls when unsafe floating point math is enabled and allow the compiler to
1342 further optimise the multiplies, square roots and divides produced by this
1343 function. */
1345 static tree
1346 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1347 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1349 tree type = TREE_TYPE (arg0);
1350 machine_mode mode = TYPE_MODE (type);
1351 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1352 bool one_over = true;
1354 if (!sqrtfn)
1355 return NULL_TREE;
1357 if (TREE_CODE (arg1) != REAL_CST)
1358 return NULL_TREE;
1360 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1362 gcc_assert (max_depth > 0);
1363 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1365 struct pow_synth_sqrt_info synth_info;
1366 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1367 synth_info.deepest = 0;
1368 synth_info.num_mults = 0;
1370 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1371 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1373 /* The whole and fractional parts of exp. */
1374 REAL_VALUE_TYPE whole_part;
1375 REAL_VALUE_TYPE frac_part;
1377 real_floor (&whole_part, mode, &exp);
1378 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1381 REAL_VALUE_TYPE ceil_whole = dconst0;
1382 REAL_VALUE_TYPE ceil_fract = dconst0;
1384 if (neg_exp)
1386 real_ceil (&ceil_whole, mode, &exp);
1387 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1390 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1391 return NULL_TREE;
1393 /* Check whether it's more profitable to not use 1.0 / ... */
1394 if (neg_exp)
1396 struct pow_synth_sqrt_info alt_synth_info;
1397 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1398 alt_synth_info.deepest = 0;
1399 alt_synth_info.num_mults = 0;
1401 if (representable_as_half_series_p (ceil_fract, max_depth,
1402 &alt_synth_info)
1403 && alt_synth_info.deepest <= synth_info.deepest
1404 && alt_synth_info.num_mults < synth_info.num_mults)
1406 whole_part = ceil_whole;
1407 frac_part = ceil_fract;
1408 synth_info.deepest = alt_synth_info.deepest;
1409 synth_info.num_mults = alt_synth_info.num_mults;
1410 memcpy (synth_info.factors, alt_synth_info.factors,
1411 (max_depth + 1) * sizeof (bool));
1412 one_over = false;
1416 HOST_WIDE_INT n = real_to_integer (&whole_part);
1417 REAL_VALUE_TYPE cint;
1418 real_from_integer (&cint, VOIDmode, n, SIGNED);
1420 if (!real_identical (&whole_part, &cint))
1421 return NULL_TREE;
1423 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1424 return NULL_TREE;
1426 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1428 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1430 /* Calculate the integer part of the exponent. */
1431 if (n > 1)
1433 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1434 if (!integer_res)
1435 return NULL_TREE;
1438 if (dump_file)
1440 char string[64];
1442 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1443 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1445 if (neg_exp)
1447 if (one_over)
1449 fprintf (dump_file, "1.0 / (");
1450 dump_integer_part (dump_file, "x", n);
1451 if (n > 0)
1452 fprintf (dump_file, " * ");
1453 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1454 fprintf (dump_file, ")");
1456 else
1458 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1459 fprintf (dump_file, " / (");
1460 dump_integer_part (dump_file, "x", n);
1461 fprintf (dump_file, ")");
1464 else
1466 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1467 if (n > 0)
1468 fprintf (dump_file, " * ");
1469 dump_integer_part (dump_file, "x", n);
1472 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1476 tree fract_res = NULL_TREE;
1477 cache[0] = arg0;
1479 /* Calculate the fractional part of the exponent. */
1480 for (unsigned i = 0; i < synth_info.deepest; i++)
1482 if (synth_info.factors[i])
1484 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1486 if (!fract_res)
1487 fract_res = sqrt_chain;
1489 else
1490 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1491 fract_res, sqrt_chain);
1495 tree res = NULL_TREE;
1497 if (neg_exp)
1499 if (one_over)
1501 if (n > 0)
1502 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1503 fract_res, integer_res);
1504 else
1505 res = fract_res;
1507 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1508 build_real (type, dconst1), res);
1510 else
1512 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1513 fract_res, integer_res);
1516 else
1517 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1518 fract_res, integer_res);
1519 return res;
1522 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1523 with location info LOC. If possible, create an equivalent and
1524 less expensive sequence of statements prior to GSI, and return an
 1525    expression holding the result.  */
1527 static tree
1528 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1529 tree arg0, tree arg1)
1531 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1532 REAL_VALUE_TYPE c2, dconst3;
1533 HOST_WIDE_INT n;
1534 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1535 machine_mode mode;
1536 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1537 bool hw_sqrt_exists, c_is_int, c2_is_int;
1539 dconst1_4 = dconst1;
1540 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1542 /* If the exponent isn't a constant, there's nothing of interest
1543 to be done. */
1544 if (TREE_CODE (arg1) != REAL_CST)
1545 return NULL_TREE;
1547 /* Don't perform the operation if flag_signaling_nans is on
1548 and the operand is a signaling NaN. */
1549 if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1550 && ((TREE_CODE (arg0) == REAL_CST
1551 && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1552 || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1553 return NULL_TREE;
1555 /* If the exponent is equivalent to an integer, expand to an optimal
1556 multiplication sequence when profitable. */
1557 c = TREE_REAL_CST (arg1);
1558 n = real_to_integer (&c);
1559 real_from_integer (&cint, VOIDmode, n, SIGNED);
1560 c_is_int = real_identical (&c, &cint);
1562 if (c_is_int
1563 && ((n >= -1 && n <= 2)
1564 || (flag_unsafe_math_optimizations
1565 && speed_p
1566 && powi_cost (n) <= POWI_MAX_MULTS)))
1567 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1569 /* Attempt various optimizations using sqrt and cbrt. */
1570 type = TREE_TYPE (arg0);
1571 mode = TYPE_MODE (type);
1572 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1574 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1575 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1576 sqrt(-0) = -0. */
1577 if (sqrtfn
1578 && real_equal (&c, &dconsthalf)
1579 && !HONOR_SIGNED_ZEROS (mode))
1580 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1582 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1584 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1585 optimizations since 1./3. is not exactly representable. If x
1586 is negative and finite, the correct value of pow(x,1./3.) is
1587 a NaN with the "invalid" exception raised, because the value
1588 of 1./3. actually has an even denominator. The correct value
1589 of cbrt(x) is a negative real value. */
1590 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1591 dconst1_3 = real_value_truncate (mode, dconst_third ());
1593 if (flag_unsafe_math_optimizations
1594 && cbrtfn
1595 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1596 && real_equal (&c, &dconst1_3))
1597 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1599 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1600 if we don't have a hardware sqrt insn. */
1601 dconst1_6 = dconst1_3;
1602 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1604 if (flag_unsafe_math_optimizations
1605 && sqrtfn
1606 && cbrtfn
1607 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1608 && speed_p
1609 && hw_sqrt_exists
1610 && real_equal (&c, &dconst1_6))
1612 /* sqrt(x) */
1613 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1615 /* cbrt(sqrt(x)) */
1616 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1620 /* Attempt to expand the POW as a product of square root chains.
 1621      Expand the 0.25 case even when optimising for size.  */
1622 if (flag_unsafe_math_optimizations
1623 && sqrtfn
1624 && hw_sqrt_exists
1625 && (speed_p || real_equal (&c, &dconst1_4))
1626 && !HONOR_SIGNED_ZEROS (mode))
1628 unsigned int max_depth = speed_p
1629 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1630 : 2;
1632 tree expand_with_sqrts
1633 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1635 if (expand_with_sqrts)
1636 return expand_with_sqrts;
1639 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1640 n = real_to_integer (&c2);
1641 real_from_integer (&cint, VOIDmode, n, SIGNED);
1642 c2_is_int = real_identical (&c2, &cint);
1644 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1646 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1647 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1649 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1650 different from pow(x, 1./3.) due to rounding and behavior with
1651 negative x, we need to constrain this transformation to unsafe
1652 math and positive x or finite math. */
1653 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1654 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1655 real_round (&c2, mode, &c2);
1656 n = real_to_integer (&c2);
1657 real_from_integer (&cint, VOIDmode, n, SIGNED);
1658 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1659 real_convert (&c2, mode, &c2);
1661 if (flag_unsafe_math_optimizations
1662 && cbrtfn
1663 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1664 && real_identical (&c2, &c)
1665 && !c2_is_int
1666 && optimize_function_for_speed_p (cfun)
1667 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1669 tree powi_x_ndiv3 = NULL_TREE;
1671 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1672 possible or profitable, give up. Skip the degenerate case when
1673 abs(n) < 3, where the result is always 1. */
1674 if (absu_hwi (n) >= 3)
1676 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1677 abs_hwi (n / 3));
1678 if (!powi_x_ndiv3)
1679 return NULL_TREE;
1682 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1683 as that creates an unnecessary variable. Instead, just produce
1684 either cbrt(x) or cbrt(x) * cbrt(x). */
1685 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1687 if (absu_hwi (n) % 3 == 1)
1688 powi_cbrt_x = cbrt_x;
1689 else
1690 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1691 cbrt_x, cbrt_x);
1693 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1694 if (absu_hwi (n) < 3)
1695 result = powi_cbrt_x;
1696 else
1697 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1698 powi_x_ndiv3, powi_cbrt_x);
1700 /* If n is negative, reciprocate the result. */
1701 if (n < 0)
1702 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1703 build_real (type, dconst1), result);
1705 return result;
1708 /* No optimizations succeeded. */
1709 return NULL_TREE;
1712 /* ARG is the argument to a cabs builtin call in GSI with location info
1713 LOC. Create a sequence of statements prior to GSI that calculates
1714 sqrt(R*R + I*I), where R and I are the real and imaginary components
1715 of ARG, respectively. Return an expression holding the result. */
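/* For example (SSA names invented), a call lhs = cabs (z) is expanded
   into the sequence built below:

       cabs_r  = REALPART_EXPR <z>;
       cabs_a1 = cabs_r * cabs_r;
       cabs_i  = IMAGPART_EXPR <z>;
       cabs_a2 = cabs_i * cabs_i;
       cabs_s  = cabs_a1 + cabs_a2;
       lhs     = sqrt (cabs_s);  */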
1717 static tree
1718 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1720 tree real_part, imag_part, addend1, addend2, sum, result;
1721 tree type = TREE_TYPE (TREE_TYPE (arg));
1722 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1723 machine_mode mode = TYPE_MODE (type);
1725 if (!flag_unsafe_math_optimizations
1726 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1727 || !sqrtfn
1728 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1729 return NULL_TREE;
1731 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1732 REALPART_EXPR, arg);
1733 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1734 real_part, real_part);
1735 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1736 IMAGPART_EXPR, arg);
1737 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1738 imag_part, imag_part);
1739 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1740 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1742 return result;
1745 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1746 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1747 an optimal number of multiplies, when n is a constant. */
1749 namespace {
1751 const pass_data pass_data_cse_sincos =
1753 GIMPLE_PASS, /* type */
1754 "sincos", /* name */
1755 OPTGROUP_NONE, /* optinfo_flags */
1756 TV_NONE, /* tv_id */
1757 PROP_ssa, /* properties_required */
1758 PROP_gimple_opt_math, /* properties_provided */
1759 0, /* properties_destroyed */
1760 0, /* todo_flags_start */
1761 TODO_update_ssa, /* todo_flags_finish */
1764 class pass_cse_sincos : public gimple_opt_pass
1766 public:
1767 pass_cse_sincos (gcc::context *ctxt)
1768 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1771 /* opt_pass methods: */
1772 virtual bool gate (function *)
1774 /* We no longer require either sincos or cexp, since powi expansion
1775 piggybacks on this pass. */
1776 return optimize;
1779 virtual unsigned int execute (function *);
1781 }; // class pass_cse_sincos
1783 unsigned int
1784 pass_cse_sincos::execute (function *fun)
1786 basic_block bb;
1787 bool cfg_changed = false;
1789 calculate_dominance_info (CDI_DOMINATORS);
1790 memset (&sincos_stats, 0, sizeof (sincos_stats));
1792 FOR_EACH_BB_FN (bb, fun)
1794 gimple_stmt_iterator gsi;
1795 bool cleanup_eh = false;
1797 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1799 gimple *stmt = gsi_stmt (gsi);
 1801 	  /* Only the last stmt in a bb could throw; no need to call
1802 gimple_purge_dead_eh_edges if we change something in the middle
1803 of a basic block. */
1804 cleanup_eh = false;
1806 if (is_gimple_call (stmt)
1807 && gimple_call_lhs (stmt))
1809 tree arg, arg0, arg1, result;
1810 HOST_WIDE_INT n;
1811 location_t loc;
1813 switch (gimple_call_combined_fn (stmt))
1815 CASE_CFN_COS:
1816 CASE_CFN_SIN:
1817 CASE_CFN_CEXPI:
1818 /* Make sure we have either sincos or cexp. */
1819 if (!targetm.libc_has_function (function_c99_math_complex)
1820 && !targetm.libc_has_function (function_sincos))
1821 break;
1823 arg = gimple_call_arg (stmt, 0);
1824 if (TREE_CODE (arg) == SSA_NAME)
1825 cfg_changed |= execute_cse_sincos_1 (arg);
1826 break;
1828 CASE_CFN_POW:
1829 arg0 = gimple_call_arg (stmt, 0);
1830 arg1 = gimple_call_arg (stmt, 1);
1832 loc = gimple_location (stmt);
1833 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1835 if (result)
1837 tree lhs = gimple_get_lhs (stmt);
1838 gassign *new_stmt = gimple_build_assign (lhs, result);
1839 gimple_set_location (new_stmt, loc);
1840 unlink_stmt_vdef (stmt);
1841 gsi_replace (&gsi, new_stmt, true);
1842 cleanup_eh = true;
1843 if (gimple_vdef (stmt))
1844 release_ssa_name (gimple_vdef (stmt));
1846 break;
1848 CASE_CFN_POWI:
1849 arg0 = gimple_call_arg (stmt, 0);
1850 arg1 = gimple_call_arg (stmt, 1);
1851 loc = gimple_location (stmt);
1853 if (real_minus_onep (arg0))
1855 tree t0, t1, cond, one, minus_one;
1856 gassign *stmt;
1858 t0 = TREE_TYPE (arg0);
1859 t1 = TREE_TYPE (arg1);
1860 one = build_real (t0, dconst1);
1861 minus_one = build_real (t0, dconstm1);
1863 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1864 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1865 arg1, build_int_cst (t1, 1));
1866 gimple_set_location (stmt, loc);
1867 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1869 result = make_temp_ssa_name (t0, NULL, "powi");
1870 stmt = gimple_build_assign (result, COND_EXPR, cond,
1871 minus_one, one);
1872 gimple_set_location (stmt, loc);
1873 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1875 else
1877 if (!tree_fits_shwi_p (arg1))
1878 break;
1880 n = tree_to_shwi (arg1);
1881 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1884 if (result)
1886 tree lhs = gimple_get_lhs (stmt);
1887 gassign *new_stmt = gimple_build_assign (lhs, result);
1888 gimple_set_location (new_stmt, loc);
1889 unlink_stmt_vdef (stmt);
1890 gsi_replace (&gsi, new_stmt, true);
1891 cleanup_eh = true;
1892 if (gimple_vdef (stmt))
1893 release_ssa_name (gimple_vdef (stmt));
1895 break;
1897 CASE_CFN_CABS:
1898 arg0 = gimple_call_arg (stmt, 0);
1899 loc = gimple_location (stmt);
1900 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1902 if (result)
1904 tree lhs = gimple_get_lhs (stmt);
1905 gassign *new_stmt = gimple_build_assign (lhs, result);
1906 gimple_set_location (new_stmt, loc);
1907 unlink_stmt_vdef (stmt);
1908 gsi_replace (&gsi, new_stmt, true);
1909 cleanup_eh = true;
1910 if (gimple_vdef (stmt))
1911 release_ssa_name (gimple_vdef (stmt));
1913 break;
1915 default:;
1919 if (cleanup_eh)
1920 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1923 statistics_counter_event (fun, "sincos statements inserted",
1924 sincos_stats.inserted);
1926 return cfg_changed ? TODO_cleanup_cfg : 0;
1929 } // anon namespace
1931 gimple_opt_pass *
1932 make_pass_cse_sincos (gcc::context *ctxt)
1934 return new pass_cse_sincos (ctxt);
1937 /* A symbolic number structure is used to detect byte permutation and selection
1938 patterns of a source. To achieve that, its field N contains an artificial
1939 number consisting of BITS_PER_MARKER sized markers tracking where each
1940 byte comes from in the source:
1942 0 - target byte has the value 0
1943 FF - target byte has an unknown value (eg. due to sign extension)
1944 1..size - marker value is the byte index in the source (0 for lsb).
1946 To detect permutations on memory sources (arrays and structures), a symbolic
1947 number is also associated with:
1948 - a base address BASE_ADDR and an OFFSET giving the address of the source;
1949 - a range which gives the difference between the highest and lowest accessed
1950 memory location to make such a symbolic number;
1951 - the address SRC of the source element of lowest address as a convenience
1952 to easily get BASE_ADDR + offset + lowest bytepos;
1953 - the number of expressions N_OPS bitwise ored together to represent an
1954 approximate cost of the computation.
1956 Note 1: the range is different from size as size reflects the size of the
1957 type of the current expression. For instance, for an array char a[],
1958 (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
1959 (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
1960 time a range of 1.
1962 Note 2: for non-memory sources, range holds the same value as size.
1964 Note 3: SRC points to the SSA_NAME in case of non-memory source. */
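/* As an illustrative example of the encoding (assuming a 4-byte unsigned
   source), a freshly initialized symbolic number has N == 0x04030201: the
   least significant byte carries marker 1 and the most significant byte
   carries marker 4.  A value computed as a full byte swap of that source
   would instead have N == 0x01020304, and a byte known to be zero (e.g.
   after a mask) is encoded as a zero marker.  */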
1966 struct symbolic_number {
1967 uint64_t n;
1968 tree type;
1969 tree base_addr;
1970 tree offset;
1971 HOST_WIDE_INT bytepos;
1972 tree src;
1973 tree alias_set;
1974 tree vuse;
1975 unsigned HOST_WIDE_INT range;
1976 int n_ops;
1979 #define BITS_PER_MARKER 8
1980 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1981 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1982 #define HEAD_MARKER(n, size) \
1983 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1985 /* The number which the find_bswap_or_nop_1 result should match in
1986 order to have a nop. The number is masked according to the size of
1987 the symbolic number before using it. */
1988 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1989 (uint64_t)0x08070605 << 32 | 0x04030201)
1991 /* The number which the find_bswap_or_nop_1 result should match in
1992 order to have a byte swap. The number is masked according to the
1993 size of the symbolic number before using it. */
1994 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1995 (uint64_t)0x01020304 << 32 | 0x05060708)
1997 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1998 number N. Return false if the requested operation is not permitted
1999 on a symbolic number. */
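/* For instance, starting from the 4-byte markers 0x04030201, a logical right
   shift by 8 bits yields 0x00040302, while the same shift on a signed source
   whose top marker is set yields 0xFF040302, the 0xFF marker recording that
   the sign-extended byte has an unknown value.  */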
2001 static inline bool
2002 do_shift_rotate (enum tree_code code,
2003 struct symbolic_number *n,
2004 int count)
2006 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2007 unsigned head_marker;
2009 if (count % BITS_PER_UNIT != 0)
2010 return false;
2011 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
2013 /* Zero out the extra bits of N in order to avoid them being shifted
2014 into the significant bits. */
2015 if (size < 64 / BITS_PER_MARKER)
2016 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2018 switch (code)
2020 case LSHIFT_EXPR:
2021 n->n <<= count;
2022 break;
2023 case RSHIFT_EXPR:
2024 head_marker = HEAD_MARKER (n->n, size);
2025 n->n >>= count;
2026 /* Arithmetic shift of signed type: result is dependent on the value. */
2027 if (!TYPE_UNSIGNED (n->type) && head_marker)
2028 for (i = 0; i < count / BITS_PER_MARKER; i++)
2029 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2030 << ((size - 1 - i) * BITS_PER_MARKER);
2031 break;
2032 case LROTATE_EXPR:
2033 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
2034 break;
2035 case RROTATE_EXPR:
2036 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
2037 break;
2038 default:
2039 return false;
2041 /* Zero unused bits for size. */
2042 if (size < 64 / BITS_PER_MARKER)
2043 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2044 return true;
2047 /* Perform sanity checking for the symbolic number N and the gimple
2048 statement STMT. */
2050 static inline bool
2051 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
2053 tree lhs_type;
2055 lhs_type = gimple_expr_type (stmt);
2057 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
2058 return false;
2060 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
2061 return false;
2063 return true;
2066 /* Initialize the symbolic number N for the bswap pass from the base element
2067 SRC manipulated by the bitwise OR expression. */
2069 static bool
2070 init_symbolic_number (struct symbolic_number *n, tree src)
2072 int size;
2074 if (! INTEGRAL_TYPE_P (TREE_TYPE (src)))
2075 return false;
2077 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2078 n->src = src;
2080 /* Set up the symbolic number N by setting each byte to a value between 1 and
2081 the byte size of rhs1. The highest order byte is set to n->size and the
2082 lowest order byte to 1. */
2083 n->type = TREE_TYPE (src);
2084 size = TYPE_PRECISION (n->type);
2085 if (size % BITS_PER_UNIT != 0)
2086 return false;
2087 size /= BITS_PER_UNIT;
2088 if (size > 64 / BITS_PER_MARKER)
2089 return false;
2090 n->range = size;
2091 n->n = CMPNOP;
2092 n->n_ops = 1;
2094 if (size < 64 / BITS_PER_MARKER)
2095 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2097 return true;
2100 /* Check whether STMT might be a byte swap or a nop from a memory source and return
2101 the answer. If so, REF is that memory source and the base of the memory area
2102 accessed and the offset of the access from that base are recorded in N. */
2104 bool
2105 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2107 /* Leaf node is an array or component ref. Memorize its base and
2108 offset from base to compare to other such leaf node. */
2109 HOST_WIDE_INT bitsize, bitpos;
2110 machine_mode mode;
2111 int unsignedp, reversep, volatilep;
2112 tree offset, base_addr;
2114 /* Not prepared to handle PDP endian. */
2115 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2116 return false;
2118 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2119 return false;
2121 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2122 &unsignedp, &reversep, &volatilep);
2124 if (TREE_CODE (base_addr) == MEM_REF)
2126 offset_int bit_offset = 0;
2127 tree off = TREE_OPERAND (base_addr, 1);
2129 if (!integer_zerop (off))
2131 offset_int boff, coff = mem_ref_offset (base_addr);
2132 boff = coff << LOG2_BITS_PER_UNIT;
2133 bit_offset += boff;
2136 base_addr = TREE_OPERAND (base_addr, 0);
2138 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2139 if (wi::neg_p (bit_offset))
2141 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2142 offset_int tem = wi::bit_and_not (bit_offset, mask);
2143 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2144 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2145 bit_offset -= tem;
2146 tem >>= LOG2_BITS_PER_UNIT;
2147 if (offset)
2148 offset = size_binop (PLUS_EXPR, offset,
2149 wide_int_to_tree (sizetype, tem));
2150 else
2151 offset = wide_int_to_tree (sizetype, tem);
2154 bitpos += bit_offset.to_shwi ();
2157 if (bitpos % BITS_PER_UNIT)
2158 return false;
2159 if (bitsize % BITS_PER_UNIT)
2160 return false;
2161 if (reversep)
2162 return false;
2164 if (!init_symbolic_number (n, ref))
2165 return false;
2166 n->base_addr = base_addr;
2167 n->offset = offset;
2168 n->bytepos = bitpos / BITS_PER_UNIT;
2169 n->alias_set = reference_alias_ptr_type (ref);
2170 n->vuse = gimple_vuse (stmt);
2171 return true;
2174 /* Compute the symbolic number N representing the result of a bitwise OR on 2
2175 symbolic numbers N1 and N2 whose source statements are respectively
2176 SOURCE_STMT1 and SOURCE_STMT2. */
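/* For instance, on a little-endian target and assuming a is an array of
   unsigned char, ORing the symbolic numbers of

     tmp1 = (uint32_t) a[0];           // bytepos 0, N == 0x01
     tmp2 = (uint32_t) a[1] << 8;      // bytepos 1, N == 0x0100 after the shift

   produces (roughly) a merged number with base a, bytepos 0, range 2 and
   N == 0x0201, i.e. a native-endian 16-bit read of a[0..1].  */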
2178 static gimple *
2179 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2180 gimple *source_stmt2, struct symbolic_number *n2,
2181 struct symbolic_number *n)
2183 int i, size;
2184 uint64_t mask;
2185 gimple *source_stmt;
2186 struct symbolic_number *n_start;
2188 tree rhs1 = gimple_assign_rhs1 (source_stmt1);
2189 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2190 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2191 rhs1 = TREE_OPERAND (rhs1, 0);
2192 tree rhs2 = gimple_assign_rhs1 (source_stmt2);
2193 if (TREE_CODE (rhs2) == BIT_FIELD_REF
2194 && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
2195 rhs2 = TREE_OPERAND (rhs2, 0);
2197 /* Sources are different, cancel bswap if they are not memory locations with
2198 the same base (array, structure, ...). */
2199 if (rhs1 != rhs2)
2201 uint64_t inc;
2202 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2203 struct symbolic_number *toinc_n_ptr, *n_end;
2204 basic_block bb1, bb2;
2206 if (!n1->base_addr || !n2->base_addr
2207 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2208 return NULL;
2210 if (!n1->offset != !n2->offset
2211 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2212 return NULL;
2214 if (n1->bytepos < n2->bytepos)
2216 n_start = n1;
2217 start_sub = n2->bytepos - n1->bytepos;
2219 else
2221 n_start = n2;
2222 start_sub = n1->bytepos - n2->bytepos;
2225 bb1 = gimple_bb (source_stmt1);
2226 bb2 = gimple_bb (source_stmt2);
2227 if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
2228 source_stmt = source_stmt1;
2229 else
2230 source_stmt = source_stmt2;
2232 /* Find the highest address at which a load is performed and
2233 compute related info. */
2234 end1 = n1->bytepos + (n1->range - 1);
2235 end2 = n2->bytepos + (n2->range - 1);
2236 if (end1 < end2)
2238 end = end2;
2239 end_sub = end2 - end1;
2241 else
2243 end = end1;
2244 end_sub = end1 - end2;
2246 n_end = (end2 > end1) ? n2 : n1;
2248 /* Find symbolic number whose lsb is the most significant. */
2249 if (BYTES_BIG_ENDIAN)
2250 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2251 else
2252 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2254 n->range = end - n_start->bytepos + 1;
2256 /* Check that the range of memory covered can be represented by
2257 a symbolic number. */
2258 if (n->range > 64 / BITS_PER_MARKER)
2259 return NULL;
2261 /* Reinterpret byte marks in symbolic number holding the value of
2262 bigger weight according to target endianness. */
2263 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2264 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2265 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2267 unsigned marker
2268 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2269 if (marker && marker != MARKER_BYTE_UNKNOWN)
2270 toinc_n_ptr->n += inc;
2273 else
2275 n->range = n1->range;
2276 n_start = n1;
2277 source_stmt = source_stmt1;
2280 if (!n1->alias_set
2281 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2282 n->alias_set = n1->alias_set;
2283 else
2284 n->alias_set = ptr_type_node;
2285 n->vuse = n_start->vuse;
2286 n->base_addr = n_start->base_addr;
2287 n->offset = n_start->offset;
2288 n->src = n_start->src;
2289 n->bytepos = n_start->bytepos;
2290 n->type = n_start->type;
2291 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2293 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2295 uint64_t masked1, masked2;
2297 masked1 = n1->n & mask;
2298 masked2 = n2->n & mask;
2299 if (masked1 && masked2 && masked1 != masked2)
2300 return NULL;
2302 n->n = n1->n | n2->n;
2303 n->n_ops = n1->n_ops + n2->n_ops;
2305 return source_stmt;
2308 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2309 the operation given by the rhs of STMT on the result. If the operation
2310 could successfully be executed, the function returns a gimple stmt whose
2311 rhs's first tree is the expression of the source operand and NULL
2312 otherwise. */
2314 static gimple *
2315 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2317 enum tree_code code;
2318 tree rhs1, rhs2 = NULL;
2319 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2320 enum gimple_rhs_class rhs_class;
2322 if (!limit || !is_gimple_assign (stmt))
2323 return NULL;
2325 rhs1 = gimple_assign_rhs1 (stmt);
2327 if (find_bswap_or_nop_load (stmt, rhs1, n))
2328 return stmt;
2330 /* Handle BIT_FIELD_REF. */
2331 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2332 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2334 unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1));
2335 unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2));
2336 if (bitpos % BITS_PER_UNIT == 0
2337 && bitsize % BITS_PER_UNIT == 0
2338 && init_symbolic_number (n, TREE_OPERAND (rhs1, 0)))
2340 /* Handle big-endian bit numbering in BIT_FIELD_REF. */
2341 if (BYTES_BIG_ENDIAN)
2342 bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize;
2344 /* Shift. */
2345 if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos))
2346 return NULL;
2348 /* Mask. */
2349 uint64_t mask = 0;
2350 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2351 for (unsigned i = 0; i < bitsize / BITS_PER_UNIT;
2352 i++, tmp <<= BITS_PER_UNIT)
2353 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2354 n->n &= mask;
2356 /* Convert. */
2357 n->type = TREE_TYPE (rhs1);
2358 if (!n->base_addr)
2359 n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2361 return verify_symbolic_number_p (n, stmt) ? stmt : NULL;
2364 return NULL;
2367 if (TREE_CODE (rhs1) != SSA_NAME)
2368 return NULL;
2370 code = gimple_assign_rhs_code (stmt);
2371 rhs_class = gimple_assign_rhs_class (stmt);
2372 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2374 if (rhs_class == GIMPLE_BINARY_RHS)
2375 rhs2 = gimple_assign_rhs2 (stmt);
2377 /* Handle unary rhs and binary rhs with integer constants as second
2378 operand. */
2380 if (rhs_class == GIMPLE_UNARY_RHS
2381 || (rhs_class == GIMPLE_BINARY_RHS
2382 && TREE_CODE (rhs2) == INTEGER_CST))
2384 if (code != BIT_AND_EXPR
2385 && code != LSHIFT_EXPR
2386 && code != RSHIFT_EXPR
2387 && code != LROTATE_EXPR
2388 && code != RROTATE_EXPR
2389 && !CONVERT_EXPR_CODE_P (code))
2390 return NULL;
2392 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2394 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2395 we have to initialize the symbolic number. */
2396 if (!source_stmt1)
2398 if (gimple_assign_load_p (stmt)
2399 || !init_symbolic_number (n, rhs1))
2400 return NULL;
2401 source_stmt1 = stmt;
2404 switch (code)
2406 case BIT_AND_EXPR:
2408 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2409 uint64_t val = int_cst_value (rhs2), mask = 0;
2410 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2412 /* Only constants masking full bytes are allowed. */
2413 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2414 if ((val & tmp) != 0 && (val & tmp) != tmp)
2415 return NULL;
2416 else if (val & tmp)
2417 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2419 n->n &= mask;
2421 break;
2422 case LSHIFT_EXPR:
2423 case RSHIFT_EXPR:
2424 case LROTATE_EXPR:
2425 case RROTATE_EXPR:
2426 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2427 return NULL;
2428 break;
2429 CASE_CONVERT:
2431 int i, type_size, old_type_size;
2432 tree type;
2434 type = gimple_expr_type (stmt);
2435 type_size = TYPE_PRECISION (type);
2436 if (type_size % BITS_PER_UNIT != 0)
2437 return NULL;
2438 type_size /= BITS_PER_UNIT;
2439 if (type_size > 64 / BITS_PER_MARKER)
2440 return NULL;
2442 /* Sign extension: result is dependent on the value. */
2443 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2444 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2445 && HEAD_MARKER (n->n, old_type_size))
2446 for (i = 0; i < type_size - old_type_size; i++)
2447 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2448 << ((type_size - 1 - i) * BITS_PER_MARKER);
2450 if (type_size < 64 / BITS_PER_MARKER)
2452 /* If STMT casts to a smaller type mask out the bits not
2453 belonging to the target type. */
2454 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2456 n->type = type;
2457 if (!n->base_addr)
2458 n->range = type_size;
2460 break;
2461 default:
2462 return NULL;
2464 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2467 /* Handle binary rhs. */
2469 if (rhs_class == GIMPLE_BINARY_RHS)
2471 struct symbolic_number n1, n2;
2472 gimple *source_stmt, *source_stmt2;
2474 if (code != BIT_IOR_EXPR)
2475 return NULL;
2477 if (TREE_CODE (rhs2) != SSA_NAME)
2478 return NULL;
2480 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2482 switch (code)
2484 case BIT_IOR_EXPR:
2485 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2487 if (!source_stmt1)
2488 return NULL;
2490 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2492 if (!source_stmt2)
2493 return NULL;
2495 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2496 return NULL;
2498 if (!n1.vuse != !n2.vuse
2499 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2500 return NULL;
2502 source_stmt
2503 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2505 if (!source_stmt)
2506 return NULL;
2508 if (!verify_symbolic_number_p (n, stmt))
2509 return NULL;
2511 break;
2512 default:
2513 return NULL;
2515 return source_stmt;
2517 return NULL;
2520 /* Check if STMT completes a bswap implementation or a read in a given
2521 endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2522 accordingly. It also sets N to represent the kind of operations
2523 performed: the size of the resulting expression and whether it works on
2524 a memory source, and if so the alias-set and vuse.  Finally, the
2525 function returns a stmt whose rhs's first tree is the source
2526 expression. */
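/* For instance, a manual 32-bit byte swap such as

     res = (x >> 24)
	   | ((x >> 8) & 0x0000ff00)
	   | ((x << 8) & 0x00ff0000)
	   | (x << 24);

   ends up with a symbolic number matching CMPXCHG narrowed to 4 bytes and is
   reported with *BSWAP set to true.  This is only a sketch; the matcher
   accepts many equivalent formulations.  */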
2528 static gimple *
2529 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2531 unsigned rsize;
2532 uint64_t tmpn, mask;
2533 /* The number which the find_bswap_or_nop_1 result should match in order
2534 to have a full byte swap. The number is shifted to the right
2535 according to the size of the symbolic number before using it. */
2536 uint64_t cmpxchg = CMPXCHG;
2537 uint64_t cmpnop = CMPNOP;
2539 gimple *ins_stmt;
2540 int limit;
2542 /* The last parameter determines the search depth limit.  It usually
2543 correlates directly to the number n of bytes to be touched. We
2544 increase that number by log2(n) + 1 here in order to also
2545 cover signed -> unsigned conversions of the src operand as can be seen
2546 in libgcc, and for an initial shift/and operation of the src operand. */
2547 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2548 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2549 ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2551 if (!ins_stmt)
2552 return NULL;
2554 /* Find real size of result (highest non-zero byte). */
2555 if (n->base_addr)
2556 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2557 else
2558 rsize = n->range;
2560 /* Zero out the bits corresponding to untouched bytes in original gimple
2561 expression. */
2562 if (n->range < (int) sizeof (int64_t))
2564 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2565 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2566 cmpnop &= mask;
2569 /* Zero out the bits corresponding to unused bytes in the result of the
2570 gimple expression. */
2571 if (rsize < n->range)
2573 if (BYTES_BIG_ENDIAN)
2575 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2576 cmpxchg &= mask;
2577 cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
2579 else
2581 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2582 cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
2583 cmpnop &= mask;
2585 n->range = rsize;
2588 /* A complete byte swap should make the symbolic number start with
2589 the largest digit in the highest order byte.  An unchanged symbolic
2590 number indicates a read with the same endianness as the target architecture. */
2591 if (n->n == cmpnop)
2592 *bswap = false;
2593 else if (n->n == cmpxchg)
2594 *bswap = true;
2595 else
2596 return NULL;
2598 /* Useless bit manipulation performed by code. */
2599 if (!n->base_addr && n->n == cmpnop && n->n_ops == 1)
2600 return NULL;
2602 n->range *= BITS_PER_UNIT;
2603 return ins_stmt;
2606 namespace {
2608 const pass_data pass_data_optimize_bswap =
2610 GIMPLE_PASS, /* type */
2611 "bswap", /* name */
2612 OPTGROUP_NONE, /* optinfo_flags */
2613 TV_NONE, /* tv_id */
2614 PROP_ssa, /* properties_required */
2615 0, /* properties_provided */
2616 0, /* properties_destroyed */
2617 0, /* todo_flags_start */
2618 0, /* todo_flags_finish */
2621 class pass_optimize_bswap : public gimple_opt_pass
2623 public:
2624 pass_optimize_bswap (gcc::context *ctxt)
2625 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2628 /* opt_pass methods: */
2629 virtual bool gate (function *)
2631 return flag_expensive_optimizations && optimize;
2634 virtual unsigned int execute (function *);
2636 }; // class pass_optimize_bswap
2638 /* Perform the bswap optimization: replace the expression computed in the rhs
2639 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2640 Which of these alternatives replaces the rhs is given by N->base_addr (non
2641 null if a load is needed) and BSWAP.  The type, VUSE and alias-set of the
2642 load to perform are also given in N while the builtin bswap invocation is given
2643 in FNDECL.  Finally, if a load is involved, INS_STMT refers to one of the
2644 load statements involved to construct the rhs in CUR_STMT and N->range gives
2645 the size of the rhs expression for maintaining some statistics.
2647 Note that if the replacement involves a load, CUR_STMT is moved just after
2648 INS_STMT to do the load with the same VUSE which can lead to CUR_STMT
2649 changing basic block. */
2651 static bool
2652 bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl,
2653 tree bswap_type, tree load_type, struct symbolic_number *n,
2654 bool bswap)
2656 gimple_stmt_iterator gsi;
2657 tree src, tmp, tgt;
2658 gimple *bswap_stmt;
2660 gsi = gsi_for_stmt (cur_stmt);
2661 src = n->src;
2662 tgt = gimple_assign_lhs (cur_stmt);
2664 /* Need to load the value from memory first. */
2665 if (n->base_addr)
2667 gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt);
2668 tree addr_expr, addr_tmp, val_expr, val_tmp;
2669 tree load_offset_ptr, aligned_load_type;
2670 gimple *addr_stmt, *load_stmt;
2671 unsigned align;
2672 HOST_WIDE_INT load_offset = 0;
2673 basic_block ins_bb, cur_bb;
2675 ins_bb = gimple_bb (ins_stmt);
2676 cur_bb = gimple_bb (cur_stmt);
2677 if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
2678 return false;
2680 align = get_object_alignment (src);
2682 /* Move cur_stmt just before one of the loads of the original
2683 to ensure it has the same VUSE. See PR61517 for what could
2684 go wrong. */
2685 if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
2686 reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2687 gsi_move_before (&gsi, &gsi_ins);
2688 gsi = gsi_for_stmt (cur_stmt);
2690 /* Compute address to load from and cast according to the size
2691 of the load. */
2692 addr_expr = build_fold_addr_expr (unshare_expr (src));
2693 if (is_gimple_mem_ref_addr (addr_expr))
2694 addr_tmp = addr_expr;
2695 else
2697 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2698 "load_src");
2699 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2700 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2703 /* Perform the load. */
2704 aligned_load_type = load_type;
2705 if (align < TYPE_ALIGN (load_type))
2706 aligned_load_type = build_aligned_type (load_type, align);
2707 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2708 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2709 load_offset_ptr);
2711 if (!bswap)
2713 if (n->range == 16)
2714 nop_stats.found_16bit++;
2715 else if (n->range == 32)
2716 nop_stats.found_32bit++;
2717 else
2719 gcc_assert (n->range == 64);
2720 nop_stats.found_64bit++;
2723 /* Convert the result of load if necessary. */
2724 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2726 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2727 "load_dst");
2728 load_stmt = gimple_build_assign (val_tmp, val_expr);
2729 gimple_set_vuse (load_stmt, n->vuse);
2730 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2731 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2733 else
2735 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2736 gimple_set_vuse (cur_stmt, n->vuse);
2738 update_stmt (cur_stmt);
2740 if (dump_file)
2742 fprintf (dump_file,
2743 "%d bit load in target endianness found at: ",
2744 (int) n->range);
2745 print_gimple_stmt (dump_file, cur_stmt, 0);
2747 return true;
2749 else
2751 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2752 load_stmt = gimple_build_assign (val_tmp, val_expr);
2753 gimple_set_vuse (load_stmt, n->vuse);
2754 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2756 src = val_tmp;
2758 else if (!bswap)
2760 gimple *g;
2761 if (!useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
2763 if (!is_gimple_val (src))
2764 return false;
2765 g = gimple_build_assign (tgt, NOP_EXPR, src);
2767 else
2768 g = gimple_build_assign (tgt, src);
2769 if (n->range == 16)
2770 nop_stats.found_16bit++;
2771 else if (n->range == 32)
2772 nop_stats.found_32bit++;
2773 else
2775 gcc_assert (n->range == 64);
2776 nop_stats.found_64bit++;
2778 if (dump_file)
2780 fprintf (dump_file,
2781 "%d bit reshuffle in target endianness found at: ",
2782 (int) n->range);
2783 print_gimple_stmt (dump_file, cur_stmt, 0);
2785 gsi_replace (&gsi, g, true);
2786 return true;
2788 else if (TREE_CODE (src) == BIT_FIELD_REF)
2789 src = TREE_OPERAND (src, 0);
2791 if (n->range == 16)
2792 bswap_stats.found_16bit++;
2793 else if (n->range == 32)
2794 bswap_stats.found_32bit++;
2795 else
2797 gcc_assert (n->range == 64);
2798 bswap_stats.found_64bit++;
2801 tmp = src;
2803 /* Convert the src expression if necessary. */
2804 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2806 gimple *convert_stmt;
2808 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2809 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2810 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2813 /* Canonical form for 16 bit bswap is a rotate expression.  Only 16-bit values
2814 are considered, as rotation of 2N-bit values by N bits is generally not
2815 equivalent to a bswap. Consider for instance 0x01020304 r>> 16 which
2816 gives 0x03040102 while a bswap for that value is 0x04030201. */
2817 if (bswap && n->range == 16)
2819 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2820 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2821 bswap_stmt = gimple_build_assign (NULL, src);
2823 else
2824 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2826 tmp = tgt;
2828 /* Convert the result if necessary. */
2829 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2831 gimple *convert_stmt;
2833 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2834 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2835 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2838 gimple_set_lhs (bswap_stmt, tmp);
2840 if (dump_file)
2842 fprintf (dump_file, "%d bit bswap implementation found at: ",
2843 (int) n->range);
2844 print_gimple_stmt (dump_file, cur_stmt, 0);
2847 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2848 gsi_remove (&gsi, true);
2849 return true;
2852 /* Find manual byte swap implementations as well as loads in a given
2853 endianness.  Byte swaps are turned into a bswap builtin invocation,
2854 while endian loads are converted to a bswap builtin invocation or a
2855 simple load according to the target endianness. */
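/* For example, assuming p is a pointer to unsigned char, a manual
   little-endian 32-bit read such as

     val = p[0] | (p[1] << 8) | (p[2] << 16) | ((uint32_t) p[3] << 24);

   is replaced by a plain 32-bit load on a little-endian target, or by a
   32-bit load followed by a __builtin_bswap32 call on a big-endian one.
   This is only a sketch; the matcher accepts many equivalent forms.  */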
2857 unsigned int
2858 pass_optimize_bswap::execute (function *fun)
2860 basic_block bb;
2861 bool bswap32_p, bswap64_p;
2862 bool changed = false;
2863 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2865 if (BITS_PER_UNIT != 8)
2866 return 0;
2868 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2869 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2870 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2871 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2872 || (bswap32_p && word_mode == SImode)));
2874 /* Determine the argument type of the builtins. The code later on
2875 assumes that the return and argument type are the same. */
2876 if (bswap32_p)
2878 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2879 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2882 if (bswap64_p)
2884 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2885 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2888 memset (&nop_stats, 0, sizeof (nop_stats));
2889 memset (&bswap_stats, 0, sizeof (bswap_stats));
2890 calculate_dominance_info (CDI_DOMINATORS);
2892 FOR_EACH_BB_FN (bb, fun)
2894 gimple_stmt_iterator gsi;
2896 /* We do a reverse scan for bswap patterns to make sure we get the
2897 widest match. As bswap pattern matching doesn't handle previously
2898 inserted smaller bswap replacements as sub-patterns, the wider
2899 variant wouldn't be detected. */
2900 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2902 gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
2903 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2904 enum tree_code code;
2905 struct symbolic_number n;
2906 bool bswap;
2908 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2909 might be moved to a different basic block by bswap_replace and gsi
2910 must not point to it if that's the case.  Moving the gsi_prev
2911 there makes sure that gsi points to the statement previous to
2912 cur_stmt while still making sure that all statements are
2913 considered in this basic block. */
2914 gsi_prev (&gsi);
2916 if (!is_gimple_assign (cur_stmt))
2917 continue;
2919 code = gimple_assign_rhs_code (cur_stmt);
2920 switch (code)
2922 case LROTATE_EXPR:
2923 case RROTATE_EXPR:
2924 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2925 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2926 % BITS_PER_UNIT)
2927 continue;
2928 /* Fall through. */
2929 case BIT_IOR_EXPR:
2930 break;
2931 default:
2932 continue;
2935 ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2937 if (!ins_stmt)
2938 continue;
2940 switch (n.range)
2942 case 16:
2943 /* Already in canonical form, nothing to do. */
2944 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2945 continue;
2946 load_type = bswap_type = uint16_type_node;
2947 break;
2948 case 32:
2949 load_type = uint32_type_node;
2950 if (bswap32_p)
2952 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2953 bswap_type = bswap32_type;
2955 break;
2956 case 64:
2957 load_type = uint64_type_node;
2958 if (bswap64_p)
2960 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2961 bswap_type = bswap64_type;
2963 break;
2964 default:
2965 continue;
2968 if (bswap && !fndecl && n.range != 16)
2969 continue;
2971 if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type,
2972 &n, bswap))
2973 changed = true;
2977 statistics_counter_event (fun, "16-bit nop implementations found",
2978 nop_stats.found_16bit);
2979 statistics_counter_event (fun, "32-bit nop implementations found",
2980 nop_stats.found_32bit);
2981 statistics_counter_event (fun, "64-bit nop implementations found",
2982 nop_stats.found_64bit);
2983 statistics_counter_event (fun, "16-bit bswap implementations found",
2984 bswap_stats.found_16bit);
2985 statistics_counter_event (fun, "32-bit bswap implementations found",
2986 bswap_stats.found_32bit);
2987 statistics_counter_event (fun, "64-bit bswap implementations found",
2988 bswap_stats.found_64bit);
2990 return (changed ? TODO_update_ssa : 0);
2993 } // anon namespace
2995 gimple_opt_pass *
2996 make_pass_optimize_bswap (gcc::context *ctxt)
2998 return new pass_optimize_bswap (ctxt);
3001 /* Return true if stmt is a type conversion operation that can be stripped
3002 when used in a widening multiply operation. */
3003 static bool
3004 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
3006 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
3008 if (TREE_CODE (result_type) == INTEGER_TYPE)
3010 tree op_type;
3011 tree inner_op_type;
3013 if (!CONVERT_EXPR_CODE_P (rhs_code))
3014 return false;
3016 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
3018 /* If the type of OP has the same precision as the result, then
3019 we can strip this conversion. The multiply operation will be
3020 selected to create the correct extension as a by-product. */
3021 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
3022 return true;
3024 /* We can also strip a conversion if it preserves the signed-ness of
3025 the operation and doesn't narrow the range. */
3026 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
3028 /* If the inner-most type is unsigned, then we can strip any
3029 intermediate widening operation. If it's signed, then the
3030 intermediate widening operation must also be signed. */
3031 if ((TYPE_UNSIGNED (inner_op_type)
3032 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
3033 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
3034 return true;
3036 return false;
3039 return rhs_code == FIXED_CONVERT_EXPR;
3042 /* Return true if RHS is a suitable operand for a widening multiplication,
3043 assuming a target type of TYPE.
3044 There are two cases:
3046 - RHS makes some value at least twice as wide. Store that value
3047 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
3049 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
3050 but leave *TYPE_OUT untouched. */
3052 static bool
3053 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
3054 tree *new_rhs_out)
3056 gimple *stmt;
3057 tree type1, rhs1;
3059 if (TREE_CODE (rhs) == SSA_NAME)
3061 stmt = SSA_NAME_DEF_STMT (rhs);
3062 if (is_gimple_assign (stmt))
3064 if (! widening_mult_conversion_strippable_p (type, stmt))
3065 rhs1 = rhs;
3066 else
3068 rhs1 = gimple_assign_rhs1 (stmt);
3070 if (TREE_CODE (rhs1) == INTEGER_CST)
3072 *new_rhs_out = rhs1;
3073 *type_out = NULL;
3074 return true;
3078 else
3079 rhs1 = rhs;
3081 type1 = TREE_TYPE (rhs1);
3083 if (TREE_CODE (type1) != TREE_CODE (type)
3084 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
3085 return false;
3087 *new_rhs_out = rhs1;
3088 *type_out = type1;
3089 return true;
3092 if (TREE_CODE (rhs) == INTEGER_CST)
3094 *new_rhs_out = rhs;
3095 *type_out = NULL;
3096 return true;
3099 return false;
3102 /* Return true if STMT performs a widening multiplication, assuming the
3103 output type is TYPE. If so, store the unwidened types of the operands
3104 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
3105 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
3106 and *TYPE2_OUT would give the operands of the multiplication. */
3108 static bool
3109 is_widening_mult_p (gimple *stmt,
3110 tree *type1_out, tree *rhs1_out,
3111 tree *type2_out, tree *rhs2_out)
3113 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3115 if (TREE_CODE (type) != INTEGER_TYPE
3116 && TREE_CODE (type) != FIXED_POINT_TYPE)
3117 return false;
3119 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
3120 rhs1_out))
3121 return false;
3123 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
3124 rhs2_out))
3125 return false;
3127 if (*type1_out == NULL)
3129 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
3130 return false;
3131 *type1_out = *type2_out;
3134 if (*type2_out == NULL)
3136 if (!int_fits_type_p (*rhs2_out, *type1_out))
3137 return false;
3138 *type2_out = *type1_out;
3141 /* Ensure that the larger of the two operands comes first. */
3142 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
3144 std::swap (*type1_out, *type2_out);
3145 std::swap (*rhs1_out, *rhs2_out);
3148 return true;
3151 /* Check to see if the CALL statement is an invocation of copysign
3152 with 1. being the first argument. */
3153 static bool
3154 is_copysign_call_with_1 (gimple *call)
3156 gcall *c = dyn_cast <gcall *> (call);
3157 if (! c)
3158 return false;
3160 enum combined_fn code = gimple_call_combined_fn (c);
3162 if (code == CFN_LAST)
3163 return false;
3165 if (builtin_fn_p (code))
3167 switch (as_builtin_fn (code))
3169 CASE_FLT_FN (BUILT_IN_COPYSIGN):
3170 CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
3171 return real_onep (gimple_call_arg (c, 0));
3172 default:
3173 return false;
3177 if (internal_fn_p (code))
3179 switch (as_internal_fn (code))
3181 case IFN_COPYSIGN:
3182 return real_onep (gimple_call_arg (c, 0));
3183 default:
3184 return false;
3188 return false;
3191 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
3192 This only happens when the xorsign optab is defined; if the
3193 pattern is not an xorsign pattern or if expansion fails, FALSE is
3194 returned, otherwise TRUE is returned. */
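/* A minimal sketch of the pattern this matches:

     tmp = __builtin_copysign (1.0, y);
     res = x * tmp;

   which, when the xorsign optab is available for the mode, is rewritten as
   the internal call res = .XORSIGN (x, y).  */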
3195 static bool
3196 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
3198 tree treeop0, treeop1, lhs, type;
3199 location_t loc = gimple_location (stmt);
3200 lhs = gimple_assign_lhs (stmt);
3201 treeop0 = gimple_assign_rhs1 (stmt);
3202 treeop1 = gimple_assign_rhs2 (stmt);
3203 type = TREE_TYPE (lhs);
3204 machine_mode mode = TYPE_MODE (type);
3206 if (HONOR_SNANS (type))
3207 return false;
3209 if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
3211 gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
3212 if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
3214 call0 = SSA_NAME_DEF_STMT (treeop1);
3215 if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
3216 return false;
3218 treeop1 = treeop0;
3220 if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
3221 return false;
3223 gcall *c = as_a<gcall*> (call0);
3224 treeop0 = gimple_call_arg (c, 1);
3226 gcall *call_stmt
3227 = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
3228 gimple_set_lhs (call_stmt, lhs);
3229 gimple_set_location (call_stmt, loc);
3230 gsi_replace (gsi, call_stmt, true);
3231 return true;
3234 return false;
3237 /* Process a single gimple statement STMT, which has a MULT_EXPR as
3238 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
3239 value is true iff we converted the statement. */
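/* For instance (a sketch, assuming the target provides a widening multiply
   pattern for these modes):

     int a, b;
     long long c = (long long) a * (long long) b;

   is rewritten to use WIDEN_MULT_EXPR on the narrow operands instead of two
   extensions followed by a full-width multiply.  */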
3241 static bool
3242 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
3244 tree lhs, rhs1, rhs2, type, type1, type2;
3245 enum insn_code handler;
3246 scalar_int_mode to_mode, from_mode, actual_mode;
3247 optab op;
3248 int actual_precision;
3249 location_t loc = gimple_location (stmt);
3250 bool from_unsigned1, from_unsigned2;
3252 lhs = gimple_assign_lhs (stmt);
3253 type = TREE_TYPE (lhs);
3254 if (TREE_CODE (type) != INTEGER_TYPE)
3255 return false;
3257 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3258 return false;
3260 to_mode = SCALAR_INT_TYPE_MODE (type);
3261 from_mode = SCALAR_INT_TYPE_MODE (type1);
3262 if (to_mode == from_mode)
3263 return false;
3265 from_unsigned1 = TYPE_UNSIGNED (type1);
3266 from_unsigned2 = TYPE_UNSIGNED (type2);
3268 if (from_unsigned1 && from_unsigned2)
3269 op = umul_widen_optab;
3270 else if (!from_unsigned1 && !from_unsigned2)
3271 op = smul_widen_optab;
3272 else
3273 op = usmul_widen_optab;
3275 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3276 &actual_mode);
3278 if (handler == CODE_FOR_nothing)
3280 if (op != smul_widen_optab)
3282 /* We can use a signed multiply with unsigned types as long as
3283 there is a wider mode to use, or it is the smaller of the two
3284 types that is unsigned. Note that type1 >= type2, always. */
3285 if ((TYPE_UNSIGNED (type1)
3286 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3287 || (TYPE_UNSIGNED (type2)
3288 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3290 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
3291 || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3292 return false;
3295 op = smul_widen_optab;
3296 handler = find_widening_optab_handler_and_mode (op, to_mode,
3297 from_mode,
3298 &actual_mode);
3300 if (handler == CODE_FOR_nothing)
3301 return false;
3303 from_unsigned1 = from_unsigned2 = false;
3305 else
3306 return false;
3309 /* Ensure that the inputs to the handler are in the correct precision
3310 for the opcode. This will be the full mode size. */
3311 actual_precision = GET_MODE_PRECISION (actual_mode);
3312 if (2 * actual_precision > TYPE_PRECISION (type))
3313 return false;
3314 if (actual_precision != TYPE_PRECISION (type1)
3315 || from_unsigned1 != TYPE_UNSIGNED (type1))
3316 rhs1 = build_and_insert_cast (gsi, loc,
3317 build_nonstandard_integer_type
3318 (actual_precision, from_unsigned1), rhs1);
3319 if (actual_precision != TYPE_PRECISION (type2)
3320 || from_unsigned2 != TYPE_UNSIGNED (type2))
3321 rhs2 = build_and_insert_cast (gsi, loc,
3322 build_nonstandard_integer_type
3323 (actual_precision, from_unsigned2), rhs2);
3325 /* Handle constants. */
3326 if (TREE_CODE (rhs1) == INTEGER_CST)
3327 rhs1 = fold_convert (type1, rhs1);
3328 if (TREE_CODE (rhs2) == INTEGER_CST)
3329 rhs2 = fold_convert (type2, rhs2);
3331 gimple_assign_set_rhs1 (stmt, rhs1);
3332 gimple_assign_set_rhs2 (stmt, rhs2);
3333 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3334 update_stmt (stmt);
3335 widen_mul_stats.widen_mults_inserted++;
3336 return true;
3339 /* Process a single gimple statement STMT, which is found at the
3340 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3341 rhs (given by CODE), and try to convert it into a
3342 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3343 is true iff we converted the statement. */
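/* For instance (a sketch, assuming a suitable multiply-and-accumulate
   pattern exists for the modes involved):

     int a, b;  long long acc;
     acc = acc + (long long) a * (long long) b;

   becomes a single WIDEN_MULT_PLUS_EXPR <a, b, acc>, and similarly a
   MINUS_EXPR form becomes WIDEN_MULT_MINUS_EXPR.  */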
3345 static bool
3346 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3347 enum tree_code code)
3349 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3350 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3351 tree type, type1, type2, optype;
3352 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3353 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3354 optab this_optab;
3355 enum tree_code wmult_code;
3356 enum insn_code handler;
3357 scalar_mode to_mode, from_mode, actual_mode;
3358 location_t loc = gimple_location (stmt);
3359 int actual_precision;
3360 bool from_unsigned1, from_unsigned2;
3362 lhs = gimple_assign_lhs (stmt);
3363 type = TREE_TYPE (lhs);
3364 if (TREE_CODE (type) != INTEGER_TYPE
3365 && TREE_CODE (type) != FIXED_POINT_TYPE)
3366 return false;
3368 if (code == MINUS_EXPR)
3369 wmult_code = WIDEN_MULT_MINUS_EXPR;
3370 else
3371 wmult_code = WIDEN_MULT_PLUS_EXPR;
3373 rhs1 = gimple_assign_rhs1 (stmt);
3374 rhs2 = gimple_assign_rhs2 (stmt);
3376 if (TREE_CODE (rhs1) == SSA_NAME)
3378 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3379 if (is_gimple_assign (rhs1_stmt))
3380 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3383 if (TREE_CODE (rhs2) == SSA_NAME)
3385 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3386 if (is_gimple_assign (rhs2_stmt))
3387 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3390 /* Allow for one conversion statement between the multiply
3391 and addition/subtraction statement. If there are more than
3392 one conversions then we assume they would invalidate this
3393 transformation. If that's not the case then they should have
3394 been folded before now. */
3395 if (CONVERT_EXPR_CODE_P (rhs1_code))
3397 conv1_stmt = rhs1_stmt;
3398 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3399 if (TREE_CODE (rhs1) == SSA_NAME)
3401 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3402 if (is_gimple_assign (rhs1_stmt))
3403 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3405 else
3406 return false;
3408 if (CONVERT_EXPR_CODE_P (rhs2_code))
3410 conv2_stmt = rhs2_stmt;
3411 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3412 if (TREE_CODE (rhs2) == SSA_NAME)
3414 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3415 if (is_gimple_assign (rhs2_stmt))
3416 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3418 else
3419 return false;
3422 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3423 is_widening_mult_p, but we still need the rhs values it returns.
3425 It might also appear that it would be sufficient to use the existing
3426 operands of the widening multiply, but that would limit the choice of
3427 multiply-and-accumulate instructions.
3429 If the widened-multiplication result has more than one use, it is
3430 probably wiser not to do the conversion. */
3431 if (code == PLUS_EXPR
3432 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3434 if (!has_single_use (rhs1)
3435 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3436 &type2, &mult_rhs2))
3437 return false;
3438 add_rhs = rhs2;
3439 conv_stmt = conv1_stmt;
3441 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3443 if (!has_single_use (rhs2)
3444 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3445 &type2, &mult_rhs2))
3446 return false;
3447 add_rhs = rhs1;
3448 conv_stmt = conv2_stmt;
3450 else
3451 return false;
3453 to_mode = SCALAR_TYPE_MODE (type);
3454 from_mode = SCALAR_TYPE_MODE (type1);
3455 if (to_mode == from_mode)
3456 return false;
3458 from_unsigned1 = TYPE_UNSIGNED (type1);
3459 from_unsigned2 = TYPE_UNSIGNED (type2);
3460 optype = type1;
3462 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3463 if (from_unsigned1 != from_unsigned2)
3465 if (!INTEGRAL_TYPE_P (type))
3466 return false;
3467 /* We can use a signed multiply with unsigned types as long as
3468 there is a wider mode to use, or it is the smaller of the two
3469 types that is unsigned. Note that type1 >= type2, always. */
3470 if ((from_unsigned1
3471 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3472 || (from_unsigned2
3473 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3475 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
3476 || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3477 return false;
3480 from_unsigned1 = from_unsigned2 = false;
3481 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3482 false);
3485 /* If there was a conversion between the multiply and addition
3486 then we need to make sure it fits a multiply-and-accumulate.
3487 There should be a single mode change which does not change the
3488 value. */
3489 if (conv_stmt)
3491 /* We use the original, unmodified data types for this. */
3492 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3493 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3494 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3495 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3497 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3499 /* Conversion is a truncate. */
3500 if (TYPE_PRECISION (to_type) < data_size)
3501 return false;
3503 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3505 /* Conversion is an extend. Check it's the right sort. */
3506 if (TYPE_UNSIGNED (from_type) != is_unsigned
3507 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3508 return false;
3510 /* else convert is a no-op for our purposes. */
3513 /* Verify that the machine can perform a widening multiply
3514 accumulate in this mode/signedness combination, otherwise
3515 this transformation is likely to pessimize code. */
3516 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3517 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3518 from_mode, &actual_mode);
3520 if (handler == CODE_FOR_nothing)
3521 return false;
3523 /* Ensure that the inputs to the handler are in the correct precision
3524 for the opcode. This will be the full mode size. */
3525 actual_precision = GET_MODE_PRECISION (actual_mode);
3526 if (actual_precision != TYPE_PRECISION (type1)
3527 || from_unsigned1 != TYPE_UNSIGNED (type1))
3528 mult_rhs1 = build_and_insert_cast (gsi, loc,
3529 build_nonstandard_integer_type
3530 (actual_precision, from_unsigned1),
3531 mult_rhs1);
3532 if (actual_precision != TYPE_PRECISION (type2)
3533 || from_unsigned2 != TYPE_UNSIGNED (type2))
3534 mult_rhs2 = build_and_insert_cast (gsi, loc,
3535 build_nonstandard_integer_type
3536 (actual_precision, from_unsigned2),
3537 mult_rhs2);
3539 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3540 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3542 /* Handle constants. */
3543 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3544 mult_rhs1 = fold_convert (type1, mult_rhs1);
3545 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3546 mult_rhs2 = fold_convert (type2, mult_rhs2);
3548 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3549 add_rhs);
3550 update_stmt (gsi_stmt (*gsi));
3551 widen_mul_stats.maccs_inserted++;
3552 return true;
3555 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3556 with uses in additions and subtractions to form fused multiply-add
3557 operations. Returns true if successful and MUL_STMT should be removed. */
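/* For instance (a sketch, assuming the target has an fma pattern for the
   mode and floating-point contraction is enabled):

     tmp = a * b;
     x = tmp + c;
     y = d - tmp;

   can be rewritten as x = FMA_EXPR <a, b, c> and y = FMA_EXPR <-a, b, d>,
   provided every use of tmp is converted so the multiplication becomes
   dead.  */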
3559 static bool
3560 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3562 tree mul_result = gimple_get_lhs (mul_stmt);
3563 tree type = TREE_TYPE (mul_result);
3564 gimple *use_stmt, *neguse_stmt;
3565 gassign *fma_stmt;
3566 use_operand_p use_p;
3567 imm_use_iterator imm_iter;
3569 if (FLOAT_TYPE_P (type)
3570 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3571 return false;
3573 /* We don't want to do bitfield reduction ops. */
3574 if (INTEGRAL_TYPE_P (type)
3575 && !type_has_mode_precision_p (type))
3576 return false;
3578 /* If the target doesn't support it, don't generate it. We assume that
3579 if fma isn't available then fms, fnma or fnms are not either. */
3580 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3581 return false;
3583 /* If the multiplication has zero uses, it is kept around probably because
3584 of -fnon-call-exceptions. Don't optimize it away in that case,
3585 it is DCE's job. */
3586 if (has_zero_uses (mul_result))
3587 return false;
3589 /* Make sure that the multiplication statement becomes dead after
3590 the transformation, i.e. that all uses are transformed to FMAs.
3591 This means we assume that an FMA operation has the same cost
3592 as an addition. */
3593 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3595 enum tree_code use_code;
3596 tree result = mul_result;
3597 bool negate_p = false;
3599 use_stmt = USE_STMT (use_p);
3601 if (is_gimple_debug (use_stmt))
3602 continue;
3604 /* For now restrict this operation to single basic blocks.  In theory
3605 we would want to support sinking the multiplication in
3606 m = a*b;
3607 if ()
3608 ma = m + c;
3609 else
3610 d = m;
3611 to form a fma in the then block and sink the multiplication to the
3612 else block. */
3613 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3614 return false;
3616 if (!is_gimple_assign (use_stmt))
3617 return false;
3619 use_code = gimple_assign_rhs_code (use_stmt);
3621 /* A negate on the multiplication leads to FNMA. */
3622 if (use_code == NEGATE_EXPR)
3624 ssa_op_iter iter;
3625 use_operand_p usep;
3627 result = gimple_assign_lhs (use_stmt);
3629 /* Make sure the negate statement becomes dead with this
3630 single transformation. */
3631 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3632 &use_p, &neguse_stmt))
3633 return false;
3635 /* Make sure the multiplication isn't also used on that stmt. */
3636 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3637 if (USE_FROM_PTR (usep) == mul_result)
3638 return false;
3640 /* Re-validate. */
3641 use_stmt = neguse_stmt;
3642 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3643 return false;
3644 if (!is_gimple_assign (use_stmt))
3645 return false;
3647 use_code = gimple_assign_rhs_code (use_stmt);
3648 negate_p = true;
3651 switch (use_code)
3653 case MINUS_EXPR:
3654 if (gimple_assign_rhs2 (use_stmt) == result)
3655 negate_p = !negate_p;
3656 break;
3657 case PLUS_EXPR:
3658 break;
3659 default:
3660 /* FMA can only be formed from PLUS and MINUS. */
3661 return false;
3664 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3665 by a MULT_EXPR that we'll visit later, we might be able to
3666 get a more profitable match with fnma.
3667 OTOH, if we don't, a negate / fma pair has likely lower latency
3668 than a mult / subtract pair. */
3669 if (use_code == MINUS_EXPR && !negate_p
3670 && gimple_assign_rhs1 (use_stmt) == result
3671 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3672 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3674 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3676 if (TREE_CODE (rhs2) == SSA_NAME)
3678 gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3679 if (has_single_use (rhs2)
3680 && is_gimple_assign (stmt2)
3681 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3682 return false;
3686 /* We can't handle a * b + a * b. */
3687 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3688 return false;
3690 /* While it is possible to validate whether or not the exact form
3691 that we've recognized is available in the backend, the assumption
3692 is that the transformation is never a loss. For instance, suppose
3693 the target only has the plain FMA pattern available. Consider
3694 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3695 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3696 still have 3 operations, but in the FMA form the two NEGs are
3697 independent and could be run in parallel. */
3700 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3702 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3703 enum tree_code use_code;
3704 tree addop, mulop1 = op1, result = mul_result;
3705 bool negate_p = false;
3707 if (is_gimple_debug (use_stmt))
3708 continue;
3710 use_code = gimple_assign_rhs_code (use_stmt);
3711 if (use_code == NEGATE_EXPR)
3713 result = gimple_assign_lhs (use_stmt);
3714 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3715 gsi_remove (&gsi, true);
3716 release_defs (use_stmt);
3718 use_stmt = neguse_stmt;
3719 gsi = gsi_for_stmt (use_stmt);
3720 use_code = gimple_assign_rhs_code (use_stmt);
3721 negate_p = true;
3724 if (gimple_assign_rhs1 (use_stmt) == result)
3726 addop = gimple_assign_rhs2 (use_stmt);
3727 /* a * b - c -> a * b + (-c) */
3728 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3729 addop = force_gimple_operand_gsi (&gsi,
3730 build1 (NEGATE_EXPR,
3731 type, addop),
3732 true, NULL_TREE, true,
3733 GSI_SAME_STMT);
3735 else
3737 addop = gimple_assign_rhs1 (use_stmt);
3738 /* a - b * c -> (-b) * c + a */
3739 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3740 negate_p = !negate_p;
3743 if (negate_p)
3744 mulop1 = force_gimple_operand_gsi (&gsi,
3745 build1 (NEGATE_EXPR,
3746 type, mulop1),
3747 true, NULL_TREE, true,
3748 GSI_SAME_STMT);
3750 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3751 FMA_EXPR, mulop1, op2, addop);
3752 gsi_replace (&gsi, fma_stmt, true);
3753 widen_mul_stats.fmas_inserted++;
3756 return true;
3760 /* Helper function of match_uaddsub_overflow. Return 1
3761 if USE_STMT is an unsigned overflow check ovf != 0 for
3762 STMT, -1 if USE_STMT is an unsigned overflow check ovf == 0,
3763 and 0 otherwise. */
3765 static int
3766 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3768 enum tree_code ccode = ERROR_MARK;
3769 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3770 if (gimple_code (use_stmt) == GIMPLE_COND)
3772 ccode = gimple_cond_code (use_stmt);
3773 crhs1 = gimple_cond_lhs (use_stmt);
3774 crhs2 = gimple_cond_rhs (use_stmt);
3776 else if (is_gimple_assign (use_stmt))
3778 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3780 ccode = gimple_assign_rhs_code (use_stmt);
3781 crhs1 = gimple_assign_rhs1 (use_stmt);
3782 crhs2 = gimple_assign_rhs2 (use_stmt);
3784 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3786 tree cond = gimple_assign_rhs1 (use_stmt);
3787 if (COMPARISON_CLASS_P (cond))
3789 ccode = TREE_CODE (cond);
3790 crhs1 = TREE_OPERAND (cond, 0);
3791 crhs2 = TREE_OPERAND (cond, 1);
3793 else
3794 return 0;
3796 else
3797 return 0;
3799 else
3800 return 0;
3802 if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3803 return 0;
3805 enum tree_code code = gimple_assign_rhs_code (stmt);
3806 tree lhs = gimple_assign_lhs (stmt);
3807 tree rhs1 = gimple_assign_rhs1 (stmt);
3808 tree rhs2 = gimple_assign_rhs2 (stmt);
3810 switch (ccode)
3812 case GT_EXPR:
3813 case LE_EXPR:
3814 /* r = a - b; r > a or r <= a
3815 r = a + b; a > r or a <= r or b > r or b <= r. */
3816 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3817 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3818 && crhs2 == lhs))
3819 return ccode == GT_EXPR ? 1 : -1;
3820 break;
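/* Rationale (editor's note): for unsigned r = a - b the subtraction
   wraps exactly when b > a, which is exactly when the wrapped result
   satisfies r > a; e.g. with 8-bit unsigned a = 3, b = 5 we get
   r = 254 > 3.  Likewise, unsigned r = a + b wraps exactly when r
   ends up smaller than either operand.  */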
3821 case LT_EXPR:
3822 case GE_EXPR:
3823 /* r = a - b; a < r or a >= r
3824 r = a + b; r < a or r >= a or r < b or r >= b. */
3825 if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3826 || (code == PLUS_EXPR && crhs1 == lhs
3827 && (crhs2 == rhs1 || crhs2 == rhs2)))
3828 return ccode == LT_EXPR ? 1 : -1;
3829 break;
3830 default:
3831 break;
3833 return 0;
3836 /* Recognize for unsigned x
3837 x = y - z;
3838 if (x > y)
3839 where there are other uses of x and replace it with
3840 _7 = SUB_OVERFLOW (y, z);
3841 x = REALPART_EXPR <_7>;
3842 _8 = IMAGPART_EXPR <_7>;
3843 if (_8)
3844 and similarly for addition. */
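/* A typical source form this matches (editor's sketch; names are
   hypothetical):
     unsigned x = y - z;
     if (x > y)
       handle_wrap ();
     use (x);
   Note the transform is only applied when x has uses besides the
   overflow check, per the has_single_use test below.  */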
3846 static bool
3847 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3848 enum tree_code code)
3850 tree lhs = gimple_assign_lhs (stmt);
3851 tree type = TREE_TYPE (lhs);
3852 use_operand_p use_p;
3853 imm_use_iterator iter;
3854 bool use_seen = false;
3855 bool ovf_use_seen = false;
3856 gimple *use_stmt;
3858 gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3859 if (!INTEGRAL_TYPE_P (type)
3860 || !TYPE_UNSIGNED (type)
3861 || has_zero_uses (lhs)
3862 || has_single_use (lhs)
3863 || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3864 TYPE_MODE (type)) == CODE_FOR_nothing)
3865 return false;
3867 FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3869 use_stmt = USE_STMT (use_p);
3870 if (is_gimple_debug (use_stmt))
3871 continue;
3873 if (uaddsub_overflow_check_p (stmt, use_stmt))
3874 ovf_use_seen = true;
3875 else
3876 use_seen = true;
3877 if (ovf_use_seen && use_seen)
3878 break;
3881 if (!ovf_use_seen || !use_seen)
3882 return false;
3884 tree ctype = build_complex_type (type);
3885 tree rhs1 = gimple_assign_rhs1 (stmt);
3886 tree rhs2 = gimple_assign_rhs2 (stmt);
3887 gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3888 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3889 2, rhs1, rhs2);
3890 tree ctmp = make_ssa_name (ctype);
3891 gimple_call_set_lhs (g, ctmp);
3892 gsi_insert_before (gsi, g, GSI_SAME_STMT);
3893 gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3894 build1 (REALPART_EXPR, type, ctmp));
3895 gsi_replace (gsi, g2, true);
3896 tree ovf = make_ssa_name (type);
3897 g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3898 build1 (IMAGPART_EXPR, type, ctmp));
3899 gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3901 FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3903 if (is_gimple_debug (use_stmt))
3904 continue;
3906 int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3907 if (ovf_use == 0)
3908 continue;
3909 if (gimple_code (use_stmt) == GIMPLE_COND)
3911 gcond *cond_stmt = as_a <gcond *> (use_stmt);
3912 gimple_cond_set_lhs (cond_stmt, ovf);
3913 gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3914 gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3916 else
3918 gcc_checking_assert (is_gimple_assign (use_stmt));
3919 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3921 gimple_assign_set_rhs1 (use_stmt, ovf);
3922 gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3923 gimple_assign_set_rhs_code (use_stmt,
3924 ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3926 else
3928 gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3929 == COND_EXPR);
3930 tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3931 boolean_type_node, ovf,
3932 build_int_cst (type, 0));
3933 gimple_assign_set_rhs1 (use_stmt, cond);
3936 update_stmt (use_stmt);
3938 return true;
3941 /* Return true if the target has support for divmod. */
3943 static bool
3944 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3946 /* If target supports hardware divmod insn, use it for divmod. */
3947 if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3948 return true;
3950 /* Check if libfunc for divmod is available. */
3951 rtx libfunc = optab_libfunc (divmod_optab, mode);
3952 if (libfunc != NULL_RTX)
3954 /* If optab_handler exists for div_optab, perhaps in a wider mode,
3955 we don't want to use the libfunc even if it exists for the given mode. */
3956 machine_mode div_mode;
3957 FOR_EACH_MODE_FROM (div_mode, mode)
3958 if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3959 return false;
3961 return targetm.expand_divmod_libfunc != NULL;
3964 return false;
3967 /* Check if stmt is candidate for divmod transform. */
3969 static bool
3970 divmod_candidate_p (gassign *stmt)
3972 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3973 machine_mode mode = TYPE_MODE (type);
3974 optab divmod_optab, div_optab;
3976 if (TYPE_UNSIGNED (type))
3978 divmod_optab = udivmod_optab;
3979 div_optab = udiv_optab;
3981 else
3983 divmod_optab = sdivmod_optab;
3984 div_optab = sdiv_optab;
3987 tree op1 = gimple_assign_rhs1 (stmt);
3988 tree op2 = gimple_assign_rhs2 (stmt);
3990 /* Disable the transform if either operand is a constant, since
3991 division-by-constant may have specialized expansion. */
3992 if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3993 return false;
3995 /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3996 expand using the [su]divv optabs. */
3997 if (TYPE_OVERFLOW_TRAPS (type))
3998 return false;
4000 if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
4001 return false;
4003 return true;
4006 /* This function looks for:
4007 t1 = a TRUNC_DIV_EXPR b;
4008 t2 = a TRUNC_MOD_EXPR b;
4009 and transforms it to the following sequence:
4010 complex_tmp = DIVMOD (a, b);
4011 t1 = REALPART_EXPR (complex_tmp);
4012 t2 = IMAGPART_EXPR (complex_tmp);
4013 For conditions enabling the transform see divmod_candidate_p().
4015 The pass has three parts:
4016 1) Find top_stmt which is trunc_div or trunc_mod stmt and dominates all
4017 other trunc_div_expr and trunc_mod_expr stmts.
4018 2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
4019 to stmts vector.
4020 3) Insert DIVMOD call just before top_stmt and update entries in
4021 stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
4022 IMAGPART_EXPR for mod). */
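/* C-level example (editor's sketch; f, q and r are hypothetical):
     void f (int a, int b, int *q, int *r)
     {
       *q = a / b;    -- TRUNC_DIV_EXPR
       *r = a % b;    -- TRUNC_MOD_EXPR
     }
   When divmod_candidate_p holds, both statements end up reading the
   REALPART_EXPR and IMAGPART_EXPR of one shared DIVMOD result.  */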
4024 static bool
4025 convert_to_divmod (gassign *stmt)
4027 if (stmt_can_throw_internal (stmt)
4028 || !divmod_candidate_p (stmt))
4029 return false;
4031 tree op1 = gimple_assign_rhs1 (stmt);
4032 tree op2 = gimple_assign_rhs2 (stmt);
4034 imm_use_iterator use_iter;
4035 gimple *use_stmt;
4036 auto_vec<gimple *> stmts;
4038 gimple *top_stmt = stmt;
4039 basic_block top_bb = gimple_bb (stmt);
4041 /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
4042 at least stmt and possibly other trunc_div/trunc_mod stmts
4043 that have the same operands as stmt. */
4045 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
4047 if (is_gimple_assign (use_stmt)
4048 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4049 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4050 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
4051 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
4053 if (stmt_can_throw_internal (use_stmt))
4054 continue;
4056 basic_block bb = gimple_bb (use_stmt);
4058 if (bb == top_bb)
4060 if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
4061 top_stmt = use_stmt;
4063 else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
4065 top_bb = bb;
4066 top_stmt = use_stmt;
4071 tree top_op1 = gimple_assign_rhs1 (top_stmt);
4072 tree top_op2 = gimple_assign_rhs2 (top_stmt);
4074 stmts.safe_push (top_stmt);
4075 bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
4077 /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
4078 to stmts vector. The 2nd loop will always add stmt to stmts vector, since
4079 gimple_bb (top_stmt) dominates gimple_bb (stmt), so the
4080 2nd loop ends up adding at least one trunc_mod_expr stmt. */
4082 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
4084 if (is_gimple_assign (use_stmt)
4085 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4086 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4087 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
4088 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
4090 if (use_stmt == top_stmt
4091 || stmt_can_throw_internal (use_stmt)
4092 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
4093 continue;
4095 stmts.safe_push (use_stmt);
4096 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
4097 div_seen = true;
4101 if (!div_seen)
4102 return false;
4104 /* Part 3: Create libcall to internal fn DIVMOD:
4105 divmod_tmp = DIVMOD (op1, op2). */
4107 gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
4108 tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
4109 call_stmt, "divmod_tmp");
4110 gimple_call_set_lhs (call_stmt, res);
4111 /* We rejected throwing statements above. */
4112 gimple_call_set_nothrow (call_stmt, true);
4114 /* Insert the call before top_stmt. */
4115 gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
4116 gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
4118 widen_mul_stats.divmod_calls_inserted++;
4120 /* Update all statements in stmts vector:
4121 lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
4122 lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>. */
4124 for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
4126 tree new_rhs;
4128 switch (gimple_assign_rhs_code (use_stmt))
4130 case TRUNC_DIV_EXPR:
4131 new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
4132 break;
4134 case TRUNC_MOD_EXPR:
4135 new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
4136 break;
4138 default:
4139 gcc_unreachable ();
4142 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
4143 gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
4144 update_stmt (use_stmt);
4147 return true;
4150 /* Find integer multiplications where the operands are extended from
4151 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
4152 where appropriate. */
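/* Example (editor's sketch; assumes the target provides a widening
   multiply pattern such as mulhisi3):
     short a, b;
     int c = (int) a * (int) b;
   The full-width MULT_EXPR is replaced by WIDEN_MULT_EXPR <a, b>, so
   a single widening multiply instruction is emitted instead of two
   extensions plus a full-width multiply.  */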
4154 namespace {
4156 const pass_data pass_data_optimize_widening_mul =
4158 GIMPLE_PASS, /* type */
4159 "widening_mul", /* name */
4160 OPTGROUP_NONE, /* optinfo_flags */
4161 TV_NONE, /* tv_id */
4162 PROP_ssa, /* properties_required */
4163 0, /* properties_provided */
4164 0, /* properties_destroyed */
4165 0, /* todo_flags_start */
4166 TODO_update_ssa, /* todo_flags_finish */
4169 class pass_optimize_widening_mul : public gimple_opt_pass
4171 public:
4172 pass_optimize_widening_mul (gcc::context *ctxt)
4173 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
4176 /* opt_pass methods: */
4177 virtual bool gate (function *)
4179 return flag_expensive_optimizations && optimize;
4182 virtual unsigned int execute (function *);
4184 }; // class pass_optimize_widening_mul
4186 unsigned int
4187 pass_optimize_widening_mul::execute (function *fun)
4189 basic_block bb;
4190 bool cfg_changed = false;
4192 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
4193 calculate_dominance_info (CDI_DOMINATORS);
4194 renumber_gimple_stmt_uids ();
4196 FOR_EACH_BB_FN (bb, fun)
4198 gimple_stmt_iterator gsi;
4200 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
4202 gimple *stmt = gsi_stmt (gsi);
4203 enum tree_code code;
4205 if (is_gimple_assign (stmt))
4207 code = gimple_assign_rhs_code (stmt);
4208 switch (code)
4210 case MULT_EXPR:
4211 if (!convert_mult_to_widen (stmt, &gsi)
4212 && !convert_expand_mult_copysign (stmt, &gsi)
4213 && convert_mult_to_fma (stmt,
4214 gimple_assign_rhs1 (stmt),
4215 gimple_assign_rhs2 (stmt)))
4217 gsi_remove (&gsi, true);
4218 release_defs (stmt);
4219 continue;
4221 break;
4223 case PLUS_EXPR:
4224 case MINUS_EXPR:
4225 if (!convert_plusminus_to_widen (&gsi, stmt, code))
4226 match_uaddsub_overflow (&gsi, stmt, code);
4227 break;
4229 case TRUNC_MOD_EXPR:
4230 convert_to_divmod (as_a<gassign *> (stmt));
4231 break;
4233 default:;
4236 else if (is_gimple_call (stmt)
4237 && gimple_call_lhs (stmt))
4239 tree fndecl = gimple_call_fndecl (stmt);
4240 if (fndecl
4241 && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
4243 switch (DECL_FUNCTION_CODE (fndecl))
4245 case BUILT_IN_POWF:
4246 case BUILT_IN_POW:
4247 case BUILT_IN_POWL:
4248 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
4249 && real_equal
4250 (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
4251 &dconst2)
4252 && convert_mult_to_fma (stmt,
4253 gimple_call_arg (stmt, 0),
4254 gimple_call_arg (stmt, 0)))
4256 unlink_stmt_vdef (stmt);
4257 if (gsi_remove (&gsi, true)
4258 && gimple_purge_dead_eh_edges (bb))
4259 cfg_changed = true;
4260 release_defs (stmt);
4261 continue;
4263 break;
4265 default:;
4269 gsi_next (&gsi);
4273 statistics_counter_event (fun, "widening multiplications inserted",
4274 widen_mul_stats.widen_mults_inserted);
4275 statistics_counter_event (fun, "widening maccs inserted",
4276 widen_mul_stats.maccs_inserted);
4277 statistics_counter_event (fun, "fused multiply-adds inserted",
4278 widen_mul_stats.fmas_inserted);
4279 statistics_counter_event (fun, "divmod calls inserted",
4280 widen_mul_stats.divmod_calls_inserted);
4282 return cfg_changed ? TODO_cleanup_cfg : 0;
4285 } // anon namespace
4287 gimple_opt_pass *
4288 make_pass_optimize_widening_mul (gcc::context *ctxt)
4290 return new pass_optimize_widening_mul (ctxt);