gcc/tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 /* The first mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
43 First of all, with some experiments it was found that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has fewer than N divisions *post-dominating* it.
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
63 and walks it twice.  The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
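/* As a rough sketch of the threshold N (assuming -freciprocal-math and
   a target whose min_divisions_for_recip_mul hook returns 2), the
   straight-line sequence

       x = a / d;
       y = b / d;

   is rewritten by this pass into the equivalent of

       reciptmp = 1.0 / d;
       x = a * reciptmp;
       y = b * reciptmp;

   ("reciptmp" is the temporary name created below), while a lone
   division by D is left untouched, because a reciprocal plus a multiply
   would not be cheaper than the single division.  */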
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "params.h"
113 #include "internal-fn.h"
114 #include "case-cfn-macros.h"
115 #include "optabs-libfuncs.h"
116 #include "tree-eh.h"
117 #include "targhooks.h"
119 /* This structure represents one basic block that either computes a
120 division, or is a common dominator for basic blocks that compute a
121 division. */
122 struct occurrence {
123 /* The basic block represented by this structure. */
124 basic_block bb;
126 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
127 inserted in BB. */
128 tree recip_def;
130 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
131 was inserted in BB. */
132 gimple *recip_def_stmt;
134 /* Pointer to a list of "struct occurrence"s for blocks dominated
135 by BB. */
136 struct occurrence *children;
138 /* Pointer to the next "struct occurrence" in the list of blocks
139 sharing a common dominator. */
140 struct occurrence *next;
142 /* The number of divisions that are in BB before compute_merit. The
143 number of divisions that are in BB or post-dominate it after
144 compute_merit. */
145 int num_divisions;
147 /* True if the basic block has a division, false if it is a common
148 dominator for basic blocks that do. If it is false and trapping
149 math is active, BB is not a candidate for inserting a reciprocal. */
150 bool bb_has_division;
153 static struct
155 /* Number of 1.0/X ops inserted. */
156 int rdivs_inserted;
158 /* Number of 1.0/FUNC ops inserted. */
159 int rfuncs_inserted;
160 } reciprocal_stats;
162 static struct
164 /* Number of cexpi calls inserted. */
165 int inserted;
166 } sincos_stats;
168 static struct
170 /* Number of hand-written 16-bit nop / bswaps found. */
171 int found_16bit;
173 /* Number of hand-written 32-bit nop / bswaps found. */
174 int found_32bit;
176 /* Number of hand-written 64-bit nop / bswaps found. */
177 int found_64bit;
178 } nop_stats, bswap_stats;
180 static struct
182 /* Number of widening multiplication ops inserted. */
183 int widen_mults_inserted;
185 /* Number of integer multiply-and-accumulate ops inserted. */
186 int maccs_inserted;
188 /* Number of fp fused multiply-add ops inserted. */
189 int fmas_inserted;
191 /* Number of divmod calls inserted. */
192 int divmod_calls_inserted;
193 } widen_mul_stats;
195 /* The instance of "struct occurrence" representing the highest
196 interesting block in the dominator tree. */
197 static struct occurrence *occ_head;
199 /* Allocation pool for getting instances of "struct occurrence". */
200 static object_allocator<occurrence> *occ_pool;
204 /* Allocate and return a new struct occurrence for basic block BB,
205 whose children list is headed by CHILDREN. */
206 static struct occurrence *
207 occ_new (basic_block bb, struct occurrence *children)
209 struct occurrence *occ;
211 bb->aux = occ = occ_pool->allocate ();
212 memset (occ, 0, sizeof (struct occurrence));
214 occ->bb = bb;
215 occ->children = children;
216 return occ;
220 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
221 list of "struct occurrence"s, one per basic block, having IDOM as
222 their common dominator.
224 We try to insert NEW_OCC as deep as possible in the tree, and we also
225 insert any other block that is a common dominator for BB and one
226 block already in the tree. */
228 static void
229 insert_bb (struct occurrence *new_occ, basic_block idom,
230 struct occurrence **p_head)
232 struct occurrence *occ, **p_occ;
234 for (p_occ = p_head; (occ = *p_occ) != NULL; )
236 basic_block bb = new_occ->bb, occ_bb = occ->bb;
237 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
238 if (dom == bb)
240 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
241 from its list. */
242 *p_occ = occ->next;
243 occ->next = new_occ->children;
244 new_occ->children = occ;
246 /* Try the next block (it may as well be dominated by BB). */
249 else if (dom == occ_bb)
251 /* OCC_BB dominates BB. Tail recurse to look deeper. */
252 insert_bb (new_occ, dom, &occ->children);
253 return;
256 else if (dom != idom)
258 gcc_assert (!dom->aux);
260 /* There is a dominator between IDOM and BB, add it and make
261 two children out of NEW_OCC and OCC. First, remove OCC from
262 its list. */
263 *p_occ = occ->next;
264 new_occ->next = occ;
265 occ->next = NULL;
267 /* None of the previous blocks has DOM as a dominator: if we tail
268 recursed, we would reexamine them uselessly. Just switch BB with
269 DOM, and go on looking for blocks dominated by DOM. */
270 new_occ = occ_new (dom, new_occ);
273 else
275 /* Nothing special, go on with the next element. */
276 p_occ = &occ->next;
280 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
281 new_occ->next = *p_head;
282 *p_head = new_occ;
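/* As a small example of the subset being built: if divisions by the
   same SSA name occur in blocks B2 and B3, and their nearest common
   dominator B1 contains no division itself, the "dom != idom" case
   above creates an occurrence for B1 via occ_new and makes the
   occurrences for B2 and B3 its children:

       occ_head --> B1
                      children: B2, B3   (sibling order depends on the
                                          order of registration)

   compute_merit below then credits B1 only with the divisions in
   children whose blocks post-dominate it.  */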
285 /* Register that we found a division in BB. */
287 static inline void
288 register_division_in (basic_block bb)
290 struct occurrence *occ;
292 occ = (struct occurrence *) bb->aux;
293 if (!occ)
295 occ = occ_new (bb, NULL);
296 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
299 occ->bb_has_division = true;
300 occ->num_divisions++;
304 /* Compute the number of divisions that postdominate each block in OCC and
305 its children. */
307 static void
308 compute_merit (struct occurrence *occ)
310 struct occurrence *occ_child;
311 basic_block dom = occ->bb;
313 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
315 basic_block bb;
316 if (occ_child->children)
317 compute_merit (occ_child);
319 if (flag_exceptions)
320 bb = single_noncomplex_succ (dom);
321 else
322 bb = dom;
324 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
325 occ->num_divisions += occ_child->num_divisions;
330 /* Return whether USE_STMT is a floating-point division by DEF. */
331 static inline bool
332 is_division_by (gimple *use_stmt, tree def)
334 return is_gimple_assign (use_stmt)
335 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
336 && gimple_assign_rhs2 (use_stmt) == def
337 /* Do not recognize x / x as valid division, as we are getting
338 confused later by replacing all immediate uses of x in such
339 a stmt. */
340 && gimple_assign_rhs1 (use_stmt) != def;
343 /* Walk the subset of the dominator tree rooted at OCC, setting the
344 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
345 the given basic block. The field may be left NULL, of course,
346 if it is not possible or profitable to do the optimization.
348 DEF_BSI is an iterator pointing at the statement defining DEF.
349 If RECIP_DEF is set, a dominator already has a computation that can
350 be used. */
352 static void
353 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
354 tree def, tree recip_def, int threshold)
356 tree type;
357 gassign *new_stmt;
358 gimple_stmt_iterator gsi;
359 struct occurrence *occ_child;
361 if (!recip_def
362 && (occ->bb_has_division || !flag_trapping_math)
363 && occ->num_divisions >= threshold)
365 /* Make a variable with the replacement and substitute it. */
366 type = TREE_TYPE (def);
367 recip_def = create_tmp_reg (type, "reciptmp");
368 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
369 build_one_cst (type), def);
371 if (occ->bb_has_division)
373 /* Case 1: insert before an existing division. */
374 gsi = gsi_after_labels (occ->bb);
375 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
376 gsi_next (&gsi);
378 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
380 else if (def_gsi && occ->bb == def_gsi->bb)
382 /* Case 2: insert right after the definition. Note that this will
383 never happen if the definition statement can throw, because in
384 that case the sole successor of the statement's basic block will
385 dominate all the uses as well. */
386 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
388 else
390 /* Case 3: insert in a basic block not containing defs/uses. */
391 gsi = gsi_after_labels (occ->bb);
392 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
395 reciprocal_stats.rdivs_inserted++;
397 occ->recip_def_stmt = new_stmt;
400 occ->recip_def = recip_def;
401 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
402 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
406 /* Replace the division at USE_P with a multiplication by the reciprocal, if
407 possible. */
409 static inline void
410 replace_reciprocal (use_operand_p use_p)
412 gimple *use_stmt = USE_STMT (use_p);
413 basic_block bb = gimple_bb (use_stmt);
414 struct occurrence *occ = (struct occurrence *) bb->aux;
416 if (optimize_bb_for_speed_p (bb)
417 && occ->recip_def && use_stmt != occ->recip_def_stmt)
419 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
420 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
421 SET_USE (use_p, occ->recip_def);
422 fold_stmt_inplace (&gsi);
423 update_stmt (use_stmt);
428 /* Free OCC and return one more "struct occurrence" to be freed. */
430 static struct occurrence *
431 free_bb (struct occurrence *occ)
433 struct occurrence *child, *next;
435 /* First get the two pointers hanging off OCC. */
436 next = occ->next;
437 child = occ->children;
438 occ->bb->aux = NULL;
439 occ_pool->remove (occ);
441 /* Now ensure that we don't recurse unless it is necessary. */
442 if (!child)
443 return next;
444 else
446 while (next)
447 next = free_bb (next);
449 return child;
454 /* Look for floating-point divisions among DEF's uses, and try to
455 replace them by multiplications with the reciprocal. Add
456 as many statements computing the reciprocal as needed.
458 DEF must be a GIMPLE register of a floating-point type. */
460 static void
461 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
463 use_operand_p use_p;
464 imm_use_iterator use_iter;
465 struct occurrence *occ;
466 int count = 0, threshold;
468 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
470 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
472 gimple *use_stmt = USE_STMT (use_p);
473 if (is_division_by (use_stmt, def))
475 register_division_in (gimple_bb (use_stmt));
476 count++;
480 /* Do the expensive part only if we can hope to optimize something. */
481 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
482 if (count >= threshold)
484 gimple *use_stmt;
485 for (occ = occ_head; occ; occ = occ->next)
487 compute_merit (occ);
488 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
491 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
493 if (is_division_by (use_stmt, def))
495 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
496 replace_reciprocal (use_p);
501 for (occ = occ_head; occ; )
502 occ = free_bb (occ);
504 occ_head = NULL;
507 /* Return an internal function that implements the reciprocal of CALL,
508 or IFN_LAST if there is no such function that the target supports. */
510 internal_fn
511 internal_fn_reciprocal (gcall *call)
513 internal_fn ifn;
515 switch (gimple_call_combined_fn (call))
517 CASE_CFN_SQRT:
518 ifn = IFN_RSQRT;
519 break;
521 default:
522 return IFN_LAST;
525 tree_pair types = direct_internal_fn_types (ifn, call);
526 if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
527 return IFN_LAST;
529 return ifn;
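/* For instance, on a target that supports IFN_RSQRT for the mode in
   question, the caller below rewrites, roughly,

       t = sqrtf (b);
       x = a / t;

   into

       t = .RSQRT (b);
       x = a * t;

   provided every non-debug use of T is a division by T.  */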
532 /* Go through all the floating-point SSA_NAMEs, and call
533 execute_cse_reciprocals_1 on each of them. */
534 namespace {
536 const pass_data pass_data_cse_reciprocals =
538 GIMPLE_PASS, /* type */
539 "recip", /* name */
540 OPTGROUP_NONE, /* optinfo_flags */
541 TV_NONE, /* tv_id */
542 PROP_ssa, /* properties_required */
543 0, /* properties_provided */
544 0, /* properties_destroyed */
545 0, /* todo_flags_start */
546 TODO_update_ssa, /* todo_flags_finish */
549 class pass_cse_reciprocals : public gimple_opt_pass
551 public:
552 pass_cse_reciprocals (gcc::context *ctxt)
553 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
556 /* opt_pass methods: */
557 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
558 virtual unsigned int execute (function *);
560 }; // class pass_cse_reciprocals
562 unsigned int
563 pass_cse_reciprocals::execute (function *fun)
565 basic_block bb;
566 tree arg;
568 occ_pool = new object_allocator<occurrence> ("dominators for recip");
570 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
571 calculate_dominance_info (CDI_DOMINATORS);
572 calculate_dominance_info (CDI_POST_DOMINATORS);
574 if (flag_checking)
575 FOR_EACH_BB_FN (bb, fun)
576 gcc_assert (!bb->aux);
578 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
579 if (FLOAT_TYPE_P (TREE_TYPE (arg))
580 && is_gimple_reg (arg))
582 tree name = ssa_default_def (fun, arg);
583 if (name)
584 execute_cse_reciprocals_1 (NULL, name);
587 FOR_EACH_BB_FN (bb, fun)
589 tree def;
591 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
592 gsi_next (&gsi))
594 gphi *phi = gsi.phi ();
595 def = PHI_RESULT (phi);
596 if (! virtual_operand_p (def)
597 && FLOAT_TYPE_P (TREE_TYPE (def)))
598 execute_cse_reciprocals_1 (NULL, def);
601 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
602 gsi_next (&gsi))
604 gimple *stmt = gsi_stmt (gsi);
606 if (gimple_has_lhs (stmt)
607 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
608 && FLOAT_TYPE_P (TREE_TYPE (def))
609 && TREE_CODE (def) == SSA_NAME)
610 execute_cse_reciprocals_1 (&gsi, def);
613 if (optimize_bb_for_size_p (bb))
614 continue;
616 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
617 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
618 gsi_next (&gsi))
620 gimple *stmt = gsi_stmt (gsi);
622 if (is_gimple_assign (stmt)
623 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
625 tree arg1 = gimple_assign_rhs2 (stmt);
626 gimple *stmt1;
628 if (TREE_CODE (arg1) != SSA_NAME)
629 continue;
631 stmt1 = SSA_NAME_DEF_STMT (arg1);
633 if (is_gimple_call (stmt1)
634 && gimple_call_lhs (stmt1))
636 bool fail;
637 imm_use_iterator ui;
638 use_operand_p use_p;
639 tree fndecl = NULL_TREE;
641 gcall *call = as_a <gcall *> (stmt1);
642 internal_fn ifn = internal_fn_reciprocal (call);
643 if (ifn == IFN_LAST)
645 fndecl = gimple_call_fndecl (call);
646 if (!fndecl
647 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_MD)
648 continue;
649 fndecl = targetm.builtin_reciprocal (fndecl);
650 if (!fndecl)
651 continue;
654 /* Check that all uses of the SSA name are divisions,
655 otherwise replacing the defining statement will do
656 the wrong thing. */
657 fail = false;
658 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
660 gimple *stmt2 = USE_STMT (use_p);
661 if (is_gimple_debug (stmt2))
662 continue;
663 if (!is_gimple_assign (stmt2)
664 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
665 || gimple_assign_rhs1 (stmt2) == arg1
666 || gimple_assign_rhs2 (stmt2) != arg1)
668 fail = true;
669 break;
672 if (fail)
673 continue;
675 gimple_replace_ssa_lhs (call, arg1);
676 if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
678 auto_vec<tree, 4> args;
679 for (unsigned int i = 0;
680 i < gimple_call_num_args (call); i++)
681 args.safe_push (gimple_call_arg (call, i));
682 gcall *stmt2;
683 if (ifn == IFN_LAST)
684 stmt2 = gimple_build_call_vec (fndecl, args);
685 else
686 stmt2 = gimple_build_call_internal_vec (ifn, args);
687 gimple_call_set_lhs (stmt2, arg1);
688 if (gimple_vdef (call))
690 gimple_set_vdef (stmt2, gimple_vdef (call));
691 SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2;
693 gimple_call_set_nothrow (stmt2,
694 gimple_call_nothrow_p (call));
695 gimple_set_vuse (stmt2, gimple_vuse (call));
696 gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
697 gsi_replace (&gsi2, stmt2, true);
699 else
701 if (ifn == IFN_LAST)
702 gimple_call_set_fndecl (call, fndecl);
703 else
704 gimple_call_set_internal_fn (call, ifn);
705 update_stmt (call);
707 reciprocal_stats.rfuncs_inserted++;
709 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
711 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
712 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
713 fold_stmt_inplace (&gsi);
714 update_stmt (stmt);
721 statistics_counter_event (fun, "reciprocal divs inserted",
722 reciprocal_stats.rdivs_inserted);
723 statistics_counter_event (fun, "reciprocal functions inserted",
724 reciprocal_stats.rfuncs_inserted);
726 free_dominance_info (CDI_DOMINATORS);
727 free_dominance_info (CDI_POST_DOMINATORS);
728 delete occ_pool;
729 return 0;
732 } // anon namespace
734 gimple_opt_pass *
735 make_pass_cse_reciprocals (gcc::context *ctxt)
737 return new pass_cse_reciprocals (ctxt);
740 /* Records an occurrence at statement USE_STMT in the vector of statements
741 STMTS if its block is dominated by *TOP_BB, dominates it, or *TOP_BB
742 is not yet initialized.  Returns true if the occurrence was pushed on
743 the vector. Adjusts *TOP_BB to be the basic block dominating all
744 statements in the vector. */
746 static bool
747 maybe_record_sincos (vec<gimple *> *stmts,
748 basic_block *top_bb, gimple *use_stmt)
750 basic_block use_bb = gimple_bb (use_stmt);
751 if (*top_bb
752 && (*top_bb == use_bb
753 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
754 stmts->safe_push (use_stmt);
755 else if (!*top_bb
756 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
758 stmts->safe_push (use_stmt);
759 *top_bb = use_bb;
761 else
762 return false;
764 return true;
767 /* Look for sin, cos and cexpi calls with the same argument NAME and
768 create a single call to cexpi CSEing the result in this case.
769 We first walk over all immediate uses of the argument collecting
770 statements that we can CSE in a vector and in a second pass replace
771 the statement rhs with a REALPART or IMAGPART expression on the
772 result of the cexpi call we insert before the use statement that
773 dominates all other candidates. */
775 static bool
776 execute_cse_sincos_1 (tree name)
778 gimple_stmt_iterator gsi;
779 imm_use_iterator use_iter;
780 tree fndecl, res, type;
781 gimple *def_stmt, *use_stmt, *stmt;
782 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
783 auto_vec<gimple *> stmts;
784 basic_block top_bb = NULL;
785 int i;
786 bool cfg_changed = false;
788 type = TREE_TYPE (name);
789 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
791 if (gimple_code (use_stmt) != GIMPLE_CALL
792 || !gimple_call_lhs (use_stmt))
793 continue;
795 switch (gimple_call_combined_fn (use_stmt))
797 CASE_CFN_COS:
798 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
799 break;
801 CASE_CFN_SIN:
802 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
803 break;
805 CASE_CFN_CEXPI:
806 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
807 break;
809 default:;
813 if (seen_cos + seen_sin + seen_cexpi <= 1)
814 return false;
816 /* Simply insert cexpi at the beginning of top_bb but not earlier than
817 the name def statement. */
818 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
819 if (!fndecl)
820 return false;
821 stmt = gimple_build_call (fndecl, 1, name);
822 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
823 gimple_call_set_lhs (stmt, res);
825 def_stmt = SSA_NAME_DEF_STMT (name);
826 if (!SSA_NAME_IS_DEFAULT_DEF (name)
827 && gimple_code (def_stmt) != GIMPLE_PHI
828 && gimple_bb (def_stmt) == top_bb)
830 gsi = gsi_for_stmt (def_stmt);
831 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
833 else
835 gsi = gsi_after_labels (top_bb);
836 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
838 sincos_stats.inserted++;
840 /* And adjust the recorded old call sites. */
841 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
843 tree rhs = NULL;
845 switch (gimple_call_combined_fn (use_stmt))
847 CASE_CFN_COS:
848 rhs = fold_build1 (REALPART_EXPR, type, res);
849 break;
851 CASE_CFN_SIN:
852 rhs = fold_build1 (IMAGPART_EXPR, type, res);
853 break;
855 CASE_CFN_CEXPI:
856 rhs = res;
857 break;
859 default:;
860 gcc_unreachable ();
863 /* Replace call with a copy. */
864 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
866 gsi = gsi_for_stmt (use_stmt);
867 gsi_replace (&gsi, stmt, true);
868 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
869 cfg_changed = true;
872 return cfg_changed;
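/* A sketch of the effect, assuming the target's C library provides
   cexpi (or sincos, into which the cexpi call is lowered later):

       s = sinf (a);
       c = cosf (a);

   becomes, roughly,

       sincostmp = __builtin_cexpif (a);
       s = __imag__ sincostmp;
       c = __real__ sincostmp;

   so the two trigonometric calls share a single evaluation.  */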
875 /* To evaluate powi(x,n), the floating point value x raised to the
876 constant integer exponent n, we use a hybrid algorithm that
877 combines the "window method" with look-up tables. For an
878 introduction to exponentiation algorithms and "addition chains",
879 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
880 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
881 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
882 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
884 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
885 multiplications to inline before calling the system library's pow
886 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
887 so this default never requires calling pow, powf or powl. */
889 #ifndef POWI_MAX_MULTS
890 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
891 #endif
893 /* The size of the "optimal power tree" lookup table. All
894 exponents less than this value are simply looked up in the
895 powi_table below. This threshold is also used to size the
896 cache of pseudo registers that hold intermediate results. */
897 #define POWI_TABLE_SIZE 256
899 /* The size, in bits of the window, used in the "window method"
900 exponentiation algorithm. This is equivalent to a radix of
901 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
902 #define POWI_WINDOW_SIZE 3
904 /* The following table is an efficient representation of an
905 "optimal power tree". For each value, i, the corresponding
906 value, j, in the table states that an optimal evaluation
907 sequence for calculating pow(x,i) can be found by evaluating
908 pow(x,j)*pow(x,i-j). An optimal power tree for the first
909 100 integers is given in Knuth's "Seminumerical algorithms". */
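/* For instance, powi_table[13] == 10, powi_table[10] == 5,
   powi_table[5] == 3, powi_table[3] == 2 and powi_table[2] == 1, so
   powi (x, 13) is evaluated with five multiplications:

       x2  = x   * x
       x3  = x   * x2
       x5  = x2  * x3
       x10 = x5  * x5
       x13 = x3  * x10

   which is below the 2*bits(13)-2 == 6 worst case mentioned above.  */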
911 static const unsigned char powi_table[POWI_TABLE_SIZE] =
913 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
914 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
915 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
916 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
917 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
918 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
919 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
920 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
921 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
922 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
923 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
924 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
925 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
926 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
927 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
928 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
929 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
930 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
931 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
932 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
933 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
934 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
935 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
936 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
937 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
938 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
939 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
940 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
941 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
942 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
943 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
944 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
948 /* Return the number of multiplications required to calculate
949 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
950 subroutine of powi_cost. CACHE is an array indicating
951 which exponents have already been calculated. */
953 static int
954 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
956 /* If we've already calculated this exponent, then this evaluation
957 doesn't require any additional multiplications. */
958 if (cache[n])
959 return 0;
961 cache[n] = true;
962 return powi_lookup_cost (n - powi_table[n], cache)
963 + powi_lookup_cost (powi_table[n], cache) + 1;
966 /* Return the number of multiplications required to calculate
967 powi(x,n) for an arbitrary x, given the exponent N. This
968 function needs to be kept in sync with powi_as_mults below. */
970 static int
971 powi_cost (HOST_WIDE_INT n)
973 bool cache[POWI_TABLE_SIZE];
974 unsigned HOST_WIDE_INT digit;
975 unsigned HOST_WIDE_INT val;
976 int result;
978 if (n == 0)
979 return 0;
981 /* Ignore the reciprocal when calculating the cost. */
982 val = (n < 0) ? -n : n;
984 /* Initialize the exponent cache. */
985 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
986 cache[1] = true;
988 result = 0;
990 while (val >= POWI_TABLE_SIZE)
992 if (val & 1)
994 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
995 result += powi_lookup_cost (digit, cache)
996 + POWI_WINDOW_SIZE + 1;
997 val >>= POWI_WINDOW_SIZE;
999 else
1001 val >>= 1;
1002 result++;
1006 return result + powi_lookup_cost (val, cache);
1009 /* Recursive subroutine of powi_as_mults. This function takes the
1010 array, CACHE, of already calculated exponents and an exponent N and
1011 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
1013 static tree
1014 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
1015 HOST_WIDE_INT n, tree *cache)
1017 tree op0, op1, ssa_target;
1018 unsigned HOST_WIDE_INT digit;
1019 gassign *mult_stmt;
1021 if (n < POWI_TABLE_SIZE && cache[n])
1022 return cache[n];
1024 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1026 if (n < POWI_TABLE_SIZE)
1028 cache[n] = ssa_target;
1029 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1030 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1032 else if (n & 1)
1034 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1035 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1036 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1038 else
1040 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1041 op1 = op0;
1044 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1045 gimple_set_location (mult_stmt, loc);
1046 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1048 return ssa_target;
1051 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1052 This function needs to be kept in sync with powi_cost above. */
1054 static tree
1055 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1056 tree arg0, HOST_WIDE_INT n)
1058 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1059 gassign *div_stmt;
1060 tree target;
1062 if (n == 0)
1063 return build_real (type, dconst1);
1065 memset (cache, 0, sizeof (cache));
1066 cache[1] = arg0;
1068 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1069 if (n >= 0)
1070 return result;
1072 /* If the original exponent was negative, reciprocate the result. */
1073 target = make_temp_ssa_name (type, NULL, "powmult");
1074 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1075 build_real (type, dconst1), result);
1076 gimple_set_location (div_stmt, loc);
1077 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1079 return target;
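/* For a negative exponent the multiplications are generated for -N and
   a single division finishes the job; e.g. powi (x, -3) expands to
   roughly

       powmult_1 = x * x;
       powmult_2 = x * powmult_1;
       powmult_3 = 1.0 / powmult_2;

   where "powmult" is the temporary name used above.  */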
1082 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1083 location info LOC. If the arguments are appropriate, create an
1084 equivalent sequence of statements prior to GSI using an optimal
1085 number of multiplications, and return an expression holding the
1086 result. */
1088 static tree
1089 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1090 tree arg0, HOST_WIDE_INT n)
1092 /* Avoid largest negative number. */
1093 if (n != -n
1094 && ((n >= -1 && n <= 2)
1095 || (optimize_function_for_speed_p (cfun)
1096 && powi_cost (n) <= POWI_MAX_MULTS)))
1097 return powi_as_mults (gsi, loc, arg0, n);
1099 return NULL_TREE;
1102 /* Build a gimple call statement that calls FN with argument ARG.
1103 Set the lhs of the call statement to a fresh SSA name. Insert the
1104 statement prior to GSI's current position, and return the fresh
1105 SSA name. */
1107 static tree
1108 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1109 tree fn, tree arg)
1111 gcall *call_stmt;
1112 tree ssa_target;
1114 call_stmt = gimple_build_call (fn, 1, arg);
1115 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1116 gimple_set_lhs (call_stmt, ssa_target);
1117 gimple_set_location (call_stmt, loc);
1118 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1120 return ssa_target;
1123 /* Build a gimple binary operation with the given CODE and arguments
1124 ARG0, ARG1, assigning the result to a new SSA name for variable
1125 TARGET. Insert the statement prior to GSI's current position, and
1126 return the fresh SSA name. */
1128 static tree
1129 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1130 const char *name, enum tree_code code,
1131 tree arg0, tree arg1)
1133 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1134 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1135 gimple_set_location (stmt, loc);
1136 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1137 return result;
1140 /* Build a gimple reference operation with the given CODE and argument
1141 ARG, assigning the result to a new SSA name of TYPE with NAME.
1142 Insert the statement prior to GSI's current position, and return
1143 the fresh SSA name. */
1145 static inline tree
1146 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1147 const char *name, enum tree_code code, tree arg0)
1149 tree result = make_temp_ssa_name (type, NULL, name);
1150 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1151 gimple_set_location (stmt, loc);
1152 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1153 return result;
1156 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1157 prior to GSI's current position, and return the fresh SSA name. */
1159 static tree
1160 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1161 tree type, tree val)
1163 tree result = make_ssa_name (type);
1164 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1165 gimple_set_location (stmt, loc);
1166 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1167 return result;
1170 struct pow_synth_sqrt_info
1172 bool *factors;
1173 unsigned int deepest;
1174 unsigned int num_mults;
1177 /* Return true iff the real value C can be represented as a
1178 sum of powers of 0.5 up to N. That is:
1179 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1180 Record in INFO the various parameters of the synthesis algorithm such
1181 as the factors a[i], the maximum 0.5 power and the number of
1182 multiplications that will be required. */
1184 bool
1185 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1186 struct pow_synth_sqrt_info *info)
1188 REAL_VALUE_TYPE factor = dconsthalf;
1189 REAL_VALUE_TYPE remainder = c;
1191 info->deepest = 0;
1192 info->num_mults = 0;
1193 memset (info->factors, 0, n * sizeof (bool));
1195 for (unsigned i = 0; i < n; i++)
1197 REAL_VALUE_TYPE res;
1199 /* If something inexact happened bail out now. */
1200 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1201 return false;
1203 /* We have hit zero. The number is representable as a sum
1204 of powers of 0.5. */
1205 if (real_equal (&res, &dconst0))
1207 info->factors[i] = true;
1208 info->deepest = i + 1;
1209 return true;
1211 else if (!REAL_VALUE_NEGATIVE (res))
1213 remainder = res;
1214 info->factors[i] = true;
1215 info->num_mults++;
1217 else
1218 info->factors[i] = false;
1220 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1222 return false;
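/* For example, with C == 0.625 and N >= 3 the loop above finds
   0.625 == 0.5 + 0.125 == 0.5**1 + 0.5**3, so it sets
   factors[] = { 1, 0, 1 }, deepest == 3 and num_mults == 1:
   pow (x, 0.625) can then be built as sqrt (x) * sqrt (sqrt (sqrt (x))),
   one multiplication on top of a shared chain of three sqrt calls.  */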
1225 /* Return the tree corresponding to FN being applied
1226 to ARG N times at GSI and LOC.
1227 Look up previous results from CACHE if need be.
1228 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1230 static tree
1231 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1232 tree fn, location_t loc, tree *cache)
1234 tree res = cache[n];
1235 if (!res)
1237 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1238 res = build_and_insert_call (gsi, loc, fn, prev);
1239 cache[n] = res;
1242 return res;
1245 /* Print to STREAM the repeated application of function FNAME to ARG
1246 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1247 "foo (foo (x))". */
1249 static void
1250 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1251 unsigned int n)
1253 if (n == 0)
1254 fprintf (stream, "%s", arg);
1255 else
1257 fprintf (stream, "%s (", fname);
1258 print_nested_fn (stream, fname, arg, n - 1);
1259 fprintf (stream, ")");
1263 /* Print to STREAM the fractional sequence of sqrt chains
1264 applied to ARG, described by INFO. Used for the dump file. */
1266 static void
1267 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1268 struct pow_synth_sqrt_info *info)
1270 for (unsigned int i = 0; i < info->deepest; i++)
1272 bool is_set = info->factors[i];
1273 if (is_set)
1275 print_nested_fn (stream, "sqrt", arg, i + 1);
1276 if (i != info->deepest - 1)
1277 fprintf (stream, " * ");
1282 /* Print to STREAM a representation of raising ARG to an integer
1283 power N. Used for the dump file. */
1285 static void
1286 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1288 if (n > 1)
1289 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1290 else if (n == 1)
1291 fprintf (stream, "%s", arg);
1294 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1295 square roots. Place at GSI and LOC. Limit the maximum depth
1296 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1297 result of the expanded sequence or NULL_TREE if the expansion failed.
1299 This routine assumes that ARG1 is a real number with a fractional part
1300 (the integer exponent case will have been handled earlier in
1301 gimple_expand_builtin_pow).
1303 For ARG1 > 0.0:
1304 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1305 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1306 FRAC_PART == ARG1 - WHOLE_PART:
1307 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1308 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1309 if it can be expressed as such, that is if FRAC_PART satisfies:
1310 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1311 where integer a[i] is either 0 or 1.
1313 Example:
1314 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1315 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1317 For ARG1 < 0.0 there are two approaches:
1318 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1319 is calculated as above.
1321 Example:
1322 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1323 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1325 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1326 FRAC_PART := ARG1 - WHOLE_PART
1327 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1328 Example:
1329 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1330 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1332 For ARG1 < 0.0 we choose between (A) and (B) depending on
1333 how many multiplications we'd have to do.
1334 So, for the example in (B): POW (x, -5.875), if we were to
1335 follow algorithm (A) we would produce:
1336 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1337 which contains more multiplications than approach (B).
1339 Hopefully, this approach will eliminate potentially expensive POW library
1340 calls when unsafe floating point math is enabled and allow the compiler to
1341 further optimise the multiplies, square roots and divides produced by this
1342 function. */
1344 static tree
1345 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1346 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1348 tree type = TREE_TYPE (arg0);
1349 machine_mode mode = TYPE_MODE (type);
1350 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1351 bool one_over = true;
1353 if (!sqrtfn)
1354 return NULL_TREE;
1356 if (TREE_CODE (arg1) != REAL_CST)
1357 return NULL_TREE;
1359 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1361 gcc_assert (max_depth > 0);
1362 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1364 struct pow_synth_sqrt_info synth_info;
1365 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1366 synth_info.deepest = 0;
1367 synth_info.num_mults = 0;
1369 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1370 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1372 /* The whole and fractional parts of exp. */
1373 REAL_VALUE_TYPE whole_part;
1374 REAL_VALUE_TYPE frac_part;
1376 real_floor (&whole_part, mode, &exp);
1377 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1380 REAL_VALUE_TYPE ceil_whole = dconst0;
1381 REAL_VALUE_TYPE ceil_fract = dconst0;
1383 if (neg_exp)
1385 real_ceil (&ceil_whole, mode, &exp);
1386 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1389 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1390 return NULL_TREE;
1392 /* Check whether it's more profitable to not use 1.0 / ... */
1393 if (neg_exp)
1395 struct pow_synth_sqrt_info alt_synth_info;
1396 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1397 alt_synth_info.deepest = 0;
1398 alt_synth_info.num_mults = 0;
1400 if (representable_as_half_series_p (ceil_fract, max_depth,
1401 &alt_synth_info)
1402 && alt_synth_info.deepest <= synth_info.deepest
1403 && alt_synth_info.num_mults < synth_info.num_mults)
1405 whole_part = ceil_whole;
1406 frac_part = ceil_fract;
1407 synth_info.deepest = alt_synth_info.deepest;
1408 synth_info.num_mults = alt_synth_info.num_mults;
1409 memcpy (synth_info.factors, alt_synth_info.factors,
1410 (max_depth + 1) * sizeof (bool));
1411 one_over = false;
1415 HOST_WIDE_INT n = real_to_integer (&whole_part);
1416 REAL_VALUE_TYPE cint;
1417 real_from_integer (&cint, VOIDmode, n, SIGNED);
1419 if (!real_identical (&whole_part, &cint))
1420 return NULL_TREE;
1422 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1423 return NULL_TREE;
1425 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1427 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1429 /* Calculate the integer part of the exponent. */
1430 if (n > 1)
1432 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1433 if (!integer_res)
1434 return NULL_TREE;
1437 if (dump_file)
1439 char string[64];
1441 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1442 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1444 if (neg_exp)
1446 if (one_over)
1448 fprintf (dump_file, "1.0 / (");
1449 dump_integer_part (dump_file, "x", n);
1450 if (n > 0)
1451 fprintf (dump_file, " * ");
1452 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1453 fprintf (dump_file, ")");
1455 else
1457 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1458 fprintf (dump_file, " / (");
1459 dump_integer_part (dump_file, "x", n);
1460 fprintf (dump_file, ")");
1463 else
1465 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1466 if (n > 0)
1467 fprintf (dump_file, " * ");
1468 dump_integer_part (dump_file, "x", n);
1471 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1475 tree fract_res = NULL_TREE;
1476 cache[0] = arg0;
1478 /* Calculate the fractional part of the exponent. */
1479 for (unsigned i = 0; i < synth_info.deepest; i++)
1481 if (synth_info.factors[i])
1483 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1485 if (!fract_res)
1486 fract_res = sqrt_chain;
1488 else
1489 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1490 fract_res, sqrt_chain);
1494 tree res = NULL_TREE;
1496 if (neg_exp)
1498 if (one_over)
1500 if (n > 0)
1501 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1502 fract_res, integer_res);
1503 else
1504 res = fract_res;
1506 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1507 build_real (type, dconst1), res);
1509 else
1511 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1512 fract_res, integer_res);
1515 else
1516 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1517 fract_res, integer_res);
1518 return res;
1521 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1522 with location info LOC. If possible, create an equivalent and
1523 less expensive sequence of statements prior to GSI, and return an
1524 expression holding the result. */
1526 static tree
1527 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1528 tree arg0, tree arg1)
1530 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1531 REAL_VALUE_TYPE c2, dconst3;
1532 HOST_WIDE_INT n;
1533 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1534 machine_mode mode;
1535 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1536 bool hw_sqrt_exists, c_is_int, c2_is_int;
1538 dconst1_4 = dconst1;
1539 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1541 /* If the exponent isn't a constant, there's nothing of interest
1542 to be done. */
1543 if (TREE_CODE (arg1) != REAL_CST)
1544 return NULL_TREE;
1546 /* Don't perform the operation if flag_signaling_nans is on
1547 and the operand is a signaling NaN. */
1548 if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1549 && ((TREE_CODE (arg0) == REAL_CST
1550 && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1551 || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1552 return NULL_TREE;
1554 /* If the exponent is equivalent to an integer, expand to an optimal
1555 multiplication sequence when profitable. */
1556 c = TREE_REAL_CST (arg1);
1557 n = real_to_integer (&c);
1558 real_from_integer (&cint, VOIDmode, n, SIGNED);
1559 c_is_int = real_identical (&c, &cint);
1561 if (c_is_int
1562 && ((n >= -1 && n <= 2)
1563 || (flag_unsafe_math_optimizations
1564 && speed_p
1565 && powi_cost (n) <= POWI_MAX_MULTS)))
1566 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1568 /* Attempt various optimizations using sqrt and cbrt. */
1569 type = TREE_TYPE (arg0);
1570 mode = TYPE_MODE (type);
1571 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1573 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1574 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1575 sqrt(-0) = -0. */
1576 if (sqrtfn
1577 && real_equal (&c, &dconsthalf)
1578 && !HONOR_SIGNED_ZEROS (mode))
1579 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1581 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1583 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1584 optimizations since 1./3. is not exactly representable. If x
1585 is negative and finite, the correct value of pow(x,1./3.) is
1586 a NaN with the "invalid" exception raised, because the value
1587 of 1./3. actually has an even denominator. The correct value
1588 of cbrt(x) is a negative real value. */
1589 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1590 dconst1_3 = real_value_truncate (mode, dconst_third ());
1592 if (flag_unsafe_math_optimizations
1593 && cbrtfn
1594 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1595 && real_equal (&c, &dconst1_3))
1596 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1598 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1599 if we don't have a hardware sqrt insn. */
1600 dconst1_6 = dconst1_3;
1601 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1603 if (flag_unsafe_math_optimizations
1604 && sqrtfn
1605 && cbrtfn
1606 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1607 && speed_p
1608 && hw_sqrt_exists
1609 && real_equal (&c, &dconst1_6))
1611 /* sqrt(x) */
1612 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1614 /* cbrt(sqrt(x)) */
1615 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1619 /* Attempt to expand the POW as a product of square root chains.
1620 Expand the 0.25 case even when optimising for size.
1621 if (flag_unsafe_math_optimizations
1622 && sqrtfn
1623 && hw_sqrt_exists
1624 && (speed_p || real_equal (&c, &dconst1_4))
1625 && !HONOR_SIGNED_ZEROS (mode))
1627 unsigned int max_depth = speed_p
1628 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1629 : 2;
1631 tree expand_with_sqrts
1632 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1634 if (expand_with_sqrts)
1635 return expand_with_sqrts;
1638 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1639 n = real_to_integer (&c2);
1640 real_from_integer (&cint, VOIDmode, n, SIGNED);
1641 c2_is_int = real_identical (&c2, &cint);
1643 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1645 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1646 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1648 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1649 different from pow(x, 1./3.) due to rounding and behavior with
1650 negative x, we need to constrain this transformation to unsafe
1651 math and positive x or finite math. */
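/* As an illustration, c == 5./3. gives n == 5, and the code below
   emits, roughly,

       cbrt_x = cbrt (x);
       t1 = cbrt_x * cbrt_x;
       t2 = x * t1;

   i.e. pow (x, 5./3.) == powi (x, 5/3) * powi (cbrt (x), 5%3); for a
   negative exponent the result is reciprocated at the end.  */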
1652 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1653 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1654 real_round (&c2, mode, &c2);
1655 n = real_to_integer (&c2);
1656 real_from_integer (&cint, VOIDmode, n, SIGNED);
1657 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1658 real_convert (&c2, mode, &c2);
1660 if (flag_unsafe_math_optimizations
1661 && cbrtfn
1662 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1663 && real_identical (&c2, &c)
1664 && !c2_is_int
1665 && optimize_function_for_speed_p (cfun)
1666 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1668 tree powi_x_ndiv3 = NULL_TREE;
1670 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1671 possible or profitable, give up. Skip the degenerate case when
1672 abs(n) < 3, where the result is always 1. */
1673 if (absu_hwi (n) >= 3)
1675 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1676 abs_hwi (n / 3));
1677 if (!powi_x_ndiv3)
1678 return NULL_TREE;
1681 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1682 as that creates an unnecessary variable. Instead, just produce
1683 either cbrt(x) or cbrt(x) * cbrt(x). */
1684 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1686 if (absu_hwi (n) % 3 == 1)
1687 powi_cbrt_x = cbrt_x;
1688 else
1689 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1690 cbrt_x, cbrt_x);
1692 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1693 if (absu_hwi (n) < 3)
1694 result = powi_cbrt_x;
1695 else
1696 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1697 powi_x_ndiv3, powi_cbrt_x);
1699 /* If n is negative, reciprocate the result. */
1700 if (n < 0)
1701 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1702 build_real (type, dconst1), result);
1704 return result;
1707 /* No optimizations succeeded. */
1708 return NULL_TREE;
1711 /* ARG is the argument to a cabs builtin call in GSI with location info
1712 LOC. Create a sequence of statements prior to GSI that calculates
1713 sqrt(R*R + I*I), where R and I are the real and imaginary components
1714 of ARG, respectively. Return an expression holding the result. */
1716 static tree
1717 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1719 tree real_part, imag_part, addend1, addend2, sum, result;
1720 tree type = TREE_TYPE (TREE_TYPE (arg));
1721 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1722 machine_mode mode = TYPE_MODE (type);
1724 if (!flag_unsafe_math_optimizations
1725 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1726 || !sqrtfn
1727 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1728 return NULL_TREE;
1730 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1731 REALPART_EXPR, arg);
1732 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1733 real_part, real_part);
1734 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1735 IMAGPART_EXPR, arg);
1736 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1737 imag_part, imag_part);
1738 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1739 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1741 return result;
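/* So, under -funsafe-math-optimizations with a hardware sqrt, a call
   such as cabs (z) is expanded inline into the equivalent of

       r = __real__ z;
       i = __imag__ z;
       res = sqrt (r * r + i * i);

   instead of going through the library routine.  */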
1744 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1745 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1746 an optimal number of multiplies, when n is a constant. */
1748 namespace {
1750 const pass_data pass_data_cse_sincos =
1752 GIMPLE_PASS, /* type */
1753 "sincos", /* name */
1754 OPTGROUP_NONE, /* optinfo_flags */
1755 TV_NONE, /* tv_id */
1756 PROP_ssa, /* properties_required */
1757 PROP_gimple_opt_math, /* properties_provided */
1758 0, /* properties_destroyed */
1759 0, /* todo_flags_start */
1760 TODO_update_ssa, /* todo_flags_finish */
1763 class pass_cse_sincos : public gimple_opt_pass
1765 public:
1766 pass_cse_sincos (gcc::context *ctxt)
1767 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1770 /* opt_pass methods: */
1771 virtual bool gate (function *)
1773 /* We no longer require either sincos or cexp, since powi expansion
1774 piggybacks on this pass. */
1775 return optimize;
1778 virtual unsigned int execute (function *);
1780 }; // class pass_cse_sincos
1782 unsigned int
1783 pass_cse_sincos::execute (function *fun)
1785 basic_block bb;
1786 bool cfg_changed = false;
1788 calculate_dominance_info (CDI_DOMINATORS);
1789 memset (&sincos_stats, 0, sizeof (sincos_stats));
1791 FOR_EACH_BB_FN (bb, fun)
1793 gimple_stmt_iterator gsi;
1794 bool cleanup_eh = false;
1796 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1798 gimple *stmt = gsi_stmt (gsi);
1800 /* Only the last stmt in a bb could throw, no need to call
1801 gimple_purge_dead_eh_edges if we change something in the middle
1802 of a basic block. */
1803 cleanup_eh = false;
1805 if (is_gimple_call (stmt)
1806 && gimple_call_lhs (stmt))
1808 tree arg, arg0, arg1, result;
1809 HOST_WIDE_INT n;
1810 location_t loc;
1812 switch (gimple_call_combined_fn (stmt))
1814 CASE_CFN_COS:
1815 CASE_CFN_SIN:
1816 CASE_CFN_CEXPI:
1817 /* Make sure we have either sincos or cexp. */
1818 if (!targetm.libc_has_function (function_c99_math_complex)
1819 && !targetm.libc_has_function (function_sincos))
1820 break;
1822 arg = gimple_call_arg (stmt, 0);
1823 if (TREE_CODE (arg) == SSA_NAME)
1824 cfg_changed |= execute_cse_sincos_1 (arg);
1825 break;
1827 CASE_CFN_POW:
1828 arg0 = gimple_call_arg (stmt, 0);
1829 arg1 = gimple_call_arg (stmt, 1);
1831 loc = gimple_location (stmt);
1832 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1834 if (result)
1836 tree lhs = gimple_get_lhs (stmt);
1837 gassign *new_stmt = gimple_build_assign (lhs, result);
1838 gimple_set_location (new_stmt, loc);
1839 unlink_stmt_vdef (stmt);
1840 gsi_replace (&gsi, new_stmt, true);
1841 cleanup_eh = true;
1842 if (gimple_vdef (stmt))
1843 release_ssa_name (gimple_vdef (stmt));
1845 break;
1847 CASE_CFN_POWI:
1848 arg0 = gimple_call_arg (stmt, 0);
1849 arg1 = gimple_call_arg (stmt, 1);
1850 loc = gimple_location (stmt);
1852 if (real_minus_onep (arg0))
1854 tree t0, t1, cond, one, minus_one;
1855 gassign *stmt;
1857 t0 = TREE_TYPE (arg0);
1858 t1 = TREE_TYPE (arg1);
1859 one = build_real (t0, dconst1);
1860 minus_one = build_real (t0, dconstm1);
1862 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1863 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1864 arg1, build_int_cst (t1, 1));
1865 gimple_set_location (stmt, loc);
1866 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1868 result = make_temp_ssa_name (t0, NULL, "powi");
1869 stmt = gimple_build_assign (result, COND_EXPR, cond,
1870 minus_one, one);
1871 gimple_set_location (stmt, loc);
1872 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1874 else
1876 if (!tree_fits_shwi_p (arg1))
1877 break;
1879 n = tree_to_shwi (arg1);
1880 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1883 if (result)
1885 tree lhs = gimple_get_lhs (stmt);
1886 gassign *new_stmt = gimple_build_assign (lhs, result);
1887 gimple_set_location (new_stmt, loc);
1888 unlink_stmt_vdef (stmt);
1889 gsi_replace (&gsi, new_stmt, true);
1890 cleanup_eh = true;
1891 if (gimple_vdef (stmt))
1892 release_ssa_name (gimple_vdef (stmt));
1894 break;
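/* Rough illustration of the POWI case above: a constant base of -1.0 is
   rewritten branchlessly using the parity of the exponent,

     y = __builtin_powi (-1.0, k);  ->  powi_cond = k & 1;
                                        y = powi_cond ? -1.0 : 1.0;

   while other cases with a constant exponent go through
   gimple_expand_builtin_powi.  */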
1896 CASE_CFN_CABS:
1897 arg0 = gimple_call_arg (stmt, 0);
1898 loc = gimple_location (stmt);
1899 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1901 if (result)
1903 tree lhs = gimple_get_lhs (stmt);
1904 gassign *new_stmt = gimple_build_assign (lhs, result);
1905 gimple_set_location (new_stmt, loc);
1906 unlink_stmt_vdef (stmt);
1907 gsi_replace (&gsi, new_stmt, true);
1908 cleanup_eh = true;
1909 if (gimple_vdef (stmt))
1910 release_ssa_name (gimple_vdef (stmt));
1912 break;
1914 default:;
1918 if (cleanup_eh)
1919 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1922 statistics_counter_event (fun, "sincos statements inserted",
1923 sincos_stats.inserted);
1925 return cfg_changed ? TODO_cleanup_cfg : 0;
1928 } // anon namespace
1930 gimple_opt_pass *
1931 make_pass_cse_sincos (gcc::context *ctxt)
1933 return new pass_cse_sincos (ctxt);
1936 /* A symbolic number structure is used to detect byte permutation and selection
1937 patterns of a source. To achieve that, its field N contains an artificial
1938 number consisting of BITS_PER_MARKER sized markers tracking where each
1939 byte comes from in the source:
1941 0 - target byte has the value 0
1942 FF - target byte has an unknown value (e.g. due to sign extension)
1943 1..size - marker value is the byte index in the source (0 for lsb).
1945 To detect permutations on memory sources (arrays and structures), a symbolic
1946 number is also associated with:
1947 - a base address BASE_ADDR and an OFFSET giving the address of the source;
1948 - a range which gives the difference between the highest and lowest accessed
1949 memory location to make such a symbolic number;
1950 - the address SRC of the source element of lowest address as a convenience
1951 to easily get BASE_ADDR + offset + lowest bytepos;
1952 - the number of expressions N_OPS bitwise ORed together, to represent the
1953 approximate cost of the computation.
1955 Note 1: the range is different from size as size reflects the size of the
1956 type of the current expression. For instance, for an array char a[],
1957 (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
1958 (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
1959 time a range of 1.
1961 Note 2: for non-memory sources, range holds the same value as size.
1963 Note 3: SRC points to the SSA_NAME in case of non-memory source. */
1965 struct symbolic_number {
1966 uint64_t n;
1967 tree type;
1968 tree base_addr;
1969 tree offset;
1970 HOST_WIDE_INT bytepos;
1971 tree src;
1972 tree alias_set;
1973 tree vuse;
1974 unsigned HOST_WIDE_INT range;
1975 int n_ops;
1978 #define BITS_PER_MARKER 8
1979 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1980 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1981 #define HEAD_MARKER(n, size) \
1982 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1984 /* The number which the find_bswap_or_nop_1 result should match in
1985 order to have a nop. The number is masked according to the size of
1986 the symbolic number before using it. */
1987 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1988 (uint64_t)0x08070605 << 32 | 0x04030201)
1990 /* The number which the find_bswap_or_nop_1 result should match in
1991 order to have a byte swap. The number is masked according to the
1992 size of the symbolic number before using it. */
1993 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1994 (uint64_t)0x01020304 << 32 | 0x05060708)
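/* Rough illustration of the marker scheme: for a 4-byte value X the
   initial symbolic number assigns marker 1 to the lsb and marker 4 to the
   msb, i.e. 0x04030201 (CMPNOP masked to 4 bytes).  A sequence such as

     uint32_t y = (x >> 24)
                  | ((x >> 8) & 0x0000ff00)
                  | ((x << 8) & 0x00ff0000)
                  | (x << 24);

   permutes the markers into 0x01020304 (CMPXCHG masked to 4 bytes), so Y
   is recognized as a byte swap of X.  */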
1996 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1997 number N. Return false if the requested operation is not permitted
1998 on a symbolic number. */
2000 static inline bool
2001 do_shift_rotate (enum tree_code code,
2002 struct symbolic_number *n,
2003 int count)
2005 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2006 unsigned head_marker;
2008 if (count % BITS_PER_UNIT != 0)
2009 return false;
2010 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
2012 /* Zero out the extra bits of N in order to avoid them being shifted
2013 into the significant bits. */
2014 if (size < 64 / BITS_PER_MARKER)
2015 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2017 switch (code)
2019 case LSHIFT_EXPR:
2020 n->n <<= count;
2021 break;
2022 case RSHIFT_EXPR:
2023 head_marker = HEAD_MARKER (n->n, size);
2024 n->n >>= count;
2025 /* Arithmetic shift of signed type: result is dependent on the value. */
2026 if (!TYPE_UNSIGNED (n->type) && head_marker)
2027 for (i = 0; i < count / BITS_PER_MARKER; i++)
2028 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2029 << ((size - 1 - i) * BITS_PER_MARKER);
2030 break;
2031 case LROTATE_EXPR:
2032 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
2033 break;
2034 case RROTATE_EXPR:
2035 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
2036 break;
2037 default:
2038 return false;
2040 /* Zero unused bits for size. */
2041 if (size < 64 / BITS_PER_MARKER)
2042 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2043 return true;
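/* Rough worked example of do_shift_rotate on a 4-byte symbolic number
   N = 0x04030201 with COUNT = 8 (one byte):

     LSHIFT_EXPR   ->  0x03020100  (the lsb of the result is known zero)
     RSHIFT_EXPR   ->  0x00040302  (unsigned type: the msb becomes zero)
     RSHIFT_EXPR   ->  0xff040302  (signed type with a non-zero head marker:
                                    vacated bytes become MARKER_BYTE_UNKNOWN)
     LROTATE_EXPR  ->  0x03020104.  */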
2046 /* Perform sanity checking for the symbolic number N and the gimple
2047 statement STMT. */
2049 static inline bool
2050 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
2052 tree lhs_type;
2054 lhs_type = gimple_expr_type (stmt);
2056 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
2057 return false;
2059 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
2060 return false;
2062 return true;
2065 /* Initialize the symbolic number N for the bswap pass from the base element
2066 SRC manipulated by the bitwise OR expression. */
2068 static bool
2069 init_symbolic_number (struct symbolic_number *n, tree src)
2071 int size;
2073 if (! INTEGRAL_TYPE_P (TREE_TYPE (src)))
2074 return false;
2076 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2077 n->src = src;
2079 /* Set up the symbolic number N by setting each byte to a value between 1 and
2080 the byte size of rhs1. The highest order byte is set to the size in bytes
2081 and the lowest order byte to 1. */
2082 n->type = TREE_TYPE (src);
2083 size = TYPE_PRECISION (n->type);
2084 if (size % BITS_PER_UNIT != 0)
2085 return false;
2086 size /= BITS_PER_UNIT;
2087 if (size > 64 / BITS_PER_MARKER)
2088 return false;
2089 n->range = size;
2090 n->n = CMPNOP;
2091 n->n_ops = 1;
2093 if (size < 64 / BITS_PER_MARKER)
2094 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2096 return true;
2099 /* Check if STMT might be a byte swap or a nop from a memory source and return
2100 the answer. If so, REF is that memory source and the base of the memory area
2101 accessed and the offset of the access from that base are recorded in N. */
2103 bool
2104 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2106 /* Leaf node is an array or component ref. Memorize its base and
2107 offset from base to compare to other such leaf node. */
2108 HOST_WIDE_INT bitsize, bitpos;
2109 machine_mode mode;
2110 int unsignedp, reversep, volatilep;
2111 tree offset, base_addr;
2113 /* Not prepared to handle PDP endian. */
2114 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2115 return false;
2117 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2118 return false;
2120 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2121 &unsignedp, &reversep, &volatilep);
2123 if (TREE_CODE (base_addr) == MEM_REF)
2125 offset_int bit_offset = 0;
2126 tree off = TREE_OPERAND (base_addr, 1);
2128 if (!integer_zerop (off))
2130 offset_int boff, coff = mem_ref_offset (base_addr);
2131 boff = coff << LOG2_BITS_PER_UNIT;
2132 bit_offset += boff;
2135 base_addr = TREE_OPERAND (base_addr, 0);
2137 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2138 if (wi::neg_p (bit_offset))
2140 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2141 offset_int tem = wi::bit_and_not (bit_offset, mask);
2142 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2143 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2144 bit_offset -= tem;
2145 tem >>= LOG2_BITS_PER_UNIT;
2146 if (offset)
2147 offset = size_binop (PLUS_EXPR, offset,
2148 wide_int_to_tree (sizetype, tem));
2149 else
2150 offset = wide_int_to_tree (sizetype, tem);
2153 bitpos += bit_offset.to_shwi ();
2156 if (bitpos % BITS_PER_UNIT)
2157 return false;
2158 if (bitsize % BITS_PER_UNIT)
2159 return false;
2160 if (reversep)
2161 return false;
2163 if (!init_symbolic_number (n, ref))
2164 return false;
2165 n->base_addr = base_addr;
2166 n->offset = offset;
2167 n->bytepos = bitpos / BITS_PER_UNIT;
2168 n->alias_set = reference_alias_ptr_type (ref);
2169 n->vuse = gimple_vuse (stmt);
2170 return true;
2173 /* Compute the symbolic number N representing the result of a bitwise OR on 2
2174 symbolic numbers N1 and N2 whose source statements are respectively
2175 SOURCE_STMT1 and SOURCE_STMT2. */
2177 static gimple *
2178 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2179 gimple *source_stmt2, struct symbolic_number *n2,
2180 struct symbolic_number *n)
2182 int i, size;
2183 uint64_t mask;
2184 gimple *source_stmt;
2185 struct symbolic_number *n_start;
2187 tree rhs1 = gimple_assign_rhs1 (source_stmt1);
2188 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2189 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2190 rhs1 = TREE_OPERAND (rhs1, 0);
2191 tree rhs2 = gimple_assign_rhs1 (source_stmt2);
2192 if (TREE_CODE (rhs2) == BIT_FIELD_REF
2193 && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
2194 rhs2 = TREE_OPERAND (rhs2, 0);
2196 /* Sources are different, cancel bswap if they are not memory locations with
2197 the same base (array, structure, ...). */
2198 if (rhs1 != rhs2)
2200 uint64_t inc;
2201 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2202 struct symbolic_number *toinc_n_ptr, *n_end;
2203 basic_block bb1, bb2;
2205 if (!n1->base_addr || !n2->base_addr
2206 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2207 return NULL;
2209 if (!n1->offset != !n2->offset
2210 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2211 return NULL;
2213 if (n1->bytepos < n2->bytepos)
2215 n_start = n1;
2216 start_sub = n2->bytepos - n1->bytepos;
2218 else
2220 n_start = n2;
2221 start_sub = n1->bytepos - n2->bytepos;
2224 bb1 = gimple_bb (source_stmt1);
2225 bb2 = gimple_bb (source_stmt2);
2226 if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
2227 source_stmt = source_stmt1;
2228 else
2229 source_stmt = source_stmt2;
2231 /* Find the highest address at which a load is performed and
2232 compute related info. */
2233 end1 = n1->bytepos + (n1->range - 1);
2234 end2 = n2->bytepos + (n2->range - 1);
2235 if (end1 < end2)
2237 end = end2;
2238 end_sub = end2 - end1;
2240 else
2242 end = end1;
2243 end_sub = end1 - end2;
2245 n_end = (end2 > end1) ? n2 : n1;
2247 /* Find symbolic number whose lsb is the most significant. */
2248 if (BYTES_BIG_ENDIAN)
2249 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2250 else
2251 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2253 n->range = end - n_start->bytepos + 1;
2255 /* Check that the range of memory covered can be represented by
2256 a symbolic number. */
2257 if (n->range > 64 / BITS_PER_MARKER)
2258 return NULL;
2260 /* Reinterpret the byte marks in the symbolic number holding the value of
2261 bigger weight, according to the target endianness. */
2262 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2263 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2264 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2266 unsigned marker
2267 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2268 if (marker && marker != MARKER_BYTE_UNKNOWN)
2269 toinc_n_ptr->n += inc;
2272 else
2274 n->range = n1->range;
2275 n_start = n1;
2276 source_stmt = source_stmt1;
2279 if (!n1->alias_set
2280 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2281 n->alias_set = n1->alias_set;
2282 else
2283 n->alias_set = ptr_type_node;
2284 n->vuse = n_start->vuse;
2285 n->base_addr = n_start->base_addr;
2286 n->offset = n_start->offset;
2287 n->src = n_start->src;
2288 n->bytepos = n_start->bytepos;
2289 n->type = n_start->type;
2290 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2292 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2294 uint64_t masked1, masked2;
2296 masked1 = n1->n & mask;
2297 masked2 = n2->n & mask;
2298 if (masked1 && masked2 && masked1 != masked2)
2299 return NULL;
2301 n->n = n1->n | n2->n;
2302 n->n_ops = n1->n_ops + n2->n_ops;
2304 return source_stmt;
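/* Rough illustration of the merge, assuming a little-endian target: for

     unsigned char a[2];
     unsigned short r = a[0] | (a[1] << 8);

   N1 describes the load of a[0] (bytepos 0, marker 1) and N2 the shifted
   load of a[1] (bytepos 1, marker 1 in the second byte).  The merged
   number keeps base A and bytepos 0, extends the range to 2 and holds the
   markers 0x0201, i.e. a plain native-endian 16-bit read of A.  */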
2307 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2308 the operation given by the rhs of STMT on the result. If the operation
2309 could successfully be executed the function returns a gimple stmt whose
2310 rhs's first tree is the expression of the source operand and NULL
2311 otherwise. */
2313 static gimple *
2314 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2316 enum tree_code code;
2317 tree rhs1, rhs2 = NULL;
2318 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2319 enum gimple_rhs_class rhs_class;
2321 if (!limit || !is_gimple_assign (stmt))
2322 return NULL;
2324 rhs1 = gimple_assign_rhs1 (stmt);
2326 if (find_bswap_or_nop_load (stmt, rhs1, n))
2327 return stmt;
2329 /* Handle BIT_FIELD_REF. */
2330 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2331 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2333 unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1));
2334 unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2));
2335 if (bitpos % BITS_PER_UNIT == 0
2336 && bitsize % BITS_PER_UNIT == 0
2337 && init_symbolic_number (n, TREE_OPERAND (rhs1, 0)))
2339 /* Handle big-endian bit numbering in BIT_FIELD_REF. */
2340 if (BYTES_BIG_ENDIAN)
2341 bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize;
2343 /* Shift. */
2344 if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos))
2345 return NULL;
2347 /* Mask. */
2348 uint64_t mask = 0;
2349 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2350 for (unsigned i = 0; i < bitsize / BITS_PER_UNIT;
2351 i++, tmp <<= BITS_PER_UNIT)
2352 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2353 n->n &= mask;
2355 /* Convert. */
2356 n->type = TREE_TYPE (rhs1);
2357 if (!n->base_addr)
2358 n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2360 return verify_symbolic_number_p (n, stmt) ? stmt : NULL;
2363 return NULL;
2366 if (TREE_CODE (rhs1) != SSA_NAME)
2367 return NULL;
2369 code = gimple_assign_rhs_code (stmt);
2370 rhs_class = gimple_assign_rhs_class (stmt);
2371 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2373 if (rhs_class == GIMPLE_BINARY_RHS)
2374 rhs2 = gimple_assign_rhs2 (stmt);
2376 /* Handle unary rhs and binary rhs with integer constants as second
2377 operand. */
2379 if (rhs_class == GIMPLE_UNARY_RHS
2380 || (rhs_class == GIMPLE_BINARY_RHS
2381 && TREE_CODE (rhs2) == INTEGER_CST))
2383 if (code != BIT_AND_EXPR
2384 && code != LSHIFT_EXPR
2385 && code != RSHIFT_EXPR
2386 && code != LROTATE_EXPR
2387 && code != RROTATE_EXPR
2388 && !CONVERT_EXPR_CODE_P (code))
2389 return NULL;
2391 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2393 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2394 we have to initialize the symbolic number. */
2395 if (!source_stmt1)
2397 if (gimple_assign_load_p (stmt)
2398 || !init_symbolic_number (n, rhs1))
2399 return NULL;
2400 source_stmt1 = stmt;
2403 switch (code)
2405 case BIT_AND_EXPR:
2407 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2408 uint64_t val = int_cst_value (rhs2), mask = 0;
2409 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2411 /* Only constants masking full bytes are allowed. */
2412 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2413 if ((val & tmp) != 0 && (val & tmp) != tmp)
2414 return NULL;
2415 else if (val & tmp)
2416 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2418 n->n &= mask;
2420 break;
2421 case LSHIFT_EXPR:
2422 case RSHIFT_EXPR:
2423 case LROTATE_EXPR:
2424 case RROTATE_EXPR:
2425 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2426 return NULL;
2427 break;
2428 CASE_CONVERT:
2430 int i, type_size, old_type_size;
2431 tree type;
2433 type = gimple_expr_type (stmt);
2434 type_size = TYPE_PRECISION (type);
2435 if (type_size % BITS_PER_UNIT != 0)
2436 return NULL;
2437 type_size /= BITS_PER_UNIT;
2438 if (type_size > 64 / BITS_PER_MARKER)
2439 return NULL;
2441 /* Sign extension: result is dependent on the value. */
2442 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2443 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2444 && HEAD_MARKER (n->n, old_type_size))
2445 for (i = 0; i < type_size - old_type_size; i++)
2446 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2447 << ((type_size - 1 - i) * BITS_PER_MARKER);
2449 if (type_size < 64 / BITS_PER_MARKER)
2451 /* If STMT casts to a smaller type mask out the bits not
2452 belonging to the target type. */
2453 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2455 n->type = type;
2456 if (!n->base_addr)
2457 n->range = type_size;
2459 break;
2460 default:
2461 return NULL;
2463 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2466 /* Handle binary rhs. */
2468 if (rhs_class == GIMPLE_BINARY_RHS)
2470 struct symbolic_number n1, n2;
2471 gimple *source_stmt, *source_stmt2;
2473 if (code != BIT_IOR_EXPR)
2474 return NULL;
2476 if (TREE_CODE (rhs2) != SSA_NAME)
2477 return NULL;
2479 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2481 switch (code)
2483 case BIT_IOR_EXPR:
2484 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2486 if (!source_stmt1)
2487 return NULL;
2489 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2491 if (!source_stmt2)
2492 return NULL;
2494 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2495 return NULL;
2497 if (!n1.vuse != !n2.vuse
2498 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2499 return NULL;
2501 source_stmt
2502 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2504 if (!source_stmt)
2505 return NULL;
2507 if (!verify_symbolic_number_p (n, stmt))
2508 return NULL;
2510 break;
2511 default:
2512 return NULL;
2514 return source_stmt;
2516 return NULL;
2519 /* Check if STMT completes a bswap implementation or a read in a given
2520 endianness consisting of ORs, SHIFTs and ANDs, and set *BSWAP
2521 accordingly. It also sets N to represent the kind of operations
2522 performed: size of the resulting expression and whether it works on
2523 a memory source, and if so its alias set and vuse. Finally, the
2524 function returns a stmt whose rhs's first tree is the source
2525 expression. */
2527 static gimple *
2528 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2530 unsigned rsize;
2531 uint64_t tmpn, mask;
2532 /* The number which the find_bswap_or_nop_1 result should match in order
2533 to have a full byte swap. The number is shifted to the right
2534 according to the size of the symbolic number before using it. */
2535 uint64_t cmpxchg = CMPXCHG;
2536 uint64_t cmpnop = CMPNOP;
2538 gimple *ins_stmt;
2539 int limit;
2541 /* The last parameter determines the search depth limit. It usually
2542 correlates directly to the number n of bytes to be touched. We
2543 increase that number by log2(n) + 1 here in order to also
2544 cover signed -> unsigned conversions of the src operand as can be seen
2545 in libgcc, and an initial shift/and operation of the src operand. */
2546 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2547 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2548 ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2550 if (!ins_stmt)
2551 return NULL;
2553 /* Find real size of result (highest non-zero byte). */
2554 if (n->base_addr)
2555 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2556 else
2557 rsize = n->range;
2559 /* Zero out the bits corresponding to untouched bytes in original gimple
2560 expression. */
2561 if (n->range < (int) sizeof (int64_t))
2563 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2564 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2565 cmpnop &= mask;
2568 /* Zero out the bits corresponding to unused bytes in the result of the
2569 gimple expression. */
2570 if (rsize < n->range)
2572 if (BYTES_BIG_ENDIAN)
2574 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2575 cmpxchg &= mask;
2576 cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
2578 else
2580 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2581 cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
2582 cmpnop &= mask;
2584 n->range = rsize;
2587 /* A complete byte swap should make the symbolic number start with
2588 the largest digit in the highest order byte. An unchanged symbolic
2589 number indicates a read with the same endianness as the target architecture. */
2590 if (n->n == cmpnop)
2591 *bswap = false;
2592 else if (n->n == cmpxchg)
2593 *bswap = true;
2594 else
2595 return NULL;
2597 /* Useless bit manipulation performed by code. */
2598 if (!n->base_addr && n->n == cmpnop && n->n_ops == 1)
2599 return NULL;
2601 n->range *= BITS_PER_UNIT;
2602 return ins_stmt;
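/* Rough worked example of the masking above: for a 2-byte range, cmpnop
   is masked down to 0x0201 and cmpxchg is shifted down to 0x0102.  A
   symbolic number equal to the former denotes a native-endian read
   (*BSWAP = false), one equal to the latter a byte swap (*BSWAP = true);
   anything else makes find_bswap_or_nop give up.  */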
2605 namespace {
2607 const pass_data pass_data_optimize_bswap =
2609 GIMPLE_PASS, /* type */
2610 "bswap", /* name */
2611 OPTGROUP_NONE, /* optinfo_flags */
2612 TV_NONE, /* tv_id */
2613 PROP_ssa, /* properties_required */
2614 0, /* properties_provided */
2615 0, /* properties_destroyed */
2616 0, /* todo_flags_start */
2617 0, /* todo_flags_finish */
2620 class pass_optimize_bswap : public gimple_opt_pass
2622 public:
2623 pass_optimize_bswap (gcc::context *ctxt)
2624 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2627 /* opt_pass methods: */
2628 virtual bool gate (function *)
2630 return flag_expensive_optimizations && optimize;
2633 virtual unsigned int execute (function *);
2635 }; // class pass_optimize_bswap
2637 /* Perform the bswap optimization: replace the expression computed in the rhs
2638 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2639 Which of these alternatives replaces the rhs is given by N->base_addr (non
2640 null if a load is needed) and BSWAP. The type, VUSE and alias set of the
2641 load to perform are also given in N while the builtin bswap to invoke is
2642 given in FNDECL. Finally, if a load is involved, INS_STMT refers to one of
2643 the load statements involved to construct the rhs in CUR_STMT and N->range
2644 gives the size of the rhs expression for maintaining some statistics.
2646 Note that if the replacement involves a load, CUR_STMT is moved just after
2647 INS_STMT to do the load with the same VUSE which can lead to CUR_STMT
2648 changing basic block. */
2650 static bool
2651 bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl,
2652 tree bswap_type, tree load_type, struct symbolic_number *n,
2653 bool bswap)
2655 gimple_stmt_iterator gsi;
2656 tree src, tmp, tgt;
2657 gimple *bswap_stmt;
2659 gsi = gsi_for_stmt (cur_stmt);
2660 src = n->src;
2661 tgt = gimple_assign_lhs (cur_stmt);
2663 /* Need to load the value from memory first. */
2664 if (n->base_addr)
2666 gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt);
2667 tree addr_expr, addr_tmp, val_expr, val_tmp;
2668 tree load_offset_ptr, aligned_load_type;
2669 gimple *addr_stmt, *load_stmt;
2670 unsigned align;
2671 HOST_WIDE_INT load_offset = 0;
2672 basic_block ins_bb, cur_bb;
2674 ins_bb = gimple_bb (ins_stmt);
2675 cur_bb = gimple_bb (cur_stmt);
2676 if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
2677 return false;
2679 align = get_object_alignment (src);
2681 /* Move cur_stmt just before one of the loads of the original
2682 expression to ensure it has the same VUSE. See PR61517 for what could
2683 go wrong. */
2684 if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
2685 reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2686 gsi_move_before (&gsi, &gsi_ins);
2687 gsi = gsi_for_stmt (cur_stmt);
2689 /* Compute address to load from and cast according to the size
2690 of the load. */
2691 addr_expr = build_fold_addr_expr (unshare_expr (src));
2692 if (is_gimple_mem_ref_addr (addr_expr))
2693 addr_tmp = addr_expr;
2694 else
2696 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2697 "load_src");
2698 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2699 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2702 /* Perform the load. */
2703 aligned_load_type = load_type;
2704 if (align < TYPE_ALIGN (load_type))
2705 aligned_load_type = build_aligned_type (load_type, align);
2706 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2707 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2708 load_offset_ptr);
2710 if (!bswap)
2712 if (n->range == 16)
2713 nop_stats.found_16bit++;
2714 else if (n->range == 32)
2715 nop_stats.found_32bit++;
2716 else
2718 gcc_assert (n->range == 64);
2719 nop_stats.found_64bit++;
2722 /* Convert the result of load if necessary. */
2723 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2725 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2726 "load_dst");
2727 load_stmt = gimple_build_assign (val_tmp, val_expr);
2728 gimple_set_vuse (load_stmt, n->vuse);
2729 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2730 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2732 else
2734 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2735 gimple_set_vuse (cur_stmt, n->vuse);
2737 update_stmt (cur_stmt);
2739 if (dump_file)
2741 fprintf (dump_file,
2742 "%d bit load in target endianness found at: ",
2743 (int) n->range);
2744 print_gimple_stmt (dump_file, cur_stmt, 0);
2746 return true;
2748 else
2750 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2751 load_stmt = gimple_build_assign (val_tmp, val_expr);
2752 gimple_set_vuse (load_stmt, n->vuse);
2753 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2755 src = val_tmp;
2757 else if (!bswap)
2759 gimple *g;
2760 if (!useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
2762 if (!is_gimple_val (src))
2763 return false;
2764 g = gimple_build_assign (tgt, NOP_EXPR, src);
2766 else
2767 g = gimple_build_assign (tgt, src);
2768 if (n->range == 16)
2769 nop_stats.found_16bit++;
2770 else if (n->range == 32)
2771 nop_stats.found_32bit++;
2772 else
2774 gcc_assert (n->range == 64);
2775 nop_stats.found_64bit++;
2777 if (dump_file)
2779 fprintf (dump_file,
2780 "%d bit reshuffle in target endianness found at: ",
2781 (int) n->range);
2782 print_gimple_stmt (dump_file, cur_stmt, 0);
2784 gsi_replace (&gsi, g, true);
2785 return true;
2787 else if (TREE_CODE (src) == BIT_FIELD_REF)
2788 src = TREE_OPERAND (src, 0);
2790 if (n->range == 16)
2791 bswap_stats.found_16bit++;
2792 else if (n->range == 32)
2793 bswap_stats.found_32bit++;
2794 else
2796 gcc_assert (n->range == 64);
2797 bswap_stats.found_64bit++;
2800 tmp = src;
2802 /* Convert the src expression if necessary. */
2803 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2805 gimple *convert_stmt;
2807 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2808 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2809 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2812 /* The canonical form for a 16-bit bswap is a rotate expression. Only 16-bit
2813 values are considered, as rotation of 2N-bit values by N bits is generally
2814 not equivalent to a bswap. Consider for instance 0x01020304 r>> 16, which
2815 gives 0x03040102 while a bswap of that value is 0x04030201. */
2816 if (bswap && n->range == 16)
2818 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2819 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2820 bswap_stmt = gimple_build_assign (NULL, src);
2822 else
2823 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2825 tmp = tgt;
2827 /* Convert the result if necessary. */
2828 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2830 gimple *convert_stmt;
2832 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2833 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2834 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2837 gimple_set_lhs (bswap_stmt, tmp);
2839 if (dump_file)
2841 fprintf (dump_file, "%d bit bswap implementation found at: ",
2842 (int) n->range);
2843 print_gimple_stmt (dump_file, cur_stmt, 0);
2846 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2847 gsi_remove (&gsi, true);
2848 return true;
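/* Rough source-level illustration of the replacement, for an
   unsigned char *p:

     x = p[0] | (p[1] << 8) | (p[2] << 16) | ((uint32_t) p[3] << 24);

   becomes on a big-endian target roughly

     tmp = *(uint32_t *) p;        (a single load, with the alignment
                                    handled via build_aligned_type)
     x = __builtin_bswap32 (tmp);

   while on a little-endian target only the plain load remains.  */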
2851 /* Find manual byte swap implementations as well as loads in a given
2852 endianness. Byte swaps are turned into a bswap builtin invocation
2853 while endian loads are converted to a bswap builtin invocation or a
2854 simple load according to the target endianness. */
2856 unsigned int
2857 pass_optimize_bswap::execute (function *fun)
2859 basic_block bb;
2860 bool bswap32_p, bswap64_p;
2861 bool changed = false;
2862 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2864 if (BITS_PER_UNIT != 8)
2865 return 0;
2867 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2868 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2869 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2870 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2871 || (bswap32_p && word_mode == SImode)));
2873 /* Determine the argument type of the builtins. The code later on
2874 assumes that the return and argument type are the same. */
2875 if (bswap32_p)
2877 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2878 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2881 if (bswap64_p)
2883 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2884 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2887 memset (&nop_stats, 0, sizeof (nop_stats));
2888 memset (&bswap_stats, 0, sizeof (bswap_stats));
2889 calculate_dominance_info (CDI_DOMINATORS);
2891 FOR_EACH_BB_FN (bb, fun)
2893 gimple_stmt_iterator gsi;
2895 /* We do a reverse scan for bswap patterns to make sure we get the
2896 widest match. As bswap pattern matching doesn't handle previously
2897 inserted smaller bswap replacements as sub-patterns, the wider
2898 variant wouldn't be detected. */
2899 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2901 gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
2902 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2903 enum tree_code code;
2904 struct symbolic_number n;
2905 bool bswap;
2907 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2908 might be moved to a different basic block by bswap_replace and gsi
2909 must not point to it if that's the case. Moving the gsi_prev
2910 here makes sure that gsi points to the statement previous to
2911 cur_stmt while still making sure that all statements are
2912 considered in this basic block. */
2913 gsi_prev (&gsi);
2915 if (!is_gimple_assign (cur_stmt))
2916 continue;
2918 code = gimple_assign_rhs_code (cur_stmt);
2919 switch (code)
2921 case LROTATE_EXPR:
2922 case RROTATE_EXPR:
2923 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2924 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2925 % BITS_PER_UNIT)
2926 continue;
2927 /* Fall through. */
2928 case BIT_IOR_EXPR:
2929 break;
2930 default:
2931 continue;
2934 ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2936 if (!ins_stmt)
2937 continue;
2939 switch (n.range)
2941 case 16:
2942 /* Already in canonical form, nothing to do. */
2943 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2944 continue;
2945 load_type = bswap_type = uint16_type_node;
2946 break;
2947 case 32:
2948 load_type = uint32_type_node;
2949 if (bswap32_p)
2951 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2952 bswap_type = bswap32_type;
2954 break;
2955 case 64:
2956 load_type = uint64_type_node;
2957 if (bswap64_p)
2959 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2960 bswap_type = bswap64_type;
2962 break;
2963 default:
2964 continue;
2967 if (bswap && !fndecl && n.range != 16)
2968 continue;
2970 if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type,
2971 &n, bswap))
2972 changed = true;
2976 statistics_counter_event (fun, "16-bit nop implementations found",
2977 nop_stats.found_16bit);
2978 statistics_counter_event (fun, "32-bit nop implementations found",
2979 nop_stats.found_32bit);
2980 statistics_counter_event (fun, "64-bit nop implementations found",
2981 nop_stats.found_64bit);
2982 statistics_counter_event (fun, "16-bit bswap implementations found",
2983 bswap_stats.found_16bit);
2984 statistics_counter_event (fun, "32-bit bswap implementations found",
2985 bswap_stats.found_32bit);
2986 statistics_counter_event (fun, "64-bit bswap implementations found",
2987 bswap_stats.found_64bit);
2989 return (changed ? TODO_update_ssa : 0);
2992 } // anon namespace
2994 gimple_opt_pass *
2995 make_pass_optimize_bswap (gcc::context *ctxt)
2997 return new pass_optimize_bswap (ctxt);
3000 /* Return true if stmt is a type conversion operation that can be stripped
3001 when used in a widening multiply operation. */
3002 static bool
3003 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
3005 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
3007 if (TREE_CODE (result_type) == INTEGER_TYPE)
3009 tree op_type;
3010 tree inner_op_type;
3012 if (!CONVERT_EXPR_CODE_P (rhs_code))
3013 return false;
3015 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
3017 /* If the type of OP has the same precision as the result, then
3018 we can strip this conversion. The multiply operation will be
3019 selected to create the correct extension as a by-product. */
3020 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
3021 return true;
3023 /* We can also strip a conversion if it preserves the signed-ness of
3024 the operation and doesn't narrow the range. */
3025 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
3027 /* If the inner-most type is unsigned, then we can strip any
3028 intermediate widening operation. If it's signed, then the
3029 intermediate widening operation must also be signed. */
3030 if ((TYPE_UNSIGNED (inner_op_type)
3031 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
3032 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
3033 return true;
3035 return false;
3038 return rhs_code == FIXED_CONVERT_EXPR;
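/* Rough example of the equal-precision case above: in

     short a, b;
     int p = (int) a * (int) b;

   each (int) conversion produces a value with the same precision as the
   result type, so it can be stripped and the widening multiply itself
   will provide the sign extension.  */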
3041 /* Return true if RHS is a suitable operand for a widening multiplication,
3042 assuming a target type of TYPE.
3043 There are two cases:
3045 - RHS makes some value at least twice as wide. Store that value
3046 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
3048 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
3049 but leave *TYPE_OUT untouched. */
3051 static bool
3052 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
3053 tree *new_rhs_out)
3055 gimple *stmt;
3056 tree type1, rhs1;
3058 if (TREE_CODE (rhs) == SSA_NAME)
3060 stmt = SSA_NAME_DEF_STMT (rhs);
3061 if (is_gimple_assign (stmt))
3063 if (! widening_mult_conversion_strippable_p (type, stmt))
3064 rhs1 = rhs;
3065 else
3067 rhs1 = gimple_assign_rhs1 (stmt);
3069 if (TREE_CODE (rhs1) == INTEGER_CST)
3071 *new_rhs_out = rhs1;
3072 *type_out = NULL;
3073 return true;
3077 else
3078 rhs1 = rhs;
3080 type1 = TREE_TYPE (rhs1);
3082 if (TREE_CODE (type1) != TREE_CODE (type)
3083 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
3084 return false;
3086 *new_rhs_out = rhs1;
3087 *type_out = type1;
3088 return true;
3091 if (TREE_CODE (rhs) == INTEGER_CST)
3093 *new_rhs_out = rhs;
3094 *type_out = NULL;
3095 return true;
3098 return false;
3101 /* Return true if STMT performs a widening multiplication, assuming the
3102 output type is TYPE. If so, store the unwidened types of the operands
3103 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
3104 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
3105 and *TYPE2_OUT would give the operands of the multiplication. */
3107 static bool
3108 is_widening_mult_p (gimple *stmt,
3109 tree *type1_out, tree *rhs1_out,
3110 tree *type2_out, tree *rhs2_out)
3112 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3114 if (TREE_CODE (type) != INTEGER_TYPE
3115 && TREE_CODE (type) != FIXED_POINT_TYPE)
3116 return false;
3118 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
3119 rhs1_out))
3120 return false;
3122 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
3123 rhs2_out))
3124 return false;
3126 if (*type1_out == NULL)
3128 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
3129 return false;
3130 *type1_out = *type2_out;
3133 if (*type2_out == NULL)
3135 if (!int_fits_type_p (*rhs2_out, *type1_out))
3136 return false;
3137 *type2_out = *type1_out;
3140 /* Ensure that the larger of the two operands comes first. */
3141 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
3143 std::swap (*type1_out, *type2_out);
3144 std::swap (*rhs1_out, *rhs2_out);
3147 return true;
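/* Rough example with a constant operand: for

     unsigned short a;
     int p = a * 1000;

   the first operand gives *TYPE1_OUT = unsigned short, and since the
   constant 1000 fits that type *TYPE2_OUT is set to unsigned short as
   well, so the statement qualifies as a widening multiplication.  */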
3150 /* Check to see if the CALL statement is an invocation of copysign
3151 with 1.0 as its first argument. */
3152 static bool
3153 is_copysign_call_with_1 (gimple *call)
3155 gcall *c = dyn_cast <gcall *> (call);
3156 if (! c)
3157 return false;
3159 enum combined_fn code = gimple_call_combined_fn (c);
3161 if (code == CFN_LAST)
3162 return false;
3164 if (builtin_fn_p (code))
3166 switch (as_builtin_fn (code))
3168 CASE_FLT_FN (BUILT_IN_COPYSIGN):
3169 CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
3170 return real_onep (gimple_call_arg (c, 0));
3171 default:
3172 return false;
3176 if (internal_fn_p (code))
3178 switch (as_internal_fn (code))
3180 case IFN_COPYSIGN:
3181 return real_onep (gimple_call_arg (c, 0));
3182 default:
3183 return false;
3187 return false;
3190 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
3191 This only happens when the xorsign optab is defined. If the
3192 pattern is not an xorsign pattern or if expansion fails, FALSE is
3193 returned; otherwise TRUE is returned. */
3194 static bool
3195 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
3197 tree treeop0, treeop1, lhs, type;
3198 location_t loc = gimple_location (stmt);
3199 lhs = gimple_assign_lhs (stmt);
3200 treeop0 = gimple_assign_rhs1 (stmt);
3201 treeop1 = gimple_assign_rhs2 (stmt);
3202 type = TREE_TYPE (lhs);
3203 machine_mode mode = TYPE_MODE (type);
3205 if (HONOR_SNANS (type))
3206 return false;
3208 if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
3210 gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
3211 if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
3213 call0 = SSA_NAME_DEF_STMT (treeop1);
3214 if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
3215 return false;
3217 treeop1 = treeop0;
3219 if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
3220 return false;
3222 gcall *c = as_a<gcall*> (call0);
3223 treeop0 = gimple_call_arg (c, 1);
3225 gcall *call_stmt
3226 = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
3227 gimple_set_lhs (call_stmt, lhs);
3228 gimple_set_location (call_stmt, loc);
3229 gsi_replace (gsi, call_stmt, true);
3230 return true;
3233 return false;
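/* Rough illustration: when the xorsign optab is available,

     double r = x * __builtin_copysign (1.0, y);

   is rewritten into the internal call r = .XORSIGN (x, y), which
   conceptually just XORs the sign bit of Y into X instead of performing
   a multiplication.  */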
3236 /* Process a single gimple statement STMT, which has a MULT_EXPR as
3237 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
3238 value is true iff we converted the statement. */
3240 static bool
3241 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
3243 tree lhs, rhs1, rhs2, type, type1, type2;
3244 enum insn_code handler;
3245 machine_mode to_mode, from_mode, actual_mode;
3246 optab op;
3247 int actual_precision;
3248 location_t loc = gimple_location (stmt);
3249 bool from_unsigned1, from_unsigned2;
3251 lhs = gimple_assign_lhs (stmt);
3252 type = TREE_TYPE (lhs);
3253 if (TREE_CODE (type) != INTEGER_TYPE)
3254 return false;
3256 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3257 return false;
3259 to_mode = SCALAR_INT_TYPE_MODE (type);
3260 from_mode = SCALAR_INT_TYPE_MODE (type1);
3261 from_unsigned1 = TYPE_UNSIGNED (type1);
3262 from_unsigned2 = TYPE_UNSIGNED (type2);
3264 if (from_unsigned1 && from_unsigned2)
3265 op = umul_widen_optab;
3266 else if (!from_unsigned1 && !from_unsigned2)
3267 op = smul_widen_optab;
3268 else
3269 op = usmul_widen_optab;
3271 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3272 0, &actual_mode);
3274 if (handler == CODE_FOR_nothing)
3276 if (op != smul_widen_optab)
3278 /* We can use a signed multiply with unsigned types as long as
3279 there is a wider mode to use, or it is the smaller of the two
3280 types that is unsigned. Note that type1 >= type2, always. */
3281 if ((TYPE_UNSIGNED (type1)
3282 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3283 || (TYPE_UNSIGNED (type2)
3284 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3286 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
3287 || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3288 return false;
3291 op = smul_widen_optab;
3292 handler = find_widening_optab_handler_and_mode (op, to_mode,
3293 from_mode, 0,
3294 &actual_mode);
3296 if (handler == CODE_FOR_nothing)
3297 return false;
3299 from_unsigned1 = from_unsigned2 = false;
3301 else
3302 return false;
3305 /* Ensure that the inputs to the handler are in the correct precision
3306 for the opcode. This will be the full mode size. */
3307 actual_precision = GET_MODE_PRECISION (actual_mode);
3308 if (2 * actual_precision > TYPE_PRECISION (type))
3309 return false;
3310 if (actual_precision != TYPE_PRECISION (type1)
3311 || from_unsigned1 != TYPE_UNSIGNED (type1))
3312 rhs1 = build_and_insert_cast (gsi, loc,
3313 build_nonstandard_integer_type
3314 (actual_precision, from_unsigned1), rhs1);
3315 if (actual_precision != TYPE_PRECISION (type2)
3316 || from_unsigned2 != TYPE_UNSIGNED (type2))
3317 rhs2 = build_and_insert_cast (gsi, loc,
3318 build_nonstandard_integer_type
3319 (actual_precision, from_unsigned2), rhs2);
3321 /* Handle constants. */
3322 if (TREE_CODE (rhs1) == INTEGER_CST)
3323 rhs1 = fold_convert (type1, rhs1);
3324 if (TREE_CODE (rhs2) == INTEGER_CST)
3325 rhs2 = fold_convert (type2, rhs2);
3327 gimple_assign_set_rhs1 (stmt, rhs1);
3328 gimple_assign_set_rhs2 (stmt, rhs2);
3329 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3330 update_stmt (stmt);
3331 widen_mul_stats.widen_mults_inserted++;
3332 return true;
3335 /* Process a single gimple statement STMT, which is found at the
3336 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3337 rhs (given by CODE), and try to convert it into a
3338 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3339 is true iff we converted the statement. */
3341 static bool
3342 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3343 enum tree_code code)
3345 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3346 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3347 tree type, type1, type2, optype;
3348 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3349 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3350 optab this_optab;
3351 enum tree_code wmult_code;
3352 enum insn_code handler;
3353 scalar_mode to_mode, from_mode;
3354 machine_mode actual_mode;
3355 location_t loc = gimple_location (stmt);
3356 int actual_precision;
3357 bool from_unsigned1, from_unsigned2;
3359 lhs = gimple_assign_lhs (stmt);
3360 type = TREE_TYPE (lhs);
3361 if (TREE_CODE (type) != INTEGER_TYPE
3362 && TREE_CODE (type) != FIXED_POINT_TYPE)
3363 return false;
3365 if (code == MINUS_EXPR)
3366 wmult_code = WIDEN_MULT_MINUS_EXPR;
3367 else
3368 wmult_code = WIDEN_MULT_PLUS_EXPR;
3370 rhs1 = gimple_assign_rhs1 (stmt);
3371 rhs2 = gimple_assign_rhs2 (stmt);
3373 if (TREE_CODE (rhs1) == SSA_NAME)
3375 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3376 if (is_gimple_assign (rhs1_stmt))
3377 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3380 if (TREE_CODE (rhs2) == SSA_NAME)
3382 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3383 if (is_gimple_assign (rhs2_stmt))
3384 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3387 /* Allow for one conversion statement between the multiply
3388 and addition/subtraction statement. If there is more than
3389 one conversion then we assume they would invalidate this
3390 transformation. If that's not the case then they should have
3391 been folded before now. */
3392 if (CONVERT_EXPR_CODE_P (rhs1_code))
3394 conv1_stmt = rhs1_stmt;
3395 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3396 if (TREE_CODE (rhs1) == SSA_NAME)
3398 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3399 if (is_gimple_assign (rhs1_stmt))
3400 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3402 else
3403 return false;
3405 if (CONVERT_EXPR_CODE_P (rhs2_code))
3407 conv2_stmt = rhs2_stmt;
3408 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3409 if (TREE_CODE (rhs2) == SSA_NAME)
3411 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3412 if (is_gimple_assign (rhs2_stmt))
3413 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3415 else
3416 return false;
3419 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3420 is_widening_mult_p, but we still need the returned rhs values.
3422 It might also appear that it would be sufficient to use the existing
3423 operands of the widening multiply, but that would limit the choice of
3424 multiply-and-accumulate instructions.
3426 If the widened-multiplication result has more than one use, it is
3427 probably wiser not to do the conversion. */
3428 if (code == PLUS_EXPR
3429 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3431 if (!has_single_use (rhs1)
3432 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3433 &type2, &mult_rhs2))
3434 return false;
3435 add_rhs = rhs2;
3436 conv_stmt = conv1_stmt;
3438 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3440 if (!has_single_use (rhs2)
3441 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3442 &type2, &mult_rhs2))
3443 return false;
3444 add_rhs = rhs1;
3445 conv_stmt = conv2_stmt;
3447 else
3448 return false;
3450 to_mode = SCALAR_TYPE_MODE (type);
3451 from_mode = SCALAR_TYPE_MODE (type1);
3452 from_unsigned1 = TYPE_UNSIGNED (type1);
3453 from_unsigned2 = TYPE_UNSIGNED (type2);
3454 optype = type1;
3456 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3457 if (from_unsigned1 != from_unsigned2)
3459 if (!INTEGRAL_TYPE_P (type))
3460 return false;
3461 /* We can use a signed multiply with unsigned types as long as
3462 there is a wider mode to use, or it is the smaller of the two
3463 types that is unsigned. Note that type1 >= type2, always. */
3464 if ((from_unsigned1
3465 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3466 || (from_unsigned2
3467 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3469 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
3470 || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3471 return false;
3474 from_unsigned1 = from_unsigned2 = false;
3475 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3476 false);
3479 /* If there was a conversion between the multiply and addition
3480 then we need to make sure it fits a multiply-and-accumulate.
3481 There should be a single mode change which does not change the
3482 value. */
3483 if (conv_stmt)
3485 /* We use the original, unmodified data types for this. */
3486 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3487 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3488 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3489 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3491 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3493 /* Conversion is a truncate. */
3494 if (TYPE_PRECISION (to_type) < data_size)
3495 return false;
3497 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3499 /* Conversion is an extend. Check it's the right sort. */
3500 if (TYPE_UNSIGNED (from_type) != is_unsigned
3501 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3502 return false;
3504 /* else convert is a no-op for our purposes. */
3507 /* Verify that the machine can perform a widening multiply
3508 accumulate in this mode/signedness combination, otherwise
3509 this transformation is likely to pessimize code. */
3510 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3511 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3512 from_mode, 0, &actual_mode);
3514 if (handler == CODE_FOR_nothing)
3515 return false;
3517 /* Ensure that the inputs to the handler are in the correct precision
3518 for the opcode. This will be the full mode size. */
3519 actual_precision = GET_MODE_PRECISION (actual_mode);
3520 if (actual_precision != TYPE_PRECISION (type1)
3521 || from_unsigned1 != TYPE_UNSIGNED (type1))
3522 mult_rhs1 = build_and_insert_cast (gsi, loc,
3523 build_nonstandard_integer_type
3524 (actual_precision, from_unsigned1),
3525 mult_rhs1);
3526 if (actual_precision != TYPE_PRECISION (type2)
3527 || from_unsigned2 != TYPE_UNSIGNED (type2))
3528 mult_rhs2 = build_and_insert_cast (gsi, loc,
3529 build_nonstandard_integer_type
3530 (actual_precision, from_unsigned2),
3531 mult_rhs2);
3533 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3534 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3536 /* Handle constants. */
3537 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3538 mult_rhs1 = fold_convert (type1, mult_rhs1);
3539 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3540 mult_rhs2 = fold_convert (type2, mult_rhs2);
3542 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3543 add_rhs);
3544 update_stmt (gsi_stmt (*gsi));
3545 widen_mul_stats.maccs_inserted++;
3546 return true;
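/* Rough illustration: with

     short a, b;
     int acc;
     acc = acc + a * b;

   the multiplication is recognized as a widening multiply of A and B and,
   if the target provides a multiply-and-accumulate pattern for this mode
   combination, the addition is rewritten to
   acc = WIDEN_MULT_PLUS_EXPR <a, b, acc>.  */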
3549 /* Combine the multiplication at MUL_STMT with operands OP1 and OP2
3550 with uses in additions and subtractions to form fused multiply-add
3551 operations. Returns true if successful and MUL_STMT should be removed. */
3553 static bool
3554 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3556 tree mul_result = gimple_get_lhs (mul_stmt);
3557 tree type = TREE_TYPE (mul_result);
3558 gimple *use_stmt, *neguse_stmt;
3559 gassign *fma_stmt;
3560 use_operand_p use_p;
3561 imm_use_iterator imm_iter;
3563 if (FLOAT_TYPE_P (type)
3564 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3565 return false;
3567 /* We don't want to do bitfield reduction ops. */
3568 if (INTEGRAL_TYPE_P (type)
3569 && !type_has_mode_precision_p (type))
3570 return false;
3572 /* If the target doesn't support it, don't generate it. We assume that
3573 if fma isn't available then fms, fnma or fnms are not either. */
3574 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3575 return false;
3577 /* If the multiplication has zero uses, it is kept around probably because
3578 of -fnon-call-exceptions. Don't optimize it away in that case,
3579 it is DCE's job. */
3580 if (has_zero_uses (mul_result))
3581 return false;
3583 /* Make sure that the multiplication statement becomes dead after
3584 the transformation, so that all uses are transformed to FMAs.
3585 This means we assume that an FMA operation has the same cost
3586 as an addition. */
3587 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3589 enum tree_code use_code;
3590 tree result = mul_result;
3591 bool negate_p = false;
3593 use_stmt = USE_STMT (use_p);
3595 if (is_gimple_debug (use_stmt))
3596 continue;
3598 /* For now restrict these operations to single basic blocks. In theory
3599 we would want to support sinking the multiplication in
3600 m = a*b;
3601 if ()
3602 ma = m + c;
3603 else
3604 d = m;
3605 to form a fma in the then block and sink the multiplication to the
3606 else block. */
3607 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3608 return false;
3610 if (!is_gimple_assign (use_stmt))
3611 return false;
3613 use_code = gimple_assign_rhs_code (use_stmt);
3615 /* A negate on the multiplication leads to FNMA. */
3616 if (use_code == NEGATE_EXPR)
3618 ssa_op_iter iter;
3619 use_operand_p usep;
3621 result = gimple_assign_lhs (use_stmt);
3623 /* Make sure the negate statement becomes dead with this
3624 single transformation. */
3625 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3626 &use_p, &neguse_stmt))
3627 return false;
3629 /* Make sure the multiplication isn't also used on that stmt. */
3630 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3631 if (USE_FROM_PTR (usep) == mul_result)
3632 return false;
3634 /* Re-validate. */
3635 use_stmt = neguse_stmt;
3636 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3637 return false;
3638 if (!is_gimple_assign (use_stmt))
3639 return false;
3641 use_code = gimple_assign_rhs_code (use_stmt);
3642 negate_p = true;
3645 switch (use_code)
3647 case MINUS_EXPR:
3648 if (gimple_assign_rhs2 (use_stmt) == result)
3649 negate_p = !negate_p;
3650 break;
3651 case PLUS_EXPR:
3652 break;
3653 default:
3654 /* FMA can only be formed from PLUS and MINUS. */
3655 return false;
3658 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3659 by a MULT_EXPR that we'll visit later, we might be able to
3660 get a more profitable match with fnma.
3661 OTOH, if we don't, a negate / fma pair has likely lower latency
3662 than a mult / subtract pair. */
3663 if (use_code == MINUS_EXPR && !negate_p
3664 && gimple_assign_rhs1 (use_stmt) == result
3665 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3666 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3668 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3670 if (TREE_CODE (rhs2) == SSA_NAME)
3672 gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3673 if (has_single_use (rhs2)
3674 && is_gimple_assign (stmt2)
3675 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3676 return false;
3680 /* We can't handle a * b + a * b. */
3681 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3682 return false;
3684 /* While it is possible to validate whether or not the exact form
3685 that we've recognized is available in the backend, the assumption
3686 is that the transformation is never a loss. For instance, suppose
3687 the target only has the plain FMA pattern available. Consider
3688 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3689 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3690 still have 3 operations, but in the FMA form the two NEGs are
3691 independent and could be run in parallel. */
3694 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3696 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3697 enum tree_code use_code;
3698 tree addop, mulop1 = op1, result = mul_result;
3699 bool negate_p = false;
3701 if (is_gimple_debug (use_stmt))
3702 continue;
3704 use_code = gimple_assign_rhs_code (use_stmt);
3705 if (use_code == NEGATE_EXPR)
3707 result = gimple_assign_lhs (use_stmt);
3708 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3709 gsi_remove (&gsi, true);
3710 release_defs (use_stmt);
3712 use_stmt = neguse_stmt;
3713 gsi = gsi_for_stmt (use_stmt);
3714 use_code = gimple_assign_rhs_code (use_stmt);
3715 negate_p = true;
3718 if (gimple_assign_rhs1 (use_stmt) == result)
3720 addop = gimple_assign_rhs2 (use_stmt);
3721 /* a * b - c -> a * b + (-c) */
3722 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3723 addop = force_gimple_operand_gsi (&gsi,
3724 build1 (NEGATE_EXPR,
3725 type, addop),
3726 true, NULL_TREE, true,
3727 GSI_SAME_STMT);
3729 else
3731 addop = gimple_assign_rhs1 (use_stmt);
3732 /* a - b * c -> (-b) * c + a */
3733 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3734 negate_p = !negate_p;
3737 if (negate_p)
3738 mulop1 = force_gimple_operand_gsi (&gsi,
3739 build1 (NEGATE_EXPR,
3740 type, mulop1),
3741 true, NULL_TREE, true,
3742 GSI_SAME_STMT);
3744 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3745 FMA_EXPR, mulop1, op2, addop);
3746 gsi_replace (&gsi, fma_stmt, true);
3747 widen_mul_stats.fmas_inserted++;
3750 return true;
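/* Rough illustration (e.g. with -ffp-contract=fast and an fma pattern):

     double d = a * b + c;   ->   d = FMA_EXPR <a, b, c>;
     double e = a * b - c;   ->   e = FMA_EXPR <a, b, -c>;

   after which the original multiplication is dead and can be removed by
   the caller.  */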
3754 /* Helper function of match_uaddsub_overflow. Return 1
3755 if USE_STMT is an unsigned overflow check ovf != 0 for
3756 STMT, -1 if USE_STMT is an unsigned overflow check ovf == 0,
3757 and 0 otherwise. */
3759 static int
3760 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3762 enum tree_code ccode = ERROR_MARK;
3763 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3764 if (gimple_code (use_stmt) == GIMPLE_COND)
3766 ccode = gimple_cond_code (use_stmt);
3767 crhs1 = gimple_cond_lhs (use_stmt);
3768 crhs2 = gimple_cond_rhs (use_stmt);
3770 else if (is_gimple_assign (use_stmt))
3772 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3774 ccode = gimple_assign_rhs_code (use_stmt);
3775 crhs1 = gimple_assign_rhs1 (use_stmt);
3776 crhs2 = gimple_assign_rhs2 (use_stmt);
3778 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3780 tree cond = gimple_assign_rhs1 (use_stmt);
3781 if (COMPARISON_CLASS_P (cond))
3783 ccode = TREE_CODE (cond);
3784 crhs1 = TREE_OPERAND (cond, 0);
3785 crhs2 = TREE_OPERAND (cond, 1);
3787 else
3788 return 0;
3790 else
3791 return 0;
3793 else
3794 return 0;
3796 if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3797 return 0;
3799 enum tree_code code = gimple_assign_rhs_code (stmt);
3800 tree lhs = gimple_assign_lhs (stmt);
3801 tree rhs1 = gimple_assign_rhs1 (stmt);
3802 tree rhs2 = gimple_assign_rhs2 (stmt);
3804 switch (ccode)
3806 case GT_EXPR:
3807 case LE_EXPR:
3808 /* r = a - b; r > a or r <= a
3809 r = a + b; a > r or a <= r or b > r or b <= r. */
3810 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3811 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3812 && crhs2 == lhs))
3813 return ccode == GT_EXPR ? 1 : -1;
3814 break;
3815 case LT_EXPR:
3816 case GE_EXPR:
3817 /* r = a - b; a < r or a >= r
3818 r = a + b; r < a or r >= a or r < b or r >= b. */
3819 if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3820 || (code == PLUS_EXPR && crhs1 == lhs
3821 && (crhs2 == rhs1 || crhs2 == rhs2)))
3822 return ccode == LT_EXPR ? 1 : -1;
3823 break;
3824 default:
3825 break;
3827 return 0;
3830 /* Recognize for unsigned x
3831 x = y - z;
3832 if (x > y)
3833 where there are other uses of x and replace it with
3834 _7 = SUB_OVERFLOW (y, z);
3835 x = REALPART_EXPR <_7>;
3836 _8 = IMAGPART_EXPR <_7>;
3837 if (_8)
3838 and similarly for addition. */
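/* As an illustrative sketch only (function and variable names are
   invented, not from the testsuite), source code such as

     unsigned
     checked_sub (unsigned y, unsigned z, int *ovf)
     {
       unsigned x = y - z;
       *ovf = x > y;
       return x;
     }

   has both an overflow-check use and an ordinary use of x, which is
   the situation targeted below when the usubv4_optab (or uaddv4_optab
   for addition) check succeeds.  */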
3840 static bool
3841 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3842 enum tree_code code)
3844 tree lhs = gimple_assign_lhs (stmt);
3845 tree type = TREE_TYPE (lhs);
3846 use_operand_p use_p;
3847 imm_use_iterator iter;
3848 bool use_seen = false;
3849 bool ovf_use_seen = false;
3850 gimple *use_stmt;
3852 gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3853 if (!INTEGRAL_TYPE_P (type)
3854 || !TYPE_UNSIGNED (type)
3855 || has_zero_uses (lhs)
3856 || has_single_use (lhs)
3857 || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3858 TYPE_MODE (type)) == CODE_FOR_nothing)
3859 return false;
3861 FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3863 use_stmt = USE_STMT (use_p);
3864 if (is_gimple_debug (use_stmt))
3865 continue;
3867 if (uaddsub_overflow_check_p (stmt, use_stmt))
3868 ovf_use_seen = true;
3869 else
3870 use_seen = true;
3871 if (ovf_use_seen && use_seen)
3872 break;
3875 if (!ovf_use_seen || !use_seen)
3876 return false;
3878 tree ctype = build_complex_type (type);
3879 tree rhs1 = gimple_assign_rhs1 (stmt);
3880 tree rhs2 = gimple_assign_rhs2 (stmt);
3881 gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3882 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3883 2, rhs1, rhs2);
3884 tree ctmp = make_ssa_name (ctype);
3885 gimple_call_set_lhs (g, ctmp);
3886 gsi_insert_before (gsi, g, GSI_SAME_STMT);
3887 gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3888 build1 (REALPART_EXPR, type, ctmp));
3889 gsi_replace (gsi, g2, true);
3890 tree ovf = make_ssa_name (type);
3891 g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3892 build1 (IMAGPART_EXPR, type, ctmp));
3893 gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3895 FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3897 if (is_gimple_debug (use_stmt))
3898 continue;
3900 int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3901 if (ovf_use == 0)
3902 continue;
3903 if (gimple_code (use_stmt) == GIMPLE_COND)
3905 gcond *cond_stmt = as_a <gcond *> (use_stmt);
3906 gimple_cond_set_lhs (cond_stmt, ovf);
3907 gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3908 gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3910 else
3912 gcc_checking_assert (is_gimple_assign (use_stmt));
3913 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3915 gimple_assign_set_rhs1 (use_stmt, ovf);
3916 gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3917 gimple_assign_set_rhs_code (use_stmt,
3918 ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3920 else
3922 gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3923 == COND_EXPR);
3924 tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3925 boolean_type_node, ovf,
3926 build_int_cst (type, 0));
3927 gimple_assign_set_rhs1 (use_stmt, cond);
3930 update_stmt (use_stmt);
3932 return true;
3935 /* Return true if target has support for divmod. */
3937 static bool
3938 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3940 /* If target supports hardware divmod insn, use it for divmod. */
3941 if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3942 return true;
3944 /* Check if libfunc for divmod is available. */
3945 rtx libfunc = optab_libfunc (divmod_optab, mode);
3946 if (libfunc != NULL_RTX)
3948 /* If an optab_handler exists for div_optab, perhaps in a wider mode,
3949 we don't want to use the libfunc even if it exists for the given mode. */
3950 machine_mode div_mode;
3951 FOR_EACH_MODE_FROM (div_mode, mode)
3952 if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3953 return false;
3955 return targetm.expand_divmod_libfunc != NULL;
3958 return false;
3961 /* Check if stmt is candidate for divmod transform. */
3963 static bool
3964 divmod_candidate_p (gassign *stmt)
3966 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3967 machine_mode mode = TYPE_MODE (type);
3968 optab divmod_optab, div_optab;
3970 if (TYPE_UNSIGNED (type))
3972 divmod_optab = udivmod_optab;
3973 div_optab = udiv_optab;
3975 else
3977 divmod_optab = sdivmod_optab;
3978 div_optab = sdiv_optab;
3981 tree op1 = gimple_assign_rhs1 (stmt);
3982 tree op2 = gimple_assign_rhs2 (stmt);
3984 /* Disable the transform if either operand is a constant, since division-by-constant
3985 may have specialized expansion. */
3986 if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3987 return false;
3989 /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3990 expand using the [su]divv optabs. */
3991 if (TYPE_OVERFLOW_TRAPS (type))
3992 return false;
3994 if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3995 return false;
3997 return true;
4000 /* This function looks for:
4001 t1 = a TRUNC_DIV_EXPR b;
4002 t2 = a TRUNC_MOD_EXPR b;
4003 and transforms it to the following sequence:
4004 complex_tmp = DIVMOD (a, b);
4005 t1 = REALPART_EXPR(complex_tmp);
4006 t2 = IMAGPART_EXPR(complex_tmp);
4007 For conditions enabling the transform see divmod_candidate_p().
4009 The pass has three parts:
4010 1) Find top_stmt which is trunc_div or trunc_mod stmt and dominates all
4011 other trunc_div_expr and trunc_mod_expr stmts.
4012 2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
4013 to stmts vector.
4014 3) Insert DIVMOD call just before top_stmt and update entries in
4015 stmts vector to use return value of DIVMOD (REALPART_EXPR for div,
4016 IMAGPART_EXPR for mod). */
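/* Illustrative sketch only (names invented): a fragment such as

     void
     split (unsigned a, unsigned b, unsigned *q, unsigned *r)
     {
       *q = a / b;
       *r = a % b;
     }

   contains a TRUNC_DIV_EXPR and a TRUNC_MOD_EXPR with identical
   operands, which this function merges into a single DIVMOD internal
   call when divmod_candidate_p holds.  */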
4018 static bool
4019 convert_to_divmod (gassign *stmt)
4021 if (stmt_can_throw_internal (stmt)
4022 || !divmod_candidate_p (stmt))
4023 return false;
4025 tree op1 = gimple_assign_rhs1 (stmt);
4026 tree op2 = gimple_assign_rhs2 (stmt);
4028 imm_use_iterator use_iter;
4029 gimple *use_stmt;
4030 auto_vec<gimple *> stmts;
4032 gimple *top_stmt = stmt;
4033 basic_block top_bb = gimple_bb (stmt);
4035 /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
4036 at least stmt and possibly other trunc_div/trunc_mod stmts
4037 having the same operands as stmt. */
4039 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
4041 if (is_gimple_assign (use_stmt)
4042 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4043 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4044 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
4045 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
4047 if (stmt_can_throw_internal (use_stmt))
4048 continue;
4050 basic_block bb = gimple_bb (use_stmt);
4052 if (bb == top_bb)
4054 if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
4055 top_stmt = use_stmt;
4057 else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
4059 top_bb = bb;
4060 top_stmt = use_stmt;
4065 tree top_op1 = gimple_assign_rhs1 (top_stmt);
4066 tree top_op2 = gimple_assign_rhs2 (top_stmt);
4068 stmts.safe_push (top_stmt);
4069 bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
4071 /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
4072 to stmts vector. The 2nd loop will always add stmt to stmts vector, since
4073 gimple_bb (top_stmt) dominates gimple_bb (stmt), so it ends up adding
4074 at least a single trunc_mod_expr stmt. */
4076 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
4078 if (is_gimple_assign (use_stmt)
4079 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4080 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4081 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
4082 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
4084 if (use_stmt == top_stmt
4085 || stmt_can_throw_internal (use_stmt)
4086 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
4087 continue;
4089 stmts.safe_push (use_stmt);
4090 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
4091 div_seen = true;
4095 if (!div_seen)
4096 return false;
4098 /* Part 3: Create libcall to internal fn DIVMOD:
4099 divmod_tmp = DIVMOD (op1, op2). */
4101 gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
4102 tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
4103 call_stmt, "divmod_tmp");
4104 gimple_call_set_lhs (call_stmt, res);
4105 /* We rejected throwing statements above. */
4106 gimple_call_set_nothrow (call_stmt, true);
4108 /* Insert the call before top_stmt. */
4109 gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
4110 gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
4112 widen_mul_stats.divmod_calls_inserted++;
4114 /* Update all statements in stmts vector:
4115 lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
4116 lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>. */
4118 for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
4120 tree new_rhs;
4122 switch (gimple_assign_rhs_code (use_stmt))
4124 case TRUNC_DIV_EXPR:
4125 new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
4126 break;
4128 case TRUNC_MOD_EXPR:
4129 new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
4130 break;
4132 default:
4133 gcc_unreachable ();
4136 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
4137 gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
4138 update_stmt (use_stmt);
4141 return true;
4144 /* Find integer multiplications where the operands are extended from
4145 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
4146 where appropriate. */
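/* Illustrative example only (not from the sources): assuming 32-bit int
   and 64-bit long long, a multiplication written as

     long long
     wmul (int a, int b)
     {
       return (long long) a * b;
     }

   has both operands extended from a narrower type and is the shape that
   convert_mult_to_widen rewrites to a WIDEN_MULT_EXPR when the target
   provides a suitable widening multiply pattern.  */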
4148 namespace {
4150 const pass_data pass_data_optimize_widening_mul =
4152 GIMPLE_PASS, /* type */
4153 "widening_mul", /* name */
4154 OPTGROUP_NONE, /* optinfo_flags */
4155 TV_NONE, /* tv_id */
4156 PROP_ssa, /* properties_required */
4157 0, /* properties_provided */
4158 0, /* properties_destroyed */
4159 0, /* todo_flags_start */
4160 TODO_update_ssa, /* todo_flags_finish */
4163 class pass_optimize_widening_mul : public gimple_opt_pass
4165 public:
4166 pass_optimize_widening_mul (gcc::context *ctxt)
4167 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
4170 /* opt_pass methods: */
4171 virtual bool gate (function *)
4173 return flag_expensive_optimizations && optimize;
4176 virtual unsigned int execute (function *);
4178 }; // class pass_optimize_widening_mul
4180 unsigned int
4181 pass_optimize_widening_mul::execute (function *fun)
4183 basic_block bb;
4184 bool cfg_changed = false;
4186 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
4187 calculate_dominance_info (CDI_DOMINATORS);
4188 renumber_gimple_stmt_uids ();
4190 FOR_EACH_BB_FN (bb, fun)
4192 gimple_stmt_iterator gsi;
4194 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
4196 gimple *stmt = gsi_stmt (gsi);
4197 enum tree_code code;
4199 if (is_gimple_assign (stmt))
4201 code = gimple_assign_rhs_code (stmt);
4202 switch (code)
4204 case MULT_EXPR:
4205 if (!convert_mult_to_widen (stmt, &gsi)
4206 && !convert_expand_mult_copysign (stmt, &gsi)
4207 && convert_mult_to_fma (stmt,
4208 gimple_assign_rhs1 (stmt),
4209 gimple_assign_rhs2 (stmt)))
4211 gsi_remove (&gsi, true);
4212 release_defs (stmt);
4213 continue;
4215 break;
4217 case PLUS_EXPR:
4218 case MINUS_EXPR:
4219 if (!convert_plusminus_to_widen (&gsi, stmt, code))
4220 match_uaddsub_overflow (&gsi, stmt, code);
4221 break;
4223 case TRUNC_MOD_EXPR:
4224 convert_to_divmod (as_a<gassign *> (stmt));
4225 break;
4227 default:;
4230 else if (is_gimple_call (stmt)
4231 && gimple_call_lhs (stmt))
4233 tree fndecl = gimple_call_fndecl (stmt);
4234 if (fndecl
4235 && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
4237 switch (DECL_FUNCTION_CODE (fndecl))
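/* Illustrative note: a call such as  pow (a, 2.0)  is treated as
   a * a here, so that a use like  pow (a, 2.0) + c  can be fused
   into  FMA (a, a, c)  by convert_mult_to_fma.  */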
4239 case BUILT_IN_POWF:
4240 case BUILT_IN_POW:
4241 case BUILT_IN_POWL:
4242 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
4243 && real_equal
4244 (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
4245 &dconst2)
4246 && convert_mult_to_fma (stmt,
4247 gimple_call_arg (stmt, 0),
4248 gimple_call_arg (stmt, 0)))
4250 unlink_stmt_vdef (stmt);
4251 if (gsi_remove (&gsi, true)
4252 && gimple_purge_dead_eh_edges (bb))
4253 cfg_changed = true;
4254 release_defs (stmt);
4255 continue;
4257 break;
4259 default:;
4263 gsi_next (&gsi);
4267 statistics_counter_event (fun, "widening multiplications inserted",
4268 widen_mul_stats.widen_mults_inserted);
4269 statistics_counter_event (fun, "widening maccs inserted",
4270 widen_mul_stats.maccs_inserted);
4271 statistics_counter_event (fun, "fused multiply-adds inserted",
4272 widen_mul_stats.fmas_inserted);
4273 statistics_counter_event (fun, "divmod calls inserted",
4274 widen_mul_stats.divmod_calls_inserted);
4276 return cfg_changed ? TODO_cleanup_cfg : 0;
4279 } // anon namespace
4281 gimple_opt_pass *
4282 make_pass_optimize_widening_mul (gcc::context *ctxt)
4284 return new pass_optimize_widening_mul (ctxt);