/* Global, SSA-based optimizations using mathematical identities.
   Copyright (C) 2005-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* Currently, the first mini-pass in this file tries to CSE reciprocal
   operations.  These are common in sequences such as this one:

	modulus = sqrt(x*x + y*y + z*z);
	x = x / modulus;
	y = y / modulus;
	z = z / modulus;

   that can be optimized to

	modulus = sqrt(x*x + y*y + z*z);
	rmodulus = 1.0 / modulus;
	x = x * rmodulus;
	y = y * rmodulus;
	z = z * rmodulus;

   We do this for loop invariant divisors, and with this pass whenever
   we notice that a division has the same divisor multiple times.

   Of course, like in PRE, we don't insert a division if a dominator
   already has one.  However, this cannot be done as an extension of
   PRE for several reasons.

   First of all, experiments showed that the transformation is not
   always useful if there are only two divisions by the same divisor.
   This is probably because modern processors can pipeline the
   divisions; on older, in-order processors it should still be
   effective to optimize two divisions by the same number.
   We make this a param, and it shall be called N in the remainder of
   this comment.

   Second, if trapping math is active, we have less freedom on where
   to insert divisions: we can only do so in basic blocks that already
   contain one.  (If divisions don't trap, instead, we can insert
   divisions elsewhere, which will be in blocks that are common dominators
   of those that have the division).

   We really don't want to compute the reciprocal unless a division will
   be found.  To do this, we won't insert the division in a basic block
   that has less than N divisions *post-dominating* it.

   The algorithm constructs a subset of the dominator tree, holding the
   blocks containing the divisions and the common dominators to them,
   and walks it twice.  The first walk is in post-order, and it annotates
   each block with the number of divisions that post-dominate it: this
   gives information on where divisions can be inserted profitably.
   The second walk is in pre-order, and it inserts divisions as explained
   above, and replaces divisions by multiplications.

   In the best case, the cost of the pass is O(n_statements).  In the
   worst-case, the cost is due to creating the dominator tree subset,
   with a cost of O(n_basic_blocks ^ 2); however this can only happen
   for n_statements / n_basic_blocks statements.  So, the amortized cost
   of creating the dominator tree subset is O(n_basic_blocks) and the
   worst-case cost of the pass is O(n_statements * n_basic_blocks).

   More practically, the cost will be small because there are few
   divisions, and they tend to be in the same basic block, so insert_bb
   is called very few times.

   If we did this using domwalk.c, an efficient implementation would have
   to work on all the variables in a single pass, because we could not
   work on just a subset of the dominator tree, as we do now, and the
   cost would also be something like O(n_statements * n_basic_blocks).
   The data structures would be more complex in order to work on all the
   variables in a single pass.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "alloc-pool.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "gimple-pretty-print.h"
#include "alias.h"
#include "fold-const.h"
#include "gimple-fold.h"
#include "gimple-iterator.h"
#include "gimplify.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
#include "builtins.h"
#include "internal-fn.h"
#include "case-cfn-macros.h"
#include "optabs-libfuncs.h"
#include "tree-eh.h"
#include "targhooks.h"
#include "domwalk.h"
/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  */
struct occurrence {
  /* The basic block represented by this structure.  */
  basic_block bb = basic_block();

  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
     inserted in BB.  */
  tree recip_def = tree();

  /* If non-NULL, the SSA_NAME holding the definition for a squared
     reciprocal inserted in BB.  */
  tree square_recip_def = tree();

  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
     was inserted in BB.  */
  gimple *recip_def_stmt = nullptr;

  /* Pointer to a list of "struct occurrence"s for blocks dominated
     by BB.  */
  struct occurrence *children = nullptr;

  /* Pointer to the next "struct occurrence"s in the list of blocks
     sharing a common dominator.  */
  struct occurrence *next = nullptr;

  /* The number of divisions that are in BB before compute_merit.  The
     number of divisions that are in BB or post-dominate it after
     compute_merit.  */
  int num_divisions = 0;

  /* True if the basic block has a division, false if it is a common
     dominator for basic blocks that do.  If it is false and trapping
     math is active, BB is not a candidate for inserting a reciprocal.  */
  bool bb_has_division = false;

  /* Construct a struct occurrence for basic block BB, and whose
     children list is headed by CHILDREN.  */
  occurrence (basic_block bb, struct occurrence *children)
  : bb (bb), children (children)
  {
    bb->aux = this;
  }

  /* Destroy a struct occurrence and remove it from its basic block.  */
  ~occurrence ()
  {
    bb->aux = nullptr;
  }

  /* Allocate memory for a struct occurrence from OCC_POOL.  */
  static void* operator new (size_t);

  /* Return memory for a struct occurrence to OCC_POOL.  */
  static void operator delete (void*, size_t);
};
static struct
{
  /* Number of 1.0/X ops inserted.  */
  int rdivs_inserted;

  /* Number of 1.0/FUNC ops inserted.  */
  int rfuncs_inserted;
} reciprocal_stats;

static struct
{
  /* Number of cexpi calls inserted.  */
  int inserted;
} sincos_stats;

static struct
{
  /* Number of widening multiplication ops inserted.  */
  int widen_mults_inserted;

  /* Number of integer multiply-and-accumulate ops inserted.  */
  int maccs_inserted;

  /* Number of fp fused multiply-add ops inserted.  */
  int fmas_inserted;

  /* Number of divmod calls inserted.  */
  int divmod_calls_inserted;
} widen_mul_stats;
/* The instance of "struct occurrence" representing the highest
   interesting block in the dominator tree.  */
static struct occurrence *occ_head;

/* Allocation pool for getting instances of "struct occurrence".  */
static object_allocator<occurrence> *occ_pool;

void* occurrence::operator new (size_t n)
{
  gcc_assert (n == sizeof(occurrence));
  return occ_pool->allocate_raw ();
}

void occurrence::operator delete (void *occ, size_t n)
{
  gcc_assert (n == sizeof(occurrence));
  occ_pool->remove_raw (occ);
}
/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
   list of "struct occurrence"s, one per basic block, having IDOM as
   their common dominator.

   We try to insert NEW_OCC as deep as possible in the tree, and we also
   insert any other block that is a common dominator for BB and one
   block already in the tree.  */

static void
insert_bb (struct occurrence *new_occ, basic_block idom,
	   struct occurrence **p_head)
{
  struct occurrence *occ, **p_occ;

  for (p_occ = p_head; (occ = *p_occ) != NULL; )
    {
      basic_block bb = new_occ->bb, occ_bb = occ->bb;
      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
      if (dom == bb)
	{
	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
	     from its list.  */
	  *p_occ = occ->next;
	  occ->next = new_occ->children;
	  new_occ->children = occ;

	  /* Try the next block (it may as well be dominated by BB).  */
	}

      else if (dom == occ_bb)
	{
	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
	  insert_bb (new_occ, dom, &occ->children);
	  return;
	}

      else if (dom != idom)
	{
	  gcc_assert (!dom->aux);

	  /* There is a dominator between IDOM and BB, add it and make
	     two children out of NEW_OCC and OCC.  First, remove OCC from
	     its list.  */
	  *p_occ = occ->next;
	  new_occ->next = occ;
	  occ->next = NULL;

	  /* None of the previous blocks has DOM as a dominator: if we tail
	     recursed, we would reexamine them uselessly.  Just switch BB with
	     DOM, and go on looking for blocks dominated by DOM.  */
	  new_occ = new occurrence (dom, new_occ);
	}

      else
	{
	  /* Nothing special, go on with the next element.  */
	  p_occ = &occ->next;
	}
    }

  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
  new_occ->next = *p_head;
  *p_head = new_occ;
}
/* Register that we found a division in BB.
   IMPORTANCE is a measure of how much weighting to give
   that division.  Use IMPORTANCE = 2 to register a single
   division.  If the division is going to be found multiple
   times use 1 (as it is with squares).  */

static inline void
register_division_in (basic_block bb, int importance)
{
  struct occurrence *occ;

  occ = (struct occurrence *) bb->aux;
  if (!occ)
    {
      occ = new occurrence (bb, NULL);
      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
    }

  occ->bb_has_division = true;
  occ->num_divisions += importance;
}
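/* For example (an illustrative reading of the callers, not a quote from
   them): a single statement "t = a / x" in BB is registered once with
   IMPORTANCE == 2, while a division by a square, "t = a / (x * x)", is
   registered twice with IMPORTANCE == 1.  Either way BB's num_divisions
   ends up at 2 per division, which is why insert_reciprocals divides the
   count by two before comparing it against the target's threshold.  */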
/* Compute the number of divisions that postdominate each block in OCC and
   its children.  */

static void
compute_merit (struct occurrence *occ)
{
  struct occurrence *occ_child;
  basic_block dom = occ->bb;

  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    {
      basic_block bb;
      if (occ_child->children)
	compute_merit (occ_child);

      if (flag_exceptions)
	bb = single_noncomplex_succ (dom);
      else
	bb = dom;

      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
	occ->num_divisions += occ_child->num_divisions;
    }
}
/* Return whether USE_STMT is a floating-point division by DEF.  */
static inline bool
is_division_by (gimple *use_stmt, tree def)
{
  return is_gimple_assign (use_stmt)
	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
	 && gimple_assign_rhs2 (use_stmt) == def
	 /* Do not recognize x / x as valid division, as we are getting
	    confused later by replacing all immediate uses x in such
	    a stmt.  */
	 && gimple_assign_rhs1 (use_stmt) != def
	 && !stmt_can_throw_internal (cfun, use_stmt);
}
/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
static inline bool
is_mult_by (gimple *use_stmt, tree def, tree a)
{
  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
    {
      tree op0 = gimple_assign_rhs1 (use_stmt);
      tree op1 = gimple_assign_rhs2 (use_stmt);

      return (op0 == def && op1 == a)
	      || (op0 == a && op1 == def);
    }
  return false;
}

/* Return whether USE_STMT is DEF * DEF.  */
static inline bool
is_square_of (gimple *use_stmt, tree def)
{
  return is_mult_by (use_stmt, def, def);
}

/* Return whether USE_STMT is a floating-point division by
   DEF * DEF.  */
static inline bool
is_division_by_square (gimple *use_stmt, tree def)
{
  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
      && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
      && gimple_assign_rhs1 (use_stmt) != gimple_assign_rhs2 (use_stmt)
      && !stmt_can_throw_internal (cfun, use_stmt))
    {
      tree denominator = gimple_assign_rhs2 (use_stmt);
      if (TREE_CODE (denominator) == SSA_NAME)
	return is_square_of (SSA_NAME_DEF_STMT (denominator), def);
    }
  return false;
}
/* Walk the subset of the dominator tree rooted at OCC, setting the
   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
   the given basic block.  The field may be left NULL, of course,
   if it is not possible or profitable to do the optimization.

   DEF_GSI is an iterator pointing at the statement defining DEF.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.

   If SHOULD_INSERT_SQUARE_RECIP is set, then this also inserts
   the square of the reciprocal immediately after the definition
   of the reciprocal.  */

static void
insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
		    tree def, tree recip_def, tree square_recip_def,
		    int should_insert_square_recip, int threshold)
{
  tree type;
  gassign *new_stmt, *new_square_stmt;
  gimple_stmt_iterator gsi;
  struct occurrence *occ_child;

  if (!recip_def
      && (occ->bb_has_division || !flag_trapping_math)
      /* Divide by two as all divisions are counted twice in
	 the costing loop.  */
      && occ->num_divisions / 2 >= threshold)
    {
      /* Make a variable with the replacement and substitute it.  */
      type = TREE_TYPE (def);
      recip_def = create_tmp_reg (type, "reciptmp");
      new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
				      build_one_cst (type), def);

      if (should_insert_square_recip)
	{
	  square_recip_def = create_tmp_reg (type, "powmult_reciptmp");
	  new_square_stmt = gimple_build_assign (square_recip_def, MULT_EXPR,
						 recip_def, recip_def);
	}

      if (occ->bb_has_division)
	{
	  /* Case 1: insert before an existing division.  */
	  gsi = gsi_after_labels (occ->bb);
	  while (!gsi_end_p (gsi)
		 && !is_division_by (gsi_stmt (gsi), def)
		 && !is_division_by_square (gsi_stmt (gsi), def))
	    gsi_next (&gsi);

	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
	}
      else if (def_gsi && occ->bb == gsi_bb (*def_gsi))
	{
	  /* Case 2: insert right after the definition.  Note that this will
	     never happen if the definition statement can throw, because in
	     that case the sole successor of the statement's basic block will
	     dominate all the uses as well.  */
	  gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_after (def_gsi, new_square_stmt, GSI_NEW_STMT);
	}
      else
	{
	  /* Case 3: insert in a basic block not containing defs/uses.  */
	  gsi = gsi_after_labels (occ->bb);
	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
	}

      reciprocal_stats.rdivs_inserted++;

      occ->recip_def_stmt = new_stmt;
    }

  occ->recip_def = recip_def;
  occ->square_recip_def = square_recip_def;
  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    insert_reciprocals (def_gsi, occ_child, def, recip_def,
			square_recip_def, should_insert_square_recip,
			threshold);
}
/* Replace occurrences of expr / (x * x) with expr * ((1 / x) * (1 / x)).
   Take as argument the use for (x * x).  */
static inline void
replace_reciprocal_squares (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb) && occ->square_recip_def
      && occ->recip_def)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      gimple_assign_set_rhs2 (use_stmt, occ->square_recip_def);
      SET_USE (use_p, occ->square_recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}

/* Replace the division at USE_P with a multiplication by the reciprocal, if
   possible.  */

static inline void
replace_reciprocal (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb)
      && occ->recip_def && use_stmt != occ->recip_def_stmt)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      SET_USE (use_p, occ->recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}
/* Free OCC and return one more "struct occurrence" to be freed.  */

static struct occurrence *
free_bb (struct occurrence *occ)
{
  struct occurrence *child, *next;

  /* First get the two pointers hanging off OCC.  */
  next = occ->next;
  child = occ->children;
  delete occ;

  /* Now ensure that we don't recurse unless it is necessary.  */
  if (!child)
    return next;
  else
    {
      while (next)
	next = free_bb (next);

      return child;
    }
}
/* Transform sequences like
     t = sqrt (a)
     x = 1.0 / t;
     r1 = x * x;
     r2 = a * x;
   into:
     t = sqrt (a)
     r1 = 1.0 / a;
     r2 = t;
     x = r1 * r2;
   depending on the uses of x, r1, r2.  This removes one multiplication and
   allows the sqrt and division operations to execute in parallel.
   DEF_GSI is the gsi of the initial division by sqrt that defines
   DEF (x in the example above).  */

static void
optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
{
  gimple *use_stmt;
  imm_use_iterator use_iter;
  gimple *stmt = gsi_stmt (*def_gsi);
  tree x = def;
  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
  tree div_rhs1 = gimple_assign_rhs1 (stmt);

  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
      || TREE_CODE (div_rhs1) != REAL_CST
      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
    return;

  gcall *sqrt_stmt
    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));

  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
    return;

  switch (gimple_call_combined_fn (sqrt_stmt))
    {
    CASE_CFN_SQRT:
    CASE_CFN_SQRT_FN:
      break;

    default:
      return;
    }
  tree a = gimple_call_arg (sqrt_stmt, 0);

  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */

  /* Statements that use x in x * x.  */
  auto_vec<gimple *> sqr_stmts;
  /* Statements that use x in a * x.  */
  auto_vec<gimple *> mult_stmts;
  bool has_other_use = false;
  bool mult_on_main_path = false;

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
    {
      if (is_gimple_debug (use_stmt))
	continue;
      if (is_square_of (use_stmt, x))
	{
	  sqr_stmts.safe_push (use_stmt);
	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
	    mult_on_main_path = true;
	}
      else if (is_mult_by (use_stmt, x, a))
	{
	  mult_stmts.safe_push (use_stmt);
	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
	    mult_on_main_path = true;
	}
      else
	has_other_use = true;
    }

  /* In the x * x and a * x cases we just rewire stmt operands or
     remove multiplications.  In the has_other_use case we introduce
     a multiplication so make sure we don't introduce a multiplication
     on a path where there was none.  */
  if (has_other_use && !mult_on_main_path)
    return;

  if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
    return;

  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
     to be able to compose it from the sqr and mult cases.  */
  if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
    return;

  if (dump_file)
    {
      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
      fprintf (dump_file, "\n");
    }

  bool delete_div = !has_other_use;
  tree sqr_ssa_name = NULL_TREE;
  if (!sqr_stmts.is_empty ())
    {
      /* r1 = x * x.  Transform the original
	 x = 1.0 / t
	 into
	 tmp1 = 1.0 / a
	 r1 = tmp1.  */

      sqr_ssa_name
	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");

      if (dump_file)
	{
	  fprintf (dump_file, "Replacing original division\n");
	  print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
	  fprintf (dump_file, "with new division\n");
	}
      stmt
	= gimple_build_assign (sqr_ssa_name, gimple_assign_rhs_code (stmt),
			       gimple_assign_rhs1 (stmt), a);
      gsi_insert_before (def_gsi, stmt, GSI_SAME_STMT);
      gsi_remove (def_gsi, true);
      *def_gsi = gsi_for_stmt (stmt);
      fold_stmt_inplace (def_gsi);
      update_stmt (stmt);

      if (dump_file)
	print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);

      delete_div = false;
      gimple *sqr_stmt;
      unsigned int i;
      FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
	{
	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
	  update_stmt (sqr_stmt);
	}
    }
  if (!mult_stmts.is_empty ())
    {
      /* r2 = a * x.  Transform this into:
	 r2 = t (the original sqrt (a)).  */
      unsigned int i;
      gimple *mult_stmt = NULL;
      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
	{
	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);

	  if (dump_file)
	    {
	      fprintf (dump_file, "Replacing squaring multiplication\n");
	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
	      fprintf (dump_file, "with assignment\n");
	    }
	  gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
	  fold_stmt_inplace (&gsi2);
	  update_stmt (mult_stmt);
	  if (dump_file)
	    print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
	}
    }

  if (has_other_use)
    {
      /* Using the two temporaries tmp1, tmp2 from above
	 the original x is now:
	 x = tmp1 * tmp2.  */
      gcc_assert (orig_sqrt_ssa_name);
      gcc_assert (sqr_ssa_name);

      gimple *new_stmt
	= gimple_build_assign (x, MULT_EXPR,
			       orig_sqrt_ssa_name, sqr_ssa_name);
      gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
      update_stmt (stmt);
    }
  else if (delete_div)
    {
      /* Remove the original division.  */
      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
      gsi_remove (&gsi2, true);
      release_defs (stmt);
    }
  else
    release_ssa_name (x);
}
/* Look for floating-point divisions among DEF's uses, and try to
   replace them by multiplications with the reciprocal.  Add
   as many statements computing the reciprocal as needed.

   DEF must be a GIMPLE register of a floating-point type.  */
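/* As an illustration (a sketch of the intended effect when the target's
   threshold is met, not literal pass output), for

     d = x * x;
     t0 = a / d;
     t1 = b / d;
     t2 = c / x;

   the pass can insert "reciptmp = 1.0 / x" and
   "powmult_reciptmp = reciptmp * reciptmp" and rewrite the divisions as

     t0 = a * powmult_reciptmp;
     t1 = b * powmult_reciptmp;
     t2 = c * reciptmp;

   so that only one division remains.  */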
static void
execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
{
  use_operand_p use_p, square_use_p;
  imm_use_iterator use_iter, square_use_iter;
  tree square_def;
  struct occurrence *occ;
  int count = 0;
  int threshold;
  int square_recip_count = 0;
  int sqrt_recip_count = 0;

  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));

  /* If DEF is a square (x * x), count the number of divisions by x.
     If there are more divisions by x than by (DEF * DEF), prefer to optimize
     the reciprocal of x instead of DEF.  This improves cases like:
       def = x * x
       t0 = a / def
       t1 = b / def
       t2 = c / x
     Reciprocal optimization of x results in 1 division rather than 2 or 3.  */
  gimple *def_stmt = SSA_NAME_DEF_STMT (def);

  if (is_gimple_assign (def_stmt)
      && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
      && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
      && gimple_assign_rhs1 (def_stmt) == gimple_assign_rhs2 (def_stmt))
    {
      tree op0 = gimple_assign_rhs1 (def_stmt);

      FOR_EACH_IMM_USE_FAST (use_p, use_iter, op0)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_division_by (use_stmt, op0))
	    sqrt_recip_count++;
	}
    }

  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_division_by (use_stmt, def))
	{
	  register_division_in (gimple_bb (use_stmt), 2);
	  count++;
	}

      if (is_square_of (use_stmt, def))
	{
	  square_def = gimple_assign_lhs (use_stmt);
	  FOR_EACH_IMM_USE_FAST (square_use_p, square_use_iter, square_def)
	    {
	      gimple *square_use_stmt = USE_STMT (square_use_p);
	      if (is_division_by (square_use_stmt, square_def))
		{
		  /* This is executed twice for each division by a square.  */
		  register_division_in (gimple_bb (square_use_stmt), 1);
		  square_recip_count++;
		}
	    }
	}
    }

  /* Square reciprocals were counted twice above.  */
  square_recip_count /= 2;

  /* If it is more profitable to optimize 1 / x, don't optimize 1 / (x * x).  */
  if (sqrt_recip_count > square_recip_count)
    goto out;

  /* Do the expensive part only if we can hope to optimize something.  */
  if (count + square_recip_count >= threshold && count >= 1)
    {
      gimple *use_stmt;
      for (occ = occ_head; occ; occ = occ->next)
	{
	  compute_merit (occ);
	  insert_reciprocals (def_gsi, occ, def, NULL, NULL,
			      square_recip_count, threshold);
	}

      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
	{
	  if (is_division_by (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		replace_reciprocal (use_p);
	    }
	  else if (square_recip_count > 0 && is_square_of (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		{
		  /* Find all uses of the square that are divisions and
		     replace them by multiplications with the inverse.  */
		  imm_use_iterator square_iterator;
		  gimple *powmult_use_stmt = USE_STMT (use_p);
		  tree powmult_def_name = gimple_assign_lhs (powmult_use_stmt);

		  FOR_EACH_IMM_USE_STMT (powmult_use_stmt,
					 square_iterator, powmult_def_name)
		    FOR_EACH_IMM_USE_ON_STMT (square_use_p, square_iterator)
		      {
			gimple *powmult_use_stmt = USE_STMT (square_use_p);
			if (is_division_by (powmult_use_stmt, powmult_def_name))
			  replace_reciprocal_squares (square_use_p);
		      }
		}
	    }
	}
    }

out:
  for (occ = occ_head; occ; )
    occ = free_bb (occ);

  occ_head = NULL;
}
/* Return an internal function that implements the reciprocal of CALL,
   or IFN_LAST if there is no such function that the target supports.  */

internal_fn
internal_fn_reciprocal (gcall *call)
{
  internal_fn ifn;

  switch (gimple_call_combined_fn (call))
    {
    CASE_CFN_SQRT:
    CASE_CFN_SQRT_FN:
      ifn = IFN_RSQRT;
      break;

    default:
      return IFN_LAST;
    }

  tree_pair types = direct_internal_fn_types (ifn, call);
  if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
    return IFN_LAST;

  return ifn;
}
/* Go through all the floating-point SSA_NAMEs, and call
   execute_cse_reciprocals_1 on each of them.  */
namespace {

const pass_data pass_data_cse_reciprocals =
{
  GIMPLE_PASS, /* type */
  "recip", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_TREE_RECIP, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_cse_reciprocals : public gimple_opt_pass
{
public:
  pass_cse_reciprocals (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
  virtual unsigned int execute (function *);

}; // class pass_cse_reciprocals
unsigned int
pass_cse_reciprocals::execute (function *fun)
{
  basic_block bb;
  tree arg;

  occ_pool = new object_allocator<occurrence> ("dominators for recip");

  memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
  calculate_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_POST_DOMINATORS);

  if (flag_checking)
    FOR_EACH_BB_FN (bb, fun)
      gcc_assert (!bb->aux);

  for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
    if (FLOAT_TYPE_P (TREE_TYPE (arg))
	&& is_gimple_reg (arg))
      {
	tree name = ssa_default_def (fun, arg);
	if (name)
	  execute_cse_reciprocals_1 (NULL, name);
      }

  FOR_EACH_BB_FN (bb, fun)
    {
      tree def;

      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  def = PHI_RESULT (phi);
	  if (! virtual_operand_p (def)
	      && FLOAT_TYPE_P (TREE_TYPE (def)))
	    execute_cse_reciprocals_1 (NULL, def);
	}

      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_has_lhs (stmt)
	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
	      && FLOAT_TYPE_P (TREE_TYPE (def))
	      && TREE_CODE (def) == SSA_NAME)
	    {
	      execute_cse_reciprocals_1 (&gsi, def);
	      stmt = gsi_stmt (gsi);
	      if (flag_unsafe_math_optimizations
		  && is_gimple_assign (stmt)
		  && gimple_assign_lhs (stmt) == def
		  && !stmt_can_throw_internal (cfun, stmt)
		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
		optimize_recip_sqrt (&gsi, def);
	    }
	}

      if (optimize_bb_for_size_p (bb))
	continue;

      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (is_gimple_assign (stmt)
	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
	    {
	      tree arg1 = gimple_assign_rhs2 (stmt);
	      gimple *stmt1;

	      if (TREE_CODE (arg1) != SSA_NAME)
		continue;

	      stmt1 = SSA_NAME_DEF_STMT (arg1);

	      if (is_gimple_call (stmt1)
		  && gimple_call_lhs (stmt1))
		{
		  bool fail;
		  imm_use_iterator ui;
		  use_operand_p use_p;
		  tree fndecl = NULL_TREE;

		  gcall *call = as_a <gcall *> (stmt1);
		  internal_fn ifn = internal_fn_reciprocal (call);
		  if (ifn == IFN_LAST)
		    {
		      fndecl = gimple_call_fndecl (call);
		      if (!fndecl
			  || !fndecl_built_in_p (fndecl, BUILT_IN_MD))
			continue;
		      fndecl = targetm.builtin_reciprocal (fndecl);
		      if (!fndecl)
			continue;
		    }

		  /* Check that all uses of the SSA name are divisions,
		     otherwise replacing the defining statement will do
		     the wrong thing.  */
		  fail = false;
		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
		    {
		      gimple *stmt2 = USE_STMT (use_p);
		      if (is_gimple_debug (stmt2))
			continue;
		      if (!is_gimple_assign (stmt2)
			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
			  || gimple_assign_rhs1 (stmt2) == arg1
			  || gimple_assign_rhs2 (stmt2) != arg1)
			{
			  fail = true;
			  break;
			}
		    }
		  if (fail)
		    continue;

		  gimple_replace_ssa_lhs (call, arg1);
		  if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
		    {
		      auto_vec<tree, 4> args;
		      for (unsigned int i = 0;
			   i < gimple_call_num_args (call); i++)
			args.safe_push (gimple_call_arg (call, i));
		      gcall *stmt2;
		      if (ifn == IFN_LAST)
			stmt2 = gimple_build_call_vec (fndecl, args);
		      else
			stmt2 = gimple_build_call_internal_vec (ifn, args);
		      gimple_call_set_lhs (stmt2, arg1);
		      gimple_move_vops (stmt2, call);
		      gimple_call_set_nothrow (stmt2,
					       gimple_call_nothrow_p (call));
		      gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
		      gsi_replace (&gsi2, stmt2, true);
		    }
		  else
		    {
		      if (ifn == IFN_LAST)
			gimple_call_set_fndecl (call, fndecl);
		      else
			gimple_call_set_internal_fn (call, ifn);
		      update_stmt (call);
		    }
		  reciprocal_stats.rfuncs_inserted++;

		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
		    {
		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
		      fold_stmt_inplace (&gsi);
		      update_stmt (stmt);
		    }
		}
	    }
	}
    }

  statistics_counter_event (fun, "reciprocal divs inserted",
			    reciprocal_stats.rdivs_inserted);
  statistics_counter_event (fun, "reciprocal functions inserted",
			    reciprocal_stats.rfuncs_inserted);

  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);
  delete occ_pool;
  return 0;
}
} // anon namespace

gimple_opt_pass *
make_pass_cse_reciprocals (gcc::context *ctxt)
{
  return new pass_cse_reciprocals (ctxt);
}
/* Records an occurrence at statement USE_STMT in the vector of statements
   STMTS if it is dominated by *TOP_BB or dominates it or this basic block
   is not yet initialized.  Returns true if the occurrence was pushed on
   the vector.  Adjusts *TOP_BB to be the basic block dominating all
   statements in the vector.  */

static bool
maybe_record_sincos (vec<gimple *> *stmts,
		     basic_block *top_bb, gimple *use_stmt)
{
  basic_block use_bb = gimple_bb (use_stmt);
  if (*top_bb
      && (*top_bb == use_bb
	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
    stmts->safe_push (use_stmt);
  else if (!*top_bb
	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
    {
      stmts->safe_push (use_stmt);
      *top_bb = use_bb;
    }
  else
    return false;

  return true;
}
/* Look for sin, cos and cexpi calls with the same argument NAME and
   create a single call to cexpi CSEing the result in this case.
   We first walk over all immediate uses of the argument collecting
   statements that we can CSE in a vector and in a second pass replace
   the statement rhs with a REALPART or IMAGPART expression on the
   result of the cexpi call we insert before the use statement that
   dominates all other candidates.  */
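/* For example (a sketch of the intended effect, not literal dump output):

     c = cos (x);              sincostmp = cexpi (x);
     s = sin (x);      =>      c = REALPART_EXPR <sincostmp>;
                               s = IMAGPART_EXPR <sincostmp>;

   so that the sin and cos calls share a single evaluation.  */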
static bool
execute_cse_sincos_1 (tree name)
{
  gimple_stmt_iterator gsi;
  imm_use_iterator use_iter;
  tree fndecl, res, type = NULL_TREE;
  gimple *def_stmt, *use_stmt, *stmt;
  int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
  auto_vec<gimple *> stmts;
  basic_block top_bb = NULL;
  int i;
  bool cfg_changed = false;

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
    {
      if (gimple_code (use_stmt) != GIMPLE_CALL
	  || !gimple_call_lhs (use_stmt))
	continue;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_SIN:
	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_CEXPI:
	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	default:
	  continue;
	}

      tree t = mathfn_built_in_type (gimple_call_combined_fn (use_stmt));
      if (!type)
	{
	  type = t;
	  t = TREE_TYPE (name);
	}
      /* This checks that NAME has the right type in the first round,
	 and, in subsequent rounds, that the built_in type is the same
	 type, or a compatible type.  */
      if (type != t && !types_compatible_p (type, t))
	return false;
    }
  if (seen_cos + seen_sin + seen_cexpi <= 1)
    return false;

  /* Simply insert cexpi at the beginning of top_bb but not earlier than
     the name def statement.  */
  fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
  if (!fndecl)
    return false;
  stmt = gimple_build_call (fndecl, 1, name);
  res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
  gimple_call_set_lhs (stmt, res);

  def_stmt = SSA_NAME_DEF_STMT (name);
  if (!SSA_NAME_IS_DEFAULT_DEF (name)
      && gimple_code (def_stmt) != GIMPLE_PHI
      && gimple_bb (def_stmt) == top_bb)
    {
      gsi = gsi_for_stmt (def_stmt);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
    }
  else
    {
      gsi = gsi_after_labels (top_bb);
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
    }
  sincos_stats.inserted++;

  /* And adjust the recorded old call sites.  */
  for (i = 0; stmts.iterate (i, &use_stmt); ++i)
    {
      tree rhs = NULL;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  rhs = fold_build1 (REALPART_EXPR, type, res);
	  break;

	CASE_CFN_SIN:
	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
	  break;

	CASE_CFN_CEXPI:
	  rhs = res;
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Replace call with a copy.  */
      stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);

      gsi = gsi_for_stmt (use_stmt);
      gsi_replace (&gsi, stmt, true);
      if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
	cfg_changed = true;
    }

  return cfg_changed;
}
/* To evaluate powi(x,n), the floating point value x raised to the
   constant integer exponent n, we use a hybrid algorithm that
   combines the "window method" with look-up tables.  For an
   introduction to exponentiation algorithms and "addition chains",
   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */

/* Provide a default value for POWI_MAX_MULTS, the maximum number of
   multiplications to inline before calling the system library's pow
   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
   so this default never requires calling pow, powf or powl.  */

#ifndef POWI_MAX_MULTS
#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
#endif

/* The size of the "optimal power tree" lookup table.  All
   exponents less than this value are simply looked up in the
   powi_table below.  This threshold is also used to size the
   cache of pseudo registers that hold intermediate results.  */
#define POWI_TABLE_SIZE 256

/* The size, in bits of the window, used in the "window method"
   exponentiation algorithm.  This is equivalent to a radix of
   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
#define POWI_WINDOW_SIZE 3

/* The following table is an efficient representation of an
   "optimal power tree".  For each value, i, the corresponding
   value, j, in the table states that an optimal evaluation
   sequence for calculating pow(x,i) can be found by evaluating
   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
   100 integers is given in Knuth's "Seminumerical algorithms".  */
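/* As an illustrative reading of the table: powi_table[5] == 3, so
   pow(x,5) is best computed as pow(x,3) * pow(x,2), which takes three
   multiplications in total: t1 = x*x; t2 = t1*x; result = t2*t1.  */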
static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };
/* Return the number of multiplications required to calculate
   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
   subroutine of powi_cost.  CACHE is an array indicating
   which exponents have already been calculated.  */

static int
powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
{
  /* If we've already calculated this exponent, then this evaluation
     doesn't require any additional multiplications.  */
  if (cache[n])
    return 0;

  cache[n] = true;
  return powi_lookup_cost (n - powi_table[n], cache)
	 + powi_lookup_cost (powi_table[n], cache) + 1;
}
/* Return the number of multiplications required to calculate
   powi(x,n) for an arbitrary x, given the exponent N.  This
   function needs to be kept in sync with powi_as_mults below.  */

static int
powi_cost (HOST_WIDE_INT n)
{
  bool cache[POWI_TABLE_SIZE];
  unsigned HOST_WIDE_INT digit;
  unsigned HOST_WIDE_INT val;
  int result;

  if (n == 0)
    return 0;

  /* Ignore the reciprocal when calculating the cost.  */
  val = (n < 0) ? -n : n;

  /* Initialize the exponent cache.  */
  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
  cache[1] = true;

  result = 0;

  while (val >= POWI_TABLE_SIZE)
    {
      if (val & 1)
	{
	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
	  result += powi_lookup_cost (digit, cache)
		    + POWI_WINDOW_SIZE + 1;
	  val >>= POWI_WINDOW_SIZE;
	}
      else
	{
	  val >>= 1;
	  result++;
	}
    }

  return result + powi_lookup_cost (val, cache);
}
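/* A worked example (illustrative): powi_cost (5) is 3.  Since 5 is below
   POWI_TABLE_SIZE the while loop is skipped, and the cost is just
   powi_lookup_cost (5): powi_table[5] == 3 gives the chain x**2, x**3
   and finally x**5 = x**3 * x**2, i.e. three multiplications.  */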
/* Recursive subroutine of powi_as_mults.  This function takes the
   array, CACHE, of already calculated exponents and an exponent N and
   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */

static tree
powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
		 HOST_WIDE_INT n, tree *cache)
{
  tree op0, op1, ssa_target;
  unsigned HOST_WIDE_INT digit;
  gassign *mult_stmt;

  if (n < POWI_TABLE_SIZE && cache[n])
    return cache[n];

  ssa_target = make_temp_ssa_name (type, NULL, "powmult");

  if (n < POWI_TABLE_SIZE)
    {
      cache[n] = ssa_target;
      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
    }
  else if (n & 1)
    {
      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
    }
  else
    {
      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
      op1 = op0;
    }

  mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
  gimple_set_location (mult_stmt, loc);
  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);

  return ssa_target;
}
/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
   This function needs to be kept in sync with powi_cost above.  */

static tree
powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
	       tree arg0, HOST_WIDE_INT n)
{
  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
  gassign *div_stmt;
  tree target;

  if (n == 0)
    return build_real (type, dconst1);

  memset (cache, 0, sizeof (cache));
  cache[1] = arg0;

  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
  if (n >= 0)
    return result;

  /* If the original exponent was negative, reciprocate the result.  */
  target = make_temp_ssa_name (type, NULL, "powmult");
  div_stmt = gimple_build_assign (target, RDIV_EXPR,
				  build_real (type, dconst1), result);
  gimple_set_location (div_stmt, loc);
  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);

  return target;
}
/* ARG0 and N are the two arguments to a powi builtin in GSI with
   location info LOC.  If the arguments are appropriate, create an
   equivalent sequence of statements prior to GSI using an optimal
   number of multiplications, and return an expression holding the
   result.  */

static tree
gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
			    tree arg0, HOST_WIDE_INT n)
{
  /* Avoid largest negative number.  */
  if (n != -n
      && ((n >= -1 && n <= 2)
	  || (optimize_function_for_speed_p (cfun)
	      && powi_cost (n) <= POWI_MAX_MULTS)))
    return powi_as_mults (gsi, loc, arg0, n);

  return NULL_TREE;
}
/* Build a gimple call statement that calls FN with argument ARG.
   Set the lhs of the call statement to a fresh SSA name.  Insert the
   statement prior to GSI's current position, and return the fresh
   SSA name.  */

static tree
build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
		       tree fn, tree arg)
{
  gcall *call_stmt;
  tree ssa_target;

  call_stmt = gimple_build_call (fn, 1, arg);
  ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
  gimple_set_lhs (call_stmt, ssa_target);
  gimple_set_location (call_stmt, loc);
  gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);

  return ssa_target;
}
/* Build a gimple binary operation with the given CODE and arguments
   ARG0, ARG1, assigning the result to a new SSA name named NAME.
   Insert the statement prior to GSI's current position, and
   return the fresh SSA name.  */

static tree
build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
			const char *name, enum tree_code code,
			tree arg0, tree arg1)
{
  tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
  gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}
/* Build a gimple reference operation with the given CODE and argument
   ARG, assigning the result to a new SSA name of TYPE with NAME.
   Insert the statement prior to GSI's current position, and return
   the fresh SSA name.  */

static inline tree
build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
		      const char *name, enum tree_code code, tree arg0)
{
  tree result = make_temp_ssa_name (type, NULL, name);
  gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
   prior to GSI's current position, and return the fresh SSA name.  */

static tree
build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
		       tree type, tree val)
{
  tree result = make_ssa_name (type);
  gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}
struct pow_synth_sqrt_info
{
  bool *factors;
  unsigned int deepest;
  unsigned int num_mults;
};
/* Return true iff the real value C can be represented as a
   sum of powers of 0.5 up to N.  That is:
   C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
   Record in INFO the various parameters of the synthesis algorithm such
   as the factors a[i], the maximum 0.5 power and the number of
   multiplications that will be required.  */
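/* For example (illustrative): C == 0.625 == 0.5 + 0.125
   == 1*(0.5**1) + 0*(0.5**2) + 1*(0.5**3), so for N >= 3 this returns
   true with factors == {1, 0, 1}, deepest == 3 and num_mults == 1
   (one multiplication combines sqrt(x) with sqrt(sqrt(sqrt(x)))).  */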
bool
representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
				struct pow_synth_sqrt_info *info)
{
  REAL_VALUE_TYPE factor = dconsthalf;
  REAL_VALUE_TYPE remainder = c;

  info->deepest = 0;
  info->num_mults = 0;
  memset (info->factors, 0, n * sizeof (bool));

  for (unsigned i = 0; i < n; i++)
    {
      REAL_VALUE_TYPE res;

      /* If something inexact happened bail out now.  */
      if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
	return false;

      /* We have hit zero.  The number is representable as a sum
	 of powers of 0.5.  */
      if (real_equal (&res, &dconst0))
	{
	  info->factors[i] = true;
	  info->deepest = i + 1;
	  return true;
	}
      else if (!REAL_VALUE_NEGATIVE (res))
	{
	  remainder = res;
	  info->factors[i] = true;
	  info->num_mults++;
	}
      else
	info->factors[i] = false;

      real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
    }
  return false;
}
/* Return the tree corresponding to FN being applied
   to ARG N times at GSI and LOC.
   Look up previous results from CACHE if need be.
   cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times.  */

static tree
get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
	      tree fn, location_t loc, tree *cache)
{
  tree res = cache[n];
  if (!res)
    {
      tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
      res = build_and_insert_call (gsi, loc, fn, prev);
      cache[n] = res;
    }

  return res;
}
/* Print to STREAM the repeated application of function FNAME to ARG
   N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
   "foo (foo (x))".  */

static void
print_nested_fn (FILE* stream, const char *fname, const char* arg,
		 unsigned int n)
{
  if (n == 0)
    fprintf (stream, "%s", arg);
  else
    {
      fprintf (stream, "%s (", fname);
      print_nested_fn (stream, fname, arg, n - 1);
      fprintf (stream, ")");
    }
}
/* Print to STREAM the fractional sequence of sqrt chains
   applied to ARG, described by INFO.  Used for the dump file.  */

static void
dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
			       struct pow_synth_sqrt_info *info)
{
  for (unsigned int i = 0; i < info->deepest; i++)
    {
      bool is_set = info->factors[i];
      if (is_set)
	{
	  print_nested_fn (stream, "sqrt", arg, i + 1);
	  if (i != info->deepest - 1)
	    fprintf (stream, " * ");
	}
    }
}

/* Print to STREAM a representation of raising ARG to an integer
   power N.  Used for the dump file.  */

static void
dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
{
  if (n > 1)
    fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
  else if (n == 1)
    fprintf (stream, "%s", arg);
}
/* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
   square roots.  Place at GSI and LOC.  Limit the maximum depth
   of the sqrt chains to MAX_DEPTH.  Return the tree holding the
   result of the expanded sequence or NULL_TREE if the expansion failed.

   This routine assumes that ARG1 is a real number with a fractional part
   (the integer exponent case will have been handled earlier in
   gimple_expand_builtin_pow).

   For ARG1 > 0.0:
   * For ARG1 composed of a whole part WHOLE_PART and a fractional part
     FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
     FRAC_PART == ARG1 - WHOLE_PART:
     Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
     POW (ARG0, FRAC_PART) is expanded as a product of square root chains
     if it can be expressed as such, that is if FRAC_PART satisfies:
     FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
     where integer a[i] is either 0 or 1.

     Example:
     POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
     --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))

   For ARG1 < 0.0 there are two approaches:
   * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
	 is calculated as above.

     Example:
     POW (x, -5.625) == 1.0 / POW (x, 5.625)
     --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))

   * (B) : WHOLE_PART := - ceil (abs (ARG1))
	   FRAC_PART := ARG1 - WHOLE_PART
	   and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
     Example:
     POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
     --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))

   For ARG1 < 0.0 we choose between (A) and (B) depending on
   how many multiplications we'd have to do.
   So, for the example in (B): POW (x, -5.875), if we were to
   follow algorithm (A) we would produce:
   1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
   which contains more multiplications than approach (B).

   Hopefully, this approach will eliminate potentially expensive POW library
   calls when unsafe floating point math is enabled and allow the compiler to
   further optimise the multiplies, square roots and divides produced by this
   function.  */
static tree
expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
		     tree arg0, tree arg1, HOST_WIDE_INT max_depth)
{
  tree type = TREE_TYPE (arg0);
  machine_mode mode = TYPE_MODE (type);
  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
  bool one_over = true;

  if (!sqrtfn)
    return NULL_TREE;

  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);

  gcc_assert (max_depth > 0);
  tree *cache = XALLOCAVEC (tree, max_depth + 1);

  struct pow_synth_sqrt_info synth_info;
  synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
  synth_info.deepest = 0;
  synth_info.num_mults = 0;

  bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
  REAL_VALUE_TYPE exp = real_value_abs (&exp_init);

  /* The whole and fractional parts of exp.  */
  REAL_VALUE_TYPE whole_part;
  REAL_VALUE_TYPE frac_part;

  real_floor (&whole_part, mode, &exp);
  real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);

  REAL_VALUE_TYPE ceil_whole = dconst0;
  REAL_VALUE_TYPE ceil_fract = dconst0;

  if (neg_exp)
    {
      real_ceil (&ceil_whole, mode, &exp);
      real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
    }

  if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
    return NULL_TREE;

  /* Check whether it's more profitable to not use 1.0 / ...  */
  if (neg_exp)
    {
      struct pow_synth_sqrt_info alt_synth_info;
      alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
      alt_synth_info.deepest = 0;
      alt_synth_info.num_mults = 0;

      if (representable_as_half_series_p (ceil_fract, max_depth,
					  &alt_synth_info)
	  && alt_synth_info.deepest <= synth_info.deepest
	  && alt_synth_info.num_mults < synth_info.num_mults)
	{
	  whole_part = ceil_whole;
	  frac_part = ceil_fract;
	  synth_info.deepest = alt_synth_info.deepest;
	  synth_info.num_mults = alt_synth_info.num_mults;
	  memcpy (synth_info.factors, alt_synth_info.factors,
		  (max_depth + 1) * sizeof (bool));
	  one_over = false;
	}
    }

  HOST_WIDE_INT n = real_to_integer (&whole_part);
  REAL_VALUE_TYPE cint;
  real_from_integer (&cint, VOIDmode, n, SIGNED);

  if (!real_identical (&whole_part, &cint))
    return NULL_TREE;

  if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
    return NULL_TREE;

  memset (cache, 0, (max_depth + 1) * sizeof (tree));

  tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;

  /* Calculate the integer part of the exponent.  */
  if (n > 1)
    {
      integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
      if (!integer_res)
	return NULL_TREE;
    }

  if (dump_file)
    {
      char string[64];

      real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
      fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);

      if (neg_exp)
	{
	  if (one_over)
	    {
	      fprintf (dump_file, "1.0 / (");
	      dump_integer_part (dump_file, "x", n);
	      if (n > 0)
		fprintf (dump_file, " * ");
	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	      fprintf (dump_file, ")");
	    }
	  else
	    {
	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	      fprintf (dump_file, " / (");
	      dump_integer_part (dump_file, "x", n);
	      fprintf (dump_file, ")");
	    }
	}
      else
	{
	  dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	  if (n > 0)
	    fprintf (dump_file, " * ");
	  dump_integer_part (dump_file, "x", n);
	}

      fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
    }

  tree fract_res = NULL_TREE;
  cache[0] = arg0;

  /* Calculate the fractional part of the exponent.  */
  for (unsigned i = 0; i < synth_info.deepest; i++)
    {
      if (synth_info.factors[i])
	{
	  tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);

	  if (!fract_res)
	    fract_res = sqrt_chain;
	  else
	    fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
						fract_res, sqrt_chain);
	}
    }

  tree res = NULL_TREE;

  if (neg_exp)
    {
      if (one_over)
	{
	  if (n > 0)
	    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
					  fract_res, integer_res);
	  else
	    res = fract_res;

	  res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
					build_real (type, dconst1), res);
	}
      else
	res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
				      fract_res, integer_res);
    }
  else
    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
				  fract_res, integer_res);
  return res;
}
1894 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1895 with location info LOC. If possible, create an equivalent and
1896 less expensive sequence of statements prior to GSI, and return an
1897 expression holding the result. */
1899 static tree
1900 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1901 tree arg0, tree arg1)
1903 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1904 REAL_VALUE_TYPE c2, dconst3;
1905 HOST_WIDE_INT n;
1906 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1907 machine_mode mode;
1908 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1909 bool hw_sqrt_exists, c_is_int, c2_is_int;
1911 dconst1_4 = dconst1;
1912 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1914 /* If the exponent isn't a constant, there's nothing of interest
1915 to be done. */
1916 if (TREE_CODE (arg1) != REAL_CST)
1917 return NULL_TREE;
1919 /* Don't perform the operation if flag_signaling_nans is on
1920 and the operand is a signaling NaN. */
1921 if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1922 && ((TREE_CODE (arg0) == REAL_CST
1923 && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1924 || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1925 return NULL_TREE;
1927 /* If the exponent is equivalent to an integer, expand to an optimal
1928 multiplication sequence when profitable. */
1929 c = TREE_REAL_CST (arg1);
1930 n = real_to_integer (&c);
1931 real_from_integer (&cint, VOIDmode, n, SIGNED);
1932 c_is_int = real_identical (&c, &cint);
1934 if (c_is_int
1935 && ((n >= -1 && n <= 2)
1936 || (flag_unsafe_math_optimizations
1937 && speed_p
1938 && powi_cost (n) <= POWI_MAX_MULTS)))
1939 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1941 /* Attempt various optimizations using sqrt and cbrt. */
1942 type = TREE_TYPE (arg0);
1943 mode = TYPE_MODE (type);
1944 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1946 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1947 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1948 sqrt(-0) = -0. */
1949 if (sqrtfn
1950 && real_equal (&c, &dconsthalf)
1951 && !HONOR_SIGNED_ZEROS (mode))
1952 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1954 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1956 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1957 optimizations since 1./3. is not exactly representable. If x
1958 is negative and finite, the correct value of pow(x,1./3.) is
1959 a NaN with the "invalid" exception raised, because the value
1960 of 1./3. actually has an even denominator. The correct value
1961 of cbrt(x) is a negative real value. */
1962 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1963 dconst1_3 = real_value_truncate (mode, dconst_third ());
1965 if (flag_unsafe_math_optimizations
1966 && cbrtfn
1967 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1968 && real_equal (&c, &dconst1_3))
1969 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1971 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1972 if we don't have a hardware sqrt insn. */
1973 dconst1_6 = dconst1_3;
1974 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1976 if (flag_unsafe_math_optimizations
1977 && sqrtfn
1978 && cbrtfn
1979 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1980 && speed_p
1981 && hw_sqrt_exists
1982 && real_equal (&c, &dconst1_6))
1984 /* sqrt(x) */
1985 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1987 /* cbrt(sqrt(x)) */
1988 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1992 /* Attempt to expand the POW as a product of square root chains.
1993 Expand the 0.25 case even when optimizing for size. */
1994 if (flag_unsafe_math_optimizations
1995 && sqrtfn
1996 && hw_sqrt_exists
1997 && (speed_p || real_equal (&c, &dconst1_4))
1998 && !HONOR_SIGNED_ZEROS (mode))
2000 unsigned int max_depth = speed_p
2001 ? param_max_pow_sqrt_depth
2002 : 2;
2004 tree expand_with_sqrts
2005 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
2007 if (expand_with_sqrts)
2008 return expand_with_sqrts;
2011 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
2012 n = real_to_integer (&c2);
2013 real_from_integer (&cint, VOIDmode, n, SIGNED);
2014 c2_is_int = real_identical (&c2, &cint);
2016 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
2018 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
2019 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
2021 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
2022 different from pow(x, 1./3.) due to rounding and behavior with
2023 negative x, we need to constrain this transformation to unsafe
2024 math and positive x or finite math. */
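  /* A worked instance of the transform described above, assuming unsafe
     math and nonnegative x: for pow (x, 5./3.) we get n = 5, so n/3 = 1
     and n%3 = 2, and the call becomes

       c = cbrt (x);
       res = x * (c * c);   x^1 * x^(2/3) == x^(5/3)

     For pow (x, -5./3.) the same product is built and then reciprocated
     as 1.0 / (x * cbrt (x) * cbrt (x)).  */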
2025 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
2026 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
2027 real_round (&c2, mode, &c2);
2028 n = real_to_integer (&c2);
2029 real_from_integer (&cint, VOIDmode, n, SIGNED);
2030 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
2031 real_convert (&c2, mode, &c2);
2033 if (flag_unsafe_math_optimizations
2034 && cbrtfn
2035 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
2036 && real_identical (&c2, &c)
2037 && !c2_is_int
2038 && optimize_function_for_speed_p (cfun)
2039 && powi_cost (n / 3) <= POWI_MAX_MULTS)
2041 tree powi_x_ndiv3 = NULL_TREE;
2043 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
2044 possible or profitable, give up. Skip the degenerate case when
2045 abs(n) < 3, where the result is always 1. */
2046 if (absu_hwi (n) >= 3)
2048 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
2049 abs_hwi (n / 3));
2050 if (!powi_x_ndiv3)
2051 return NULL_TREE;
2054 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
2055 as that creates an unnecessary variable. Instead, just produce
2056 either cbrt(x) or cbrt(x) * cbrt(x). */
2057 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
2059 if (absu_hwi (n) % 3 == 1)
2060 powi_cbrt_x = cbrt_x;
2061 else
2062 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2063 cbrt_x, cbrt_x);
2065 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
2066 if (absu_hwi (n) < 3)
2067 result = powi_cbrt_x;
2068 else
2069 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2070 powi_x_ndiv3, powi_cbrt_x);
2072 /* If n is negative, reciprocate the result. */
2073 if (n < 0)
2074 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
2075 build_real (type, dconst1), result);
2077 return result;
2080 /* No optimizations succeeded. */
2081 return NULL_TREE;
2084 /* ARG is the argument to a cabs builtin call in GSI with location info
2085 LOC. Create a sequence of statements prior to GSI that calculates
2086 sqrt(R*R + I*I), where R and I are the real and imaginary components
2087 of ARG, respectively. Return an expression holding the result. */
2089 static tree
2090 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
2092 tree real_part, imag_part, addend1, addend2, sum, result;
2093 tree type = TREE_TYPE (TREE_TYPE (arg));
2094 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
2095 machine_mode mode = TYPE_MODE (type);
2097 if (!flag_unsafe_math_optimizations
2098 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
2099 || !sqrtfn
2100 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
2101 return NULL_TREE;
2103 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
2104 REALPART_EXPR, arg);
2105 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2106 real_part, real_part);
2107 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
2108 IMAGPART_EXPR, arg);
2109 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2110 imag_part, imag_part);
2111 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
2112 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
2114 return result;
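/* For illustration, a sketch of the expansion above: with
   -funsafe-math-optimizations, a hardware sqrt and a block optimized
   for speed,

     double f (_Complex double z) { return cabs (z); }

   is emitted inline as

     r = __real z;  i = __imag z;
     res = sqrt (r*r + i*i);

   trading the library call for two multiplies, an add and a sqrt.  The
   transform is unsafe because the naive formula can overflow in the
   squares where cabs itself would not.  */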
2117 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
2118 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
2119 an optimal number of multiplies, when n is a constant. */
2121 namespace {
2123 const pass_data pass_data_cse_sincos =
2125 GIMPLE_PASS, /* type */
2126 "sincos", /* name */
2127 OPTGROUP_NONE, /* optinfo_flags */
2128 TV_TREE_SINCOS, /* tv_id */
2129 PROP_ssa, /* properties_required */
2130 PROP_gimple_opt_math, /* properties_provided */
2131 0, /* properties_destroyed */
2132 0, /* todo_flags_start */
2133 TODO_update_ssa, /* todo_flags_finish */
2136 class pass_cse_sincos : public gimple_opt_pass
2138 public:
2139 pass_cse_sincos (gcc::context *ctxt)
2140 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
2143 /* opt_pass methods: */
2144 virtual bool gate (function *)
2146 /* We no longer require either sincos or cexp, since powi expansion
2147 piggybacks on this pass. */
2148 return optimize;
2151 virtual unsigned int execute (function *);
2153 }; // class pass_cse_sincos
2155 unsigned int
2156 pass_cse_sincos::execute (function *fun)
2158 basic_block bb;
2159 bool cfg_changed = false;
2161 calculate_dominance_info (CDI_DOMINATORS);
2162 memset (&sincos_stats, 0, sizeof (sincos_stats));
2164 FOR_EACH_BB_FN (bb, fun)
2166 gimple_stmt_iterator gsi;
2167 bool cleanup_eh = false;
2169 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2171 gimple *stmt = gsi_stmt (gsi);
2173 /* Only the last stmt in a bb could throw, no need to call
2174 gimple_purge_dead_eh_edges if we change something in the middle
2175 of a basic block. */
2176 cleanup_eh = false;
2178 if (is_gimple_call (stmt)
2179 && gimple_call_lhs (stmt))
2181 tree arg, arg0, arg1, result;
2182 HOST_WIDE_INT n;
2183 location_t loc;
2185 switch (gimple_call_combined_fn (stmt))
2187 CASE_CFN_COS:
2188 CASE_CFN_SIN:
2189 CASE_CFN_CEXPI:
2190 arg = gimple_call_arg (stmt, 0);
2191 /* Make sure we have either sincos or cexp. */
2192 if (!targetm.libc_has_function (function_c99_math_complex,
2193 TREE_TYPE (arg))
2194 && !targetm.libc_has_function (function_sincos,
2195 TREE_TYPE (arg)))
2196 break;
2198 if (TREE_CODE (arg) == SSA_NAME)
2199 cfg_changed |= execute_cse_sincos_1 (arg);
2200 break;
2202 CASE_CFN_POW:
2203 arg0 = gimple_call_arg (stmt, 0);
2204 arg1 = gimple_call_arg (stmt, 1);
2206 loc = gimple_location (stmt);
2207 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
2209 if (result)
2211 tree lhs = gimple_get_lhs (stmt);
2212 gassign *new_stmt = gimple_build_assign (lhs, result);
2213 gimple_set_location (new_stmt, loc);
2214 unlink_stmt_vdef (stmt);
2215 gsi_replace (&gsi, new_stmt, true);
2216 cleanup_eh = true;
2217 if (gimple_vdef (stmt))
2218 release_ssa_name (gimple_vdef (stmt));
2220 break;
2222 CASE_CFN_POWI:
2223 arg0 = gimple_call_arg (stmt, 0);
2224 arg1 = gimple_call_arg (stmt, 1);
2225 loc = gimple_location (stmt);
2227 if (real_minus_onep (arg0))
2229 tree t0, t1, cond, one, minus_one;
2230 gassign *stmt;
2232 t0 = TREE_TYPE (arg0);
2233 t1 = TREE_TYPE (arg1);
2234 one = build_real (t0, dconst1);
2235 minus_one = build_real (t0, dconstm1);
2237 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
2238 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
2239 arg1, build_int_cst (t1, 1));
2240 gimple_set_location (stmt, loc);
2241 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2243 result = make_temp_ssa_name (t0, NULL, "powi");
2244 stmt = gimple_build_assign (result, COND_EXPR, cond,
2245 minus_one, one);
2246 gimple_set_location (stmt, loc);
2247 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2249 else
2251 if (!tree_fits_shwi_p (arg1))
2252 break;
2254 n = tree_to_shwi (arg1);
2255 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
2258 if (result)
2260 tree lhs = gimple_get_lhs (stmt);
2261 gassign *new_stmt = gimple_build_assign (lhs, result);
2262 gimple_set_location (new_stmt, loc);
2263 unlink_stmt_vdef (stmt);
2264 gsi_replace (&gsi, new_stmt, true);
2265 cleanup_eh = true;
2266 if (gimple_vdef (stmt))
2267 release_ssa_name (gimple_vdef (stmt));
2269 break;
2271 CASE_CFN_CABS:
2272 arg0 = gimple_call_arg (stmt, 0);
2273 loc = gimple_location (stmt);
2274 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
2276 if (result)
2278 tree lhs = gimple_get_lhs (stmt);
2279 gassign *new_stmt = gimple_build_assign (lhs, result);
2280 gimple_set_location (new_stmt, loc);
2281 unlink_stmt_vdef (stmt);
2282 gsi_replace (&gsi, new_stmt, true);
2283 cleanup_eh = true;
2284 if (gimple_vdef (stmt))
2285 release_ssa_name (gimple_vdef (stmt));
2287 break;
2289 default:;
2293 if (cleanup_eh)
2294 cfg_changed |= gimple_purge_dead_eh_edges (bb);
2297 statistics_counter_event (fun, "sincos statements inserted",
2298 sincos_stats.inserted);
2300 return cfg_changed ? TODO_cleanup_cfg : 0;
2303 } // anon namespace
2305 gimple_opt_pass *
2306 make_pass_cse_sincos (gcc::context *ctxt)
2308 return new pass_cse_sincos (ctxt);
2311 /* Return true if stmt is a type conversion operation that can be stripped
2312 when used in a widening multiply operation. */
2313 static bool
2314 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2316 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2318 if (TREE_CODE (result_type) == INTEGER_TYPE)
2320 tree op_type;
2321 tree inner_op_type;
2323 if (!CONVERT_EXPR_CODE_P (rhs_code))
2324 return false;
2326 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2328 /* If the type of OP has the same precision as the result, then
2329 we can strip this conversion. The multiply operation will be
2330 selected to create the correct extension as a by-product. */
2331 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2332 return true;
2334 /* We can also strip a conversion if it preserves the signed-ness of
2335 the operation and doesn't narrow the range. */
2336 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2338 /* If the inner-most type is unsigned, then we can strip any
2339 intermediate widening operation. If it's signed, then the
2340 intermediate widening operation must also be signed. */
2341 if ((TYPE_UNSIGNED (inner_op_type)
2342 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2343 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2344 return true;
2346 return false;
2349 return rhs_code == FIXED_CONVERT_EXPR;
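/* A hypothetical example of the check above: given short a, b and

     _1 = (int) a;
     _2 = (int) b;
     c = _1 * _2;

   both conversions are strippable, because the widening multiply that
   is selected later recreates the extension as a by-product, so the
   narrow a and b can feed the multiply directly.  */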
2352 /* Return true if RHS is a suitable operand for a widening multiplication,
2353 assuming a target type of TYPE.
2354 There are two cases:
2356 - RHS makes some value at least twice as wide. Store that value
2357 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2359 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2360 but leave *TYPE_OUT untouched. */
2362 static bool
2363 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2364 tree *new_rhs_out)
2366 gimple *stmt;
2367 tree type1, rhs1;
2369 if (TREE_CODE (rhs) == SSA_NAME)
2371 stmt = SSA_NAME_DEF_STMT (rhs);
2372 if (is_gimple_assign (stmt))
2374 if (! widening_mult_conversion_strippable_p (type, stmt))
2375 rhs1 = rhs;
2376 else
2378 rhs1 = gimple_assign_rhs1 (stmt);
2380 if (TREE_CODE (rhs1) == INTEGER_CST)
2382 *new_rhs_out = rhs1;
2383 *type_out = NULL;
2384 return true;
2388 else
2389 rhs1 = rhs;
2391 type1 = TREE_TYPE (rhs1);
2393 if (TREE_CODE (type1) != TREE_CODE (type)
2394 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2395 return false;
2397 *new_rhs_out = rhs1;
2398 *type_out = type1;
2399 return true;
2402 if (TREE_CODE (rhs) == INTEGER_CST)
2404 *new_rhs_out = rhs;
2405 *type_out = NULL;
2406 return true;
2409 return false;
2412 /* Return true if STMT performs a widening multiplication, assuming the
2413 output type is TYPE. If so, store the unwidened types of the operands
2414 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2415 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2416 and *TYPE2_OUT would give the operands of the multiplication. */
2418 static bool
2419 is_widening_mult_p (gimple *stmt,
2420 tree *type1_out, tree *rhs1_out,
2421 tree *type2_out, tree *rhs2_out)
2423 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2425 if (TREE_CODE (type) == INTEGER_TYPE)
2427 if (TYPE_OVERFLOW_TRAPS (type))
2428 return false;
2430 else if (TREE_CODE (type) != FIXED_POINT_TYPE)
2431 return false;
2433 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2434 rhs1_out))
2435 return false;
2437 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2438 rhs2_out))
2439 return false;
2441 if (*type1_out == NULL)
2443 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2444 return false;
2445 *type1_out = *type2_out;
2448 if (*type2_out == NULL)
2450 if (!int_fits_type_p (*rhs2_out, *type1_out))
2451 return false;
2452 *type2_out = *type1_out;
2455 /* Ensure that the larger of the two operands comes first. */
2456 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2458 std::swap (*type1_out, *type2_out);
2459 std::swap (*rhs1_out, *rhs2_out);
2462 return true;
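/* As a hypothetical example of what the predicate accepts: for

     long long c = (long long) i * (long long) j;   (int i, j)

   it returns true with *TYPE1_OUT = *TYPE2_OUT = int and *RHS1_OUT = i,
   *RHS2_OUT = j.  A mixed form such as (long long) i * 100 also passes,
   the constant being validated with int_fits_type_p against int.  */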
2465 /* Check to see if the CALL statement is an invocation of copysign
2466 with 1. being the first argument. */
2467 static bool
2468 is_copysign_call_with_1 (gimple *call)
2470 gcall *c = dyn_cast <gcall *> (call);
2471 if (! c)
2472 return false;
2474 enum combined_fn code = gimple_call_combined_fn (c);
2476 if (code == CFN_LAST)
2477 return false;
2479 if (builtin_fn_p (code))
2481 switch (as_builtin_fn (code))
2483 CASE_FLT_FN (BUILT_IN_COPYSIGN):
2484 CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
2485 return real_onep (gimple_call_arg (c, 0));
2486 default:
2487 return false;
2491 if (internal_fn_p (code))
2493 switch (as_internal_fn (code))
2495 case IFN_COPYSIGN:
2496 return real_onep (gimple_call_arg (c, 0));
2497 default:
2498 return false;
2502 return false;
2505 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
2506 This only happens when the xorsign optab is defined. If the
2507 pattern is not a xorsign pattern or if expansion fails, FALSE is
2508 returned; otherwise TRUE is returned. */
2509 static bool
2510 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
2512 tree treeop0, treeop1, lhs, type;
2513 location_t loc = gimple_location (stmt);
2514 lhs = gimple_assign_lhs (stmt);
2515 treeop0 = gimple_assign_rhs1 (stmt);
2516 treeop1 = gimple_assign_rhs2 (stmt);
2517 type = TREE_TYPE (lhs);
2518 machine_mode mode = TYPE_MODE (type);
2520 if (HONOR_SNANS (type))
2521 return false;
2523 if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
2525 gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
2526 if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
2528 call0 = SSA_NAME_DEF_STMT (treeop1);
2529 if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
2530 return false;
2532 treeop1 = treeop0;
2534 if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
2535 return false;
2537 gcall *c = as_a<gcall*> (call0);
2538 treeop0 = gimple_call_arg (c, 1);
2540 gcall *call_stmt
2541 = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
2542 gimple_set_lhs (call_stmt, lhs);
2543 gimple_set_location (call_stmt, loc);
2544 gsi_replace (gsi, call_stmt, true);
2545 return true;
2548 return false;
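/* For illustration, assuming the target defines the xorsign optab:

     z = x * __builtin_copysign (1.0, y);

   becomes z = .XORSIGN (x, y), i.e. x with its sign bit XORed with the
   sign bit of y -- a pure bitwise operation with no multiply.  The
   HONOR_SNANS bail-out above exists because the multiply would quiet a
   signaling NaN operand while the bitwise form would not.  */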
2551 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2552 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2553 value is true iff we converted the statement. */
2555 static bool
2556 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2558 tree lhs, rhs1, rhs2, type, type1, type2;
2559 enum insn_code handler;
2560 scalar_int_mode to_mode, from_mode, actual_mode;
2561 optab op;
2562 int actual_precision;
2563 location_t loc = gimple_location (stmt);
2564 bool from_unsigned1, from_unsigned2;
2566 lhs = gimple_assign_lhs (stmt);
2567 type = TREE_TYPE (lhs);
2568 if (TREE_CODE (type) != INTEGER_TYPE)
2569 return false;
2571 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2572 return false;
2574 to_mode = SCALAR_INT_TYPE_MODE (type);
2575 from_mode = SCALAR_INT_TYPE_MODE (type1);
2576 if (to_mode == from_mode)
2577 return false;
2579 from_unsigned1 = TYPE_UNSIGNED (type1);
2580 from_unsigned2 = TYPE_UNSIGNED (type2);
2582 if (from_unsigned1 && from_unsigned2)
2583 op = umul_widen_optab;
2584 else if (!from_unsigned1 && !from_unsigned2)
2585 op = smul_widen_optab;
2586 else
2587 op = usmul_widen_optab;
2589 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2590 &actual_mode);
2592 if (handler == CODE_FOR_nothing)
2594 if (op != smul_widen_optab)
2596 /* We can use a signed multiply with unsigned types as long as
2597 there is a wider mode to use, or it is the smaller of the two
2598 types that is unsigned. Note that type1 >= type2, always. */
2599 if ((TYPE_UNSIGNED (type1)
2600 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2601 || (TYPE_UNSIGNED (type2)
2602 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2604 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2605 || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2606 return false;
2609 op = smul_widen_optab;
2610 handler = find_widening_optab_handler_and_mode (op, to_mode,
2611 from_mode,
2612 &actual_mode);
2614 if (handler == CODE_FOR_nothing)
2615 return false;
2617 from_unsigned1 = from_unsigned2 = false;
2619 else
2620 return false;
2623 /* Ensure that the inputs to the handler are in the correct precision
2624 for the opcode. This will be the full mode size. */
2625 actual_precision = GET_MODE_PRECISION (actual_mode);
2626 if (2 * actual_precision > TYPE_PRECISION (type))
2627 return false;
2628 if (actual_precision != TYPE_PRECISION (type1)
2629 || from_unsigned1 != TYPE_UNSIGNED (type1))
2630 rhs1 = build_and_insert_cast (gsi, loc,
2631 build_nonstandard_integer_type
2632 (actual_precision, from_unsigned1), rhs1);
2633 if (actual_precision != TYPE_PRECISION (type2)
2634 || from_unsigned2 != TYPE_UNSIGNED (type2))
2635 rhs2 = build_and_insert_cast (gsi, loc,
2636 build_nonstandard_integer_type
2637 (actual_precision, from_unsigned2), rhs2);
2639 /* Handle constants. */
2640 if (TREE_CODE (rhs1) == INTEGER_CST)
2641 rhs1 = fold_convert (type1, rhs1);
2642 if (TREE_CODE (rhs2) == INTEGER_CST)
2643 rhs2 = fold_convert (type2, rhs2);
2645 gimple_assign_set_rhs1 (stmt, rhs1);
2646 gimple_assign_set_rhs2 (stmt, rhs2);
2647 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2648 update_stmt (stmt);
2649 widen_mul_stats.widen_mults_inserted++;
2650 return true;
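/* A sketch of the effect, target permitting: with unsigned short a, b,

     _1 = (unsigned int) a;
     _2 = (unsigned int) b;
     c = _1 * _2;

   is rewritten in place to

     c = a w* b;          (WIDEN_MULT_EXPR via umul_widen_optab)

   When only a signed widening multiply exists, the fallback above
   retries smul_widen_optab in a wider FROM_MODE where that is safe.  */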
2653 /* Process a single gimple statement STMT, which is found at the
2654 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2655 rhs (given by CODE), and try to convert it into a
2656 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2657 is true iff we converted the statement. */
2659 static bool
2660 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
2661 enum tree_code code)
2663 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
2664 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
2665 tree type, type1, type2, optype;
2666 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2667 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2668 optab this_optab;
2669 enum tree_code wmult_code;
2670 enum insn_code handler;
2671 scalar_mode to_mode, from_mode, actual_mode;
2672 location_t loc = gimple_location (stmt);
2673 int actual_precision;
2674 bool from_unsigned1, from_unsigned2;
2676 lhs = gimple_assign_lhs (stmt);
2677 type = TREE_TYPE (lhs);
2678 if (TREE_CODE (type) != INTEGER_TYPE
2679 && TREE_CODE (type) != FIXED_POINT_TYPE)
2680 return false;
2682 if (code == MINUS_EXPR)
2683 wmult_code = WIDEN_MULT_MINUS_EXPR;
2684 else
2685 wmult_code = WIDEN_MULT_PLUS_EXPR;
2687 rhs1 = gimple_assign_rhs1 (stmt);
2688 rhs2 = gimple_assign_rhs2 (stmt);
2690 if (TREE_CODE (rhs1) == SSA_NAME)
2692 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2693 if (is_gimple_assign (rhs1_stmt))
2694 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2697 if (TREE_CODE (rhs2) == SSA_NAME)
2699 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2700 if (is_gimple_assign (rhs2_stmt))
2701 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2704 /* Allow for one conversion statement between the multiply
2705 and addition/subtraction statement. If there are more than
2706 one conversions then we assume they would invalidate this
2707 transformation. If that's not the case then they should have
2708 been folded before now. */
2709 if (CONVERT_EXPR_CODE_P (rhs1_code))
2711 conv1_stmt = rhs1_stmt;
2712 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2713 if (TREE_CODE (rhs1) == SSA_NAME)
2715 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2716 if (is_gimple_assign (rhs1_stmt))
2717 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2719 else
2720 return false;
2722 if (CONVERT_EXPR_CODE_P (rhs2_code))
2724 conv2_stmt = rhs2_stmt;
2725 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2726 if (TREE_CODE (rhs2) == SSA_NAME)
2728 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2729 if (is_gimple_assign (rhs2_stmt))
2730 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2732 else
2733 return false;
2736 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2737 is_widening_mult_p, but we still need the rhs values it returns.
2739 It might also appear that it would be sufficient to use the existing
2740 operands of the widening multiply, but that would limit the choice of
2741 multiply-and-accumulate instructions.
2743 If the widened-multiplication result has more than one use, it is
2744 probably wiser not to do the conversion. Also restrict this operation
2745 to a single basic block to avoid moving the multiply to a different block
2746 with a higher execution frequency. */
2747 if (code == PLUS_EXPR
2748 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2750 if (!has_single_use (rhs1)
2751 || gimple_bb (rhs1_stmt) != gimple_bb (stmt)
2752 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2753 &type2, &mult_rhs2))
2754 return false;
2755 add_rhs = rhs2;
2756 conv_stmt = conv1_stmt;
2758 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2760 if (!has_single_use (rhs2)
2761 || gimple_bb (rhs2_stmt) != gimple_bb (stmt)
2762 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2763 &type2, &mult_rhs2))
2764 return false;
2765 add_rhs = rhs1;
2766 conv_stmt = conv2_stmt;
2768 else
2769 return false;
2771 to_mode = SCALAR_TYPE_MODE (type);
2772 from_mode = SCALAR_TYPE_MODE (type1);
2773 if (to_mode == from_mode)
2774 return false;
2776 from_unsigned1 = TYPE_UNSIGNED (type1);
2777 from_unsigned2 = TYPE_UNSIGNED (type2);
2778 optype = type1;
2780 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2781 if (from_unsigned1 != from_unsigned2)
2783 if (!INTEGRAL_TYPE_P (type))
2784 return false;
2785 /* We can use a signed multiply with unsigned types as long as
2786 there is a wider mode to use, or it is the smaller of the two
2787 types that is unsigned. Note that type1 >= type2, always. */
2788 if ((from_unsigned1
2789 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2790 || (from_unsigned2
2791 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2793 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2794 || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2795 return false;
2798 from_unsigned1 = from_unsigned2 = false;
2799 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2800 false);
2803 /* If there was a conversion between the multiply and addition
2804 then we need to make sure it fits a multiply-and-accumulate.
2805 There should be a single mode change which does not change the
2806 value. */
2807 if (conv_stmt)
2809 /* We use the original, unmodified data types for this. */
2810 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2811 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2812 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2813 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2815 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2817 /* Conversion is a truncate. */
2818 if (TYPE_PRECISION (to_type) < data_size)
2819 return false;
2821 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2823 /* Conversion is an extend. Check it's the right sort. */
2824 if (TYPE_UNSIGNED (from_type) != is_unsigned
2825 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2826 return false;
2828 /* else convert is a no-op for our purposes. */
2831 /* Verify that the machine can perform a widening multiply
2832 accumulate in this mode/signedness combination, otherwise
2833 this transformation is likely to pessimize code. */
2834 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2835 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2836 from_mode, &actual_mode);
2838 if (handler == CODE_FOR_nothing)
2839 return false;
2841 /* Ensure that the inputs to the handler are in the correct precision
2842 for the opcode. This will be the full mode size. */
2843 actual_precision = GET_MODE_PRECISION (actual_mode);
2844 if (actual_precision != TYPE_PRECISION (type1)
2845 || from_unsigned1 != TYPE_UNSIGNED (type1))
2846 mult_rhs1 = build_and_insert_cast (gsi, loc,
2847 build_nonstandard_integer_type
2848 (actual_precision, from_unsigned1),
2849 mult_rhs1);
2850 if (actual_precision != TYPE_PRECISION (type2)
2851 || from_unsigned2 != TYPE_UNSIGNED (type2))
2852 mult_rhs2 = build_and_insert_cast (gsi, loc,
2853 build_nonstandard_integer_type
2854 (actual_precision, from_unsigned2),
2855 mult_rhs2);
2857 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2858 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2860 /* Handle constants. */
2861 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2862 mult_rhs1 = fold_convert (type1, mult_rhs1);
2863 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2864 mult_rhs2 = fold_convert (type2, mult_rhs2);
2866 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2867 add_rhs);
2868 update_stmt (gsi_stmt (*gsi));
2869 widen_mul_stats.maccs_inserted++;
2870 return true;
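/* For example, assuming the target provides a widening
   multiply-accumulate pattern: the pair

     _1 = a w* b;
     acc_2 = _1 + acc_3;

   becomes the single statement

     acc_2 = WIDEN_MULT_PLUS_EXPR <a, b, acc_3>;

   which can map to multiply-accumulate instructions such as AArch64
   smaddl/umaddl.  */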
2873 /* Given a result MUL_RESULT which is a result of a multiplication of OP1 and
2874 OP2 and which we know is used in statements that can be, together with the
2875 multiplication, converted to FMAs, perform the transformation. */
2877 static void
2878 convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
2880 tree type = TREE_TYPE (mul_result);
2881 gimple *use_stmt;
2882 imm_use_iterator imm_iter;
2883 gcall *fma_stmt;
2885 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2887 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2888 tree addop, mulop1 = op1, result = mul_result;
2889 bool negate_p = false;
2890 gimple_seq seq = NULL;
2892 if (is_gimple_debug (use_stmt))
2893 continue;
2895 if (is_gimple_assign (use_stmt)
2896 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
2898 result = gimple_assign_lhs (use_stmt);
2899 use_operand_p use_p;
2900 gimple *neguse_stmt;
2901 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2902 gsi_remove (&gsi, true);
2903 release_defs (use_stmt);
2905 use_stmt = neguse_stmt;
2906 gsi = gsi_for_stmt (use_stmt);
2907 negate_p = true;
2910 tree cond, else_value, ops[3];
2911 tree_code code;
2912 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code,
2913 ops, &else_value))
2914 gcc_unreachable ();
2915 addop = ops[0] == result ? ops[1] : ops[0];
2917 if (code == MINUS_EXPR)
2919 if (ops[0] == result)
2920 /* a * b - c -> a * b + (-c) */
2921 addop = gimple_build (&seq, NEGATE_EXPR, type, addop);
2922 else
2923 /* a - b * c -> (-b) * c + a */
2924 negate_p = !negate_p;
2927 if (negate_p)
2928 mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
2930 if (seq)
2931 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
2933 if (cond)
2934 fma_stmt = gimple_build_call_internal (IFN_COND_FMA, 5, cond, mulop1,
2935 op2, addop, else_value);
2936 else
2937 fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop);
2938 gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt));
2939 gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (cfun,
2940 use_stmt));
2941 gsi_replace (&gsi, fma_stmt, true);
2942 /* Follow all SSA edges so that we generate FMS, FNMA and FNMS
2943 regardless of where the negation occurs. */
2944 gimple *orig_stmt = gsi_stmt (gsi);
2945 if (fold_stmt (&gsi, follow_all_ssa_edges))
2947 if (maybe_clean_or_replace_eh_stmt (orig_stmt, gsi_stmt (gsi)))
2948 gcc_unreachable ();
2949 update_stmt (gsi_stmt (gsi));
2952 if (dump_file && (dump_flags & TDF_DETAILS))
2954 fprintf (dump_file, "Generated FMA ");
2955 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2956 fprintf (dump_file, "\n");
2959 /* If the FMA result is negated in a single use, fold the negation
2960 too. */
2961 orig_stmt = gsi_stmt (gsi);
2962 use_operand_p use_p;
2963 gimple *neg_stmt;
2964 if (is_gimple_call (orig_stmt)
2965 && gimple_call_internal_p (orig_stmt)
2966 && gimple_call_lhs (orig_stmt)
2967 && TREE_CODE (gimple_call_lhs (orig_stmt)) == SSA_NAME
2968 && single_imm_use (gimple_call_lhs (orig_stmt), &use_p, &neg_stmt)
2969 && is_gimple_assign (neg_stmt)
2970 && gimple_assign_rhs_code (neg_stmt) == NEGATE_EXPR
2971 && !stmt_could_throw_p (cfun, neg_stmt))
2973 gsi = gsi_for_stmt (neg_stmt);
2974 if (fold_stmt (&gsi, follow_all_ssa_edges))
2976 if (maybe_clean_or_replace_eh_stmt (neg_stmt, gsi_stmt (gsi)))
2977 gcc_unreachable ();
2978 update_stmt (gsi_stmt (gsi));
2979 if (dump_file && (dump_flags & TDF_DETAILS))
2981 fprintf (dump_file, "Folded FMA negation ");
2982 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2983 fprintf (dump_file, "\n");
2988 widen_mul_stats.fmas_inserted++;
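/* Combined with the negate handling above, the fold_stmt call lets this
   one routine produce all four fused forms (a sketch, target
   permitting):

     a*b + c     ->  .FMA (a, b, c)
     a*b - c     ->  .FMA (a, b, -c)    folded to .FMS  (a, b, c)
     -(a*b) + c  ->  .FMA (-a, b, c)    folded to .FNMA (a, b, c)
     -(a*b) - c  ->  .FMA (-a, b, -c)   folded to .FNMS (a, b, c)  */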
2992 /* Data necessary to perform the actual transformation from a multiplication
2993 and an addition to an FMA once the decision to do it has been taken, and
2994 to then delete the multiplication statement from the function IL. */
2996 struct fma_transformation_info
2998 gimple *mul_stmt;
2999 tree mul_result;
3000 tree op1;
3001 tree op2;
3004 /* Structure containing the current state of FMA deferring, i.e. whether we are
3005 deferring, whether to continue deferring, and all data necessary to come
3006 back and perform all deferred transformations. */
3008 class fma_deferring_state
3010 public:
3011 /* Class constructor. Pass true as PERFORM_DEFERRING in order to actually
3012 do any deferring. */
3014 fma_deferring_state (bool perform_deferring)
3015 : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
3016 m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
3018 /* List of FMA candidates for which the transformation has been determined
3019 to be possible but which, at this point in the BB analysis, we do not
3020 consider beneficial. */
3021 auto_vec<fma_transformation_info, 8> m_candidates;
3023 /* Set of results of multiplication that are part of an already deferred FMA
3024 candidates. */
3025 hash_set<tree> m_mul_result_set;
3027 /* The PHI that supposedly feeds back the result of one FMA to another
3028 over the loop boundary. */
3029 gphi *m_initial_phi;
3031 /* Result of the last produced FMA candidate or NULL if there has not been
3032 one. */
3033 tree m_last_result;
3035 /* If true, deferring might still be profitable. If false, transform all
3036 candidates and no longer defer. */
3037 bool m_deferring_p;
3040 /* Transform all deferred FMA candidates and mark STATE as no longer
3041 deferring. */
3043 static void
3044 cancel_fma_deferring (fma_deferring_state *state)
3046 if (!state->m_deferring_p)
3047 return;
3049 for (unsigned i = 0; i < state->m_candidates.length (); i++)
3051 if (dump_file && (dump_flags & TDF_DETAILS))
3052 fprintf (dump_file, "Generating deferred FMA\n");
3054 const fma_transformation_info &fti = state->m_candidates[i];
3055 convert_mult_to_fma_1 (fti.mul_result, fti.op1, fti.op2);
3057 gimple_stmt_iterator gsi = gsi_for_stmt (fti.mul_stmt);
3058 gsi_remove (&gsi, true);
3059 release_defs (fti.mul_stmt);
3061 state->m_deferring_p = false;
3064 /* If OP is an SSA name defined by a PHI node, return the PHI statement.
3065 Otherwise return NULL. */
3067 static gphi *
3068 result_of_phi (tree op)
3070 if (TREE_CODE (op) != SSA_NAME)
3071 return NULL;
3073 return dyn_cast <gphi *> (SSA_NAME_DEF_STMT (op));
3076 /* After processing statements of a BB and recording STATE, return true if the
3077 initial phi is fed by the last FMA candidate result or one such result from
3078 previously processed BBs marked in LAST_RESULT_SET. */
3080 static bool
3081 last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
3082 hash_set<tree> *last_result_set)
3084 ssa_op_iter iter;
3085 use_operand_p use;
3086 FOR_EACH_PHI_ARG (use, state->m_initial_phi, iter, SSA_OP_USE)
3088 tree t = USE_FROM_PTR (use);
3089 if (t == state->m_last_result
3090 || last_result_set->contains (t))
3091 return true;
3094 return false;
3097 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3098 with uses in additions and subtractions to form fused multiply-add
3099 operations. Returns true if successful and MUL_STMT should be removed.
3100 If MUL_COND is nonnull, the multiplication in MUL_STMT is conditional
3101 on MUL_COND, otherwise it is unconditional.
3103 If STATE indicates that we are deferring FMA transformation, that means
3104 that we do not produce FMAs for basic blocks which look like:
3106 <bb 6>
3107 # accumulator_111 = PHI <0.0(5), accumulator_66(6)>
3108 _65 = _14 * _16;
3109 accumulator_66 = _65 + accumulator_111;
3111 or its unrolled version, i.e. with several FMA candidates that feed result
3112 of one into the addend of another. Instead, we add them to a list in STATE
3113 and if we later discover an FMA candidate that is not part of such a chain,
3114 we go back and perform all deferred past candidates. */
3116 static bool
3117 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
3118 fma_deferring_state *state, tree mul_cond = NULL_TREE)
3120 tree mul_result = gimple_get_lhs (mul_stmt);
3121 tree type = TREE_TYPE (mul_result);
3122 gimple *use_stmt, *neguse_stmt;
3123 use_operand_p use_p;
3124 imm_use_iterator imm_iter;
3126 if (FLOAT_TYPE_P (type)
3127 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3128 return false;
3130 /* We don't want to do bitfield reduction ops. */
3131 if (INTEGRAL_TYPE_P (type)
3132 && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
3133 return false;
3135 /* If the target doesn't support it, don't generate it. We assume that
3136 if fma isn't available then fms, fnma or fnms are not either. */
3137 optimization_type opt_type = bb_optimization_type (gimple_bb (mul_stmt));
3138 if (!direct_internal_fn_supported_p (IFN_FMA, type, opt_type))
3139 return false;
3141 /* If the multiplication has zero uses, it is kept around probably because
3142 of -fnon-call-exceptions. Don't optimize it away in that case;
3143 that is DCE's job. */
3144 if (has_zero_uses (mul_result))
3145 return false;
3147 bool check_defer
3148 = (state->m_deferring_p
3149 && (tree_to_shwi (TYPE_SIZE (type))
3150 <= param_avoid_fma_max_bits));
3151 bool defer = check_defer;
3152 bool seen_negate_p = false;
3153 /* Make sure that the multiplication statement becomes dead after
3154 the transformation, thus that all uses are transformed to FMAs.
3155 This means we assume that an FMA operation has the same cost
3156 as an addition. */
3157 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3159 tree result = mul_result;
3160 bool negate_p = false;
3162 use_stmt = USE_STMT (use_p);
3164 if (is_gimple_debug (use_stmt))
3165 continue;
3167 /* For now restrict this operation to single basic blocks. In theory
3168 we would want to support sinking the multiplication in
3169 m = a*b;
3170 if ()
3171 ma = m + c;
3172 else
3173 d = m;
3174 to form a fma in the then block and sink the multiplication to the
3175 else block. */
3176 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3177 return false;
3179 /* A negate on the multiplication leads to FNMA. */
3180 if (is_gimple_assign (use_stmt)
3181 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
3183 ssa_op_iter iter;
3184 use_operand_p usep;
3186 /* If (due to earlier missed optimizations) we have two
3187 negates of the same value, treat them as equivalent
3188 to a single negate with multiple uses. */
3189 if (seen_negate_p)
3190 return false;
3192 result = gimple_assign_lhs (use_stmt);
3194 /* Make sure the negate statement becomes dead with this
3195 single transformation. */
3196 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3197 &use_p, &neguse_stmt))
3198 return false;
3200 /* Make sure the multiplication isn't also used on that stmt. */
3201 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3202 if (USE_FROM_PTR (usep) == mul_result)
3203 return false;
3205 /* Re-validate. */
3206 use_stmt = neguse_stmt;
3207 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3208 return false;
3210 negate_p = seen_negate_p = true;
3213 tree cond, else_value, ops[3];
3214 tree_code code;
3215 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code, ops,
3216 &else_value))
3217 return false;
3219 switch (code)
3221 case MINUS_EXPR:
3222 if (ops[1] == result)
3223 negate_p = !negate_p;
3224 break;
3225 case PLUS_EXPR:
3226 break;
3227 default:
3228 /* FMA can only be formed from PLUS and MINUS. */
3229 return false;
3232 if (mul_cond && cond != mul_cond)
3233 return false;
3235 if (cond)
3237 if (cond == result || else_value == result)
3238 return false;
3239 if (!direct_internal_fn_supported_p (IFN_COND_FMA, type, opt_type))
3240 return false;
3243 /* If the subtrahend (OPS[1]) is computed by a MULT_EXPR that
3244 we'll visit later, we might be able to get a more profitable
3245 match with fnma.
3246 OTOH, if we don't, a negate / fma pair has likely lower latency
3247 than a mult / subtract pair. */
3248 if (code == MINUS_EXPR
3249 && !negate_p
3250 && ops[0] == result
3251 && !direct_internal_fn_supported_p (IFN_FMS, type, opt_type)
3252 && direct_internal_fn_supported_p (IFN_FNMA, type, opt_type)
3253 && TREE_CODE (ops[1]) == SSA_NAME
3254 && has_single_use (ops[1]))
3256 gimple *stmt2 = SSA_NAME_DEF_STMT (ops[1]);
3257 if (is_gimple_assign (stmt2)
3258 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3259 return false;
3262 /* We can't handle a * b + a * b. */
3263 if (ops[0] == ops[1])
3264 return false;
3265 /* If deferring, make sure we are not looking at an instruction that
3266 wouldn't have existed if we were not. */
3267 if (state->m_deferring_p
3268 && (state->m_mul_result_set.contains (ops[0])
3269 || state->m_mul_result_set.contains (ops[1])))
3270 return false;
3272 if (check_defer)
3274 tree use_lhs = gimple_get_lhs (use_stmt);
3275 if (state->m_last_result)
3277 if (ops[1] == state->m_last_result
3278 || ops[0] == state->m_last_result)
3279 defer = true;
3280 else
3281 defer = false;
3283 else
3285 gcc_checking_assert (!state->m_initial_phi);
3286 gphi *phi;
3287 if (ops[0] == result)
3288 phi = result_of_phi (ops[1]);
3289 else
3291 gcc_assert (ops[1] == result);
3292 phi = result_of_phi (ops[0]);
3295 if (phi)
3297 state->m_initial_phi = phi;
3298 defer = true;
3300 else
3301 defer = false;
3304 state->m_last_result = use_lhs;
3305 check_defer = false;
3307 else
3308 defer = false;
3310 /* While it is possible to validate whether or not the exact form that
3311 we've recognized is available in the backend, the assumption is that
3312 if the deferring logic above did not trigger, the transformation is
3313 never a loss. For instance, suppose the target only has the plain FMA
3314 pattern available. Consider a*b-c -> fma(a,b,-c): we've exchanged
3315 MUL+SUB for FMA+NEG, which is still two operations. Consider
3316 -(a*b)-c -> fma(-a,b,-c): we still have 3 operations, but in the FMA
3317 form the two NEGs are independent and could be run in parallel. */
3320 if (defer)
3322 fma_transformation_info fti;
3323 fti.mul_stmt = mul_stmt;
3324 fti.mul_result = mul_result;
3325 fti.op1 = op1;
3326 fti.op2 = op2;
3327 state->m_candidates.safe_push (fti);
3328 state->m_mul_result_set.add (mul_result);
3330 if (dump_file && (dump_flags & TDF_DETAILS))
3332 fprintf (dump_file, "Deferred generating FMA for multiplication ");
3333 print_gimple_stmt (dump_file, mul_stmt, 0, TDF_NONE);
3334 fprintf (dump_file, "\n");
3337 return false;
3339 else
3341 if (state->m_deferring_p)
3342 cancel_fma_deferring (state);
3343 convert_mult_to_fma_1 (mul_result, op1, op2);
3344 return true;
3349 /* Helper function of match_uaddsub_overflow. Return 1
3350 if USE_STMT is unsigned overflow check ovf != 0 for
3351 STMT, -1 if USE_STMT is unsigned overflow check ovf == 0
3352 and 0 otherwise. */
3354 static int
3355 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3357 enum tree_code ccode = ERROR_MARK;
3358 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3359 if (gimple_code (use_stmt) == GIMPLE_COND)
3361 ccode = gimple_cond_code (use_stmt);
3362 crhs1 = gimple_cond_lhs (use_stmt);
3363 crhs2 = gimple_cond_rhs (use_stmt);
3365 else if (is_gimple_assign (use_stmt))
3367 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3369 ccode = gimple_assign_rhs_code (use_stmt);
3370 crhs1 = gimple_assign_rhs1 (use_stmt);
3371 crhs2 = gimple_assign_rhs2 (use_stmt);
3373 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3375 tree cond = gimple_assign_rhs1 (use_stmt);
3376 if (COMPARISON_CLASS_P (cond))
3378 ccode = TREE_CODE (cond);
3379 crhs1 = TREE_OPERAND (cond, 0);
3380 crhs2 = TREE_OPERAND (cond, 1);
3382 else
3383 return 0;
3385 else
3386 return 0;
3388 else
3389 return 0;
3391 if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3392 return 0;
3394 enum tree_code code = gimple_assign_rhs_code (stmt);
3395 tree lhs = gimple_assign_lhs (stmt);
3396 tree rhs1 = gimple_assign_rhs1 (stmt);
3397 tree rhs2 = gimple_assign_rhs2 (stmt);
3399 switch (ccode)
3401 case GT_EXPR:
3402 case LE_EXPR:
3403 /* r = a - b; r > a or r <= a
3404 r = a + b; a > r or a <= r or b > r or b <= r. */
3405 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3406 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3407 && crhs2 == lhs))
3408 return ccode == GT_EXPR ? 1 : -1;
3409 break;
3410 case LT_EXPR:
3411 case GE_EXPR:
3412 /* r = a - b; a < r or a >= r
3413 r = a + b; r < a or r >= a or r < b or r >= b. */
3414 if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3415 || (code == PLUS_EXPR && crhs1 == lhs
3416 && (crhs2 == rhs1 || crhs2 == rhs2)))
3417 return ccode == LT_EXPR ? 1 : -1;
3418 break;
3419 default:
3420 break;
3422 return 0;
3425 /* Recognize for unsigned x
3426 x = y - z;
3427 if (x > y)
3428 where there are other uses of x and replace it with
3429 _7 = SUB_OVERFLOW (y, z);
3430 x = REALPART_EXPR <_7>;
3431 _8 = IMAGPART_EXPR <_7>;
3432 if (_8)
3433 and similarly for addition. */
3435 static bool
3436 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3437 enum tree_code code)
3439 tree lhs = gimple_assign_lhs (stmt);
3440 tree type = TREE_TYPE (lhs);
3441 use_operand_p use_p;
3442 imm_use_iterator iter;
3443 bool use_seen = false;
3444 bool ovf_use_seen = false;
3445 gimple *use_stmt;
3447 gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3448 if (!INTEGRAL_TYPE_P (type)
3449 || !TYPE_UNSIGNED (type)
3450 || has_zero_uses (lhs)
3451 || has_single_use (lhs)
3452 || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3453 TYPE_MODE (type)) == CODE_FOR_nothing)
3454 return false;
3456 FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3458 use_stmt = USE_STMT (use_p);
3459 if (is_gimple_debug (use_stmt))
3460 continue;
3462 if (uaddsub_overflow_check_p (stmt, use_stmt))
3463 ovf_use_seen = true;
3464 else
3465 use_seen = true;
3466 if (ovf_use_seen && use_seen)
3467 break;
3470 if (!ovf_use_seen || !use_seen)
3471 return false;
3473 tree ctype = build_complex_type (type);
3474 tree rhs1 = gimple_assign_rhs1 (stmt);
3475 tree rhs2 = gimple_assign_rhs2 (stmt);
3476 gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3477 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3478 2, rhs1, rhs2);
3479 tree ctmp = make_ssa_name (ctype);
3480 gimple_call_set_lhs (g, ctmp);
3481 gsi_insert_before (gsi, g, GSI_SAME_STMT);
3482 gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3483 build1 (REALPART_EXPR, type, ctmp));
3484 gsi_replace (gsi, g2, true);
3485 tree ovf = make_ssa_name (type);
3486 g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3487 build1 (IMAGPART_EXPR, type, ctmp));
3488 gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3490 FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3492 if (is_gimple_debug (use_stmt))
3493 continue;
3495 int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3496 if (ovf_use == 0)
3497 continue;
3498 if (gimple_code (use_stmt) == GIMPLE_COND)
3500 gcond *cond_stmt = as_a <gcond *> (use_stmt);
3501 gimple_cond_set_lhs (cond_stmt, ovf);
3502 gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3503 gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3505 else
3507 gcc_checking_assert (is_gimple_assign (use_stmt));
3508 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3510 gimple_assign_set_rhs1 (use_stmt, ovf);
3511 gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3512 gimple_assign_set_rhs_code (use_stmt,
3513 ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3515 else
3517 gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3518 == COND_EXPR);
3519 tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3520 boolean_type_node, ovf,
3521 build_int_cst (type, 0));
3522 gimple_assign_set_rhs1 (use_stmt, cond);
3525 update_stmt (use_stmt);
3527 return true;
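/* A concrete C-level trigger for the above (hypothetical, assuming the
   target implements usubv4):

     int f (unsigned y, unsigned z, unsigned *out)
     {
       unsigned x = y - z;
       *out = x;               ordinary use of x
       return x > y;           overflow-check use of x
     }

   Both kinds of use must be present, as tested above; with only the
   comparison the statement has a single use and is left alone.  */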
3530 /* Return true if target has support for divmod. */
3532 static bool
3533 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3535 /* If target supports hardware divmod insn, use it for divmod. */
3536 if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3537 return true;
3539 /* Check if libfunc for divmod is available. */
3540 rtx libfunc = optab_libfunc (divmod_optab, mode);
3541 if (libfunc != NULL_RTX)
3543 /* If optab_handler exists for div_optab, perhaps in a wider mode,
3544 we don't want to use the libfunc even if it exists for the given mode. */
3545 machine_mode div_mode;
3546 FOR_EACH_MODE_FROM (div_mode, mode)
3547 if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3548 return false;
3550 return targetm.expand_divmod_libfunc != NULL;
3553 return false;
3556 /* Check if stmt is candidate for divmod transform. */
3558 static bool
3559 divmod_candidate_p (gassign *stmt)
3561 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3562 machine_mode mode = TYPE_MODE (type);
3563 optab divmod_optab, div_optab;
3565 if (TYPE_UNSIGNED (type))
3567 divmod_optab = udivmod_optab;
3568 div_optab = udiv_optab;
3570 else
3572 divmod_optab = sdivmod_optab;
3573 div_optab = sdiv_optab;
3576 tree op1 = gimple_assign_rhs1 (stmt);
3577 tree op2 = gimple_assign_rhs2 (stmt);
3579 /* Disable the transform if either operand is a constant, since
3580 division-by-constant may have specialized expansion. */
3581 if (CONSTANT_CLASS_P (op1))
3582 return false;
3584 if (CONSTANT_CLASS_P (op2))
3586 if (integer_pow2p (op2))
3587 return false;
3589 if (TYPE_PRECISION (type) <= HOST_BITS_PER_WIDE_INT
3590 && TYPE_PRECISION (type) <= BITS_PER_WORD)
3591 return false;
3593 /* If the divisor is not a power of 2 and the precision is wider than
3594 HWI, expand_divmod punts on that, so in that case it is better
3595 to use the divmod optab or libfunc. Similarly if choose_multiplier
3596 might need pre/post shifts of BITS_PER_WORD or more. */
3599 /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3600 expand using the [su]divv optabs. */
3601 if (TYPE_OVERFLOW_TRAPS (type))
3602 return false;
3604 if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3605 return false;
3607 return true;
3610 /* This function looks for:
3611 t1 = a TRUNC_DIV_EXPR b;
3612 t2 = a TRUNC_MOD_EXPR b;
3613 and transforms it to the following sequence:
3614 complex_tmp = DIVMOD (a, b);
3615 t1 = REALPART_EXPR(complex_tmp);
3616 t2 = IMAGPART_EXPR(complex_tmp);
3617 For conditions enabling the transform see divmod_candidate_p().
3619 The pass has three parts:
3620 1) Find top_stmt, which is a trunc_div or trunc_mod stmt that dominates all
3621 other trunc_div_expr and trunc_mod_expr stmts.
3622 2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
3623 to the stmts vector.
3624 3) Insert the DIVMOD call just before top_stmt and update entries in the
3625 stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
3626 IMAGPART_EXPR for mod). */
3628 static bool
3629 convert_to_divmod (gassign *stmt)
3631 if (stmt_can_throw_internal (cfun, stmt)
3632 || !divmod_candidate_p (stmt))
3633 return false;
3635 tree op1 = gimple_assign_rhs1 (stmt);
3636 tree op2 = gimple_assign_rhs2 (stmt);
3638 imm_use_iterator use_iter;
3639 gimple *use_stmt;
3640 auto_vec<gimple *> stmts;
3642 gimple *top_stmt = stmt;
3643 basic_block top_bb = gimple_bb (stmt);
3645 /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
3646 at least stmt and possibly other trunc_div/trunc_mod stmts
3647 having same operands as stmt. */
3649 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
3651 if (is_gimple_assign (use_stmt)
3652 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3653 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3654 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
3655 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
3657 if (stmt_can_throw_internal (cfun, use_stmt))
3658 continue;
3660 basic_block bb = gimple_bb (use_stmt);
3662 if (bb == top_bb)
3664 if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
3665 top_stmt = use_stmt;
3667 else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
3669 top_bb = bb;
3670 top_stmt = use_stmt;
3675 tree top_op1 = gimple_assign_rhs1 (top_stmt);
3676 tree top_op2 = gimple_assign_rhs2 (top_stmt);
3678 stmts.safe_push (top_stmt);
3679 bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
3681 /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
3682 to the stmts vector. The 2nd loop will always add stmt to the stmts
3683 vector, since gimple_bb (top_stmt) dominates gimple_bb (stmt), so the
3684 2nd loop ends up adding at least a single trunc_mod_expr stmt. */
3686 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
3688 if (is_gimple_assign (use_stmt)
3689 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3690 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3691 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
3692 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
3694 if (use_stmt == top_stmt
3695 || stmt_can_throw_internal (cfun, use_stmt)
3696 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
3697 continue;
3699 stmts.safe_push (use_stmt);
3700 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
3701 div_seen = true;
3705 if (!div_seen)
3706 return false;
3708 /* Part 3: Create a call to the internal fn DIVMOD:
3709 divmod_tmp = DIVMOD (op1, op2). */
3711 gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
3712 tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
3713 call_stmt, "divmod_tmp");
3714 gimple_call_set_lhs (call_stmt, res);
3715 /* We rejected throwing statements above. */
3716 gimple_call_set_nothrow (call_stmt, true);
3718 /* Insert the call before top_stmt. */
3719 gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
3720 gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
3722 widen_mul_stats.divmod_calls_inserted++;
3724 /* Update all statements in stmts vector:
3725 lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
3726 lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>. */
3728 for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
3730 tree new_rhs;
3732 switch (gimple_assign_rhs_code (use_stmt))
3734 case TRUNC_DIV_EXPR:
3735 new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
3736 break;
3738 case TRUNC_MOD_EXPR:
3739 new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
3740 break;
3742 default:
3743 gcc_unreachable ();
3746 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3747 gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
3748 update_stmt (use_stmt);
3751 return true;
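/* A concrete C-level trigger (hypothetical input):

     void f (int n, int d, int *q, int *r)
     {
       *q = n / d;
       *r = n % d;
     }

   With non-constant operands, the division dominating the modulo is
   replaced by a DIVMOD internal call whose REALPART_EXPR/IMAGPART_EXPR
   feed both stores, so targets with a combined divide/modulo
   instruction or libfunc compute the pair in one operation.  */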
3754 /* Find integer multiplications where the operands are extended from
3755 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3756 where appropriate. */

namespace {

const pass_data pass_data_optimize_widening_mul =
{
  GIMPLE_PASS, /* type */
  "widening_mul", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_TREE_WIDEN_MUL, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_optimize_widening_mul : public gimple_opt_pass
{
public:
  pass_optimize_widening_mul (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_expensive_optimizations && optimize;
    }

  virtual unsigned int execute (function *);

}; // class pass_optimize_widening_mul

/* Walker class to perform the transformation in reverse dominance order.  */

class math_opts_dom_walker : public dom_walker
{
public:
  /* Constructor, CFG_CHANGED is a pointer to a boolean flag that will be set
     if walking modifies the CFG.  */

  math_opts_dom_walker (bool *cfg_changed_p)
    : dom_walker (CDI_DOMINATORS), m_last_result_set (),
      m_cfg_changed_p (cfg_changed_p) {}

  /* The actual actions performed in the walk.  */

  virtual void after_dom_children (basic_block);

  /* Set of results of chains of multiply and add statement combinations that
     were not transformed into FMAs because of active deferring.  */
  hash_set<tree> m_last_result_set;

  /* Pointer to a flag of the user that needs to be set if CFG has been
     modified.  */
  bool *m_cfg_changed_p;
};

void
math_opts_dom_walker::after_dom_children (basic_block bb)
{
  gimple_stmt_iterator gsi;

  fma_deferring_state fma_state (param_avoid_fma_max_bits > 0);

  for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
    {
      gimple *stmt = gsi_stmt (gsi);
      enum tree_code code;

      if (is_gimple_assign (stmt))
        {
          code = gimple_assign_rhs_code (stmt);
          switch (code)
            {
            case MULT_EXPR:
              if (!convert_mult_to_widen (stmt, &gsi)
                  && !convert_expand_mult_copysign (stmt, &gsi)
                  && convert_mult_to_fma (stmt,
                                          gimple_assign_rhs1 (stmt),
                                          gimple_assign_rhs2 (stmt),
                                          &fma_state))
                {
                  gsi_remove (&gsi, true);
                  release_defs (stmt);
                  continue;
                }
              break;

            case PLUS_EXPR:
            case MINUS_EXPR:
              if (!convert_plusminus_to_widen (&gsi, stmt, code))
                match_uaddsub_overflow (&gsi, stmt, code);
              break;

            case TRUNC_MOD_EXPR:
              convert_to_divmod (as_a<gassign *> (stmt));
              break;

            default:;
            }
        }
      else if (is_gimple_call (stmt))
        {
          switch (gimple_call_combined_fn (stmt))
            {
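            /* pow (x, 2.0) computes x * x, so it can feed an FMA just
               like an explicit multiplication would.  */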
            CASE_CFN_POW:
              if (gimple_call_lhs (stmt)
                  && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
                  && real_equal (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
                                 &dconst2)
                  && convert_mult_to_fma (stmt,
                                          gimple_call_arg (stmt, 0),
                                          gimple_call_arg (stmt, 0),
                                          &fma_state))
                {
                  unlink_stmt_vdef (stmt);
                  if (gsi_remove (&gsi, true)
                      && gimple_purge_dead_eh_edges (bb))
                    *m_cfg_changed_p = true;
                  release_defs (stmt);
                  continue;
                }
              break;
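
            /* A conditional multiplication (IFN_COND_MUL) can likewise
               be fused into a conditional FMA; the mask operand is
               passed through to convert_mult_to_fma.  */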
            case CFN_COND_MUL:
              if (convert_mult_to_fma (stmt,
                                       gimple_call_arg (stmt, 1),
                                       gimple_call_arg (stmt, 2),
                                       &fma_state,
                                       gimple_call_arg (stmt, 0)))
                {
                  gsi_remove (&gsi, true);
                  release_defs (stmt);
                  continue;
                }
              break;
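
            /* A call that does not map to any combined function ends the
               current chain of deferred multiply-add candidates.  */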
            case CFN_LAST:
              cancel_fma_deferring (&fma_state);
              break;

            default:
              break;
            }
        }
      gsi_next (&gsi);
    }

  if (fma_state.m_deferring_p
      && fma_state.m_initial_phi)
    {
      gcc_checking_assert (fma_state.m_last_result);
      if (!last_fma_candidate_feeds_initial_phi (&fma_state,
                                                 &m_last_result_set))
        cancel_fma_deferring (&fma_state);
      else
        m_last_result_set.add (fma_state.m_last_result);
    }
}

unsigned int
pass_optimize_widening_mul::execute (function *fun)
{
  bool cfg_changed = false;

  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
  calculate_dominance_info (CDI_DOMINATORS);
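  /* Statement uids give an ordering within each block; convert_to_divmod
     uses them to pick the topmost div/mod statement in a block.  */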
  renumber_gimple_stmt_uids (cfun);

  math_opts_dom_walker (&cfg_changed).walk (ENTRY_BLOCK_PTR_FOR_FN (cfun));

  statistics_counter_event (fun, "widening multiplications inserted",
                            widen_mul_stats.widen_mults_inserted);
  statistics_counter_event (fun, "widening maccs inserted",
                            widen_mul_stats.maccs_inserted);
  statistics_counter_event (fun, "fused multiply-adds inserted",
                            widen_mul_stats.fmas_inserted);
  statistics_counter_event (fun, "divmod calls inserted",
                            widen_mul_stats.divmod_calls_inserted);

  return cfg_changed ? TODO_cleanup_cfg : 0;
}

} // anon namespace

gimple_opt_pass *
make_pass_optimize_widening_mul (gcc::context *ctxt)
{
  return new pass_optimize_widening_mul (ctxt);
}