gcc/tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2020 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
 20 /* The first mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
 43 First of all, experiments showed that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
 59 that has fewer than N divisions *post-dominating* it.
 61 The algorithm constructs a subset of the dominator tree, holding the
 62 blocks containing the divisions and their common dominators,
 63 and walks it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
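   As a small worked illustration (a sketch of the bookkeeping, not taken
   from the sources): assume N = 2 and a chain of blocks B1 -> B2 -> B3
   where B2 and B3 each contain one division by the same SSA name d, and
   B3 post-dominates B2.  The subset of the dominator tree holds B2 with
   child B3.  The post-order walk annotates B3 with one division and B2
   with two (its own plus B3's, which post-dominates it), ignoring the
   internal weighting by two used by the implementation.  Since 2 >= N,
   the pre-order walk inserts recip = 1.0 / d in B2 and both divisions
   become multiplications by recip.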
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "internal-fn.h"
113 #include "case-cfn-macros.h"
114 #include "optabs-libfuncs.h"
115 #include "tree-eh.h"
116 #include "targhooks.h"
117 #include "domwalk.h"
119 /* This structure represents one basic block that either computes a
 120 division, or is a common dominator for basic blocks that compute a
121 division. */
122 struct occurrence {
123 /* The basic block represented by this structure. */
124 basic_block bb = basic_block();
126 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
127 inserted in BB. */
128 tree recip_def = tree();
130 /* If non-NULL, the SSA_NAME holding the definition for a squared
131 reciprocal inserted in BB. */
132 tree square_recip_def = tree();
134 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
135 was inserted in BB. */
136 gimple *recip_def_stmt = nullptr;
138 /* Pointer to a list of "struct occurrence"s for blocks dominated
139 by BB. */
140 struct occurrence *children = nullptr;
 142 /* Pointer to the next "struct occurrence" in the list of blocks
143 sharing a common dominator. */
144 struct occurrence *next = nullptr;
146 /* The number of divisions that are in BB before compute_merit. The
147 number of divisions that are in BB or post-dominate it after
148 compute_merit. */
149 int num_divisions = 0;
151 /* True if the basic block has a division, false if it is a common
152 dominator for basic blocks that do. If it is false and trapping
153 math is active, BB is not a candidate for inserting a reciprocal. */
154 bool bb_has_division = false;
156 /* Construct a struct occurrence for basic block BB, and whose
157 children list is headed by CHILDREN. */
158 occurrence (basic_block bb, struct occurrence *children)
159 : bb (bb), children (children)
161 bb->aux = this;
164 /* Destroy a struct occurrence and remove it from its basic block. */
165 ~occurrence ()
167 bb->aux = nullptr;
170 /* Allocate memory for a struct occurrence from OCC_POOL. */
171 static void* operator new (size_t);
173 /* Return memory for a struct occurrence to OCC_POOL. */
174 static void operator delete (void*, size_t);
177 static struct
179 /* Number of 1.0/X ops inserted. */
180 int rdivs_inserted;
182 /* Number of 1.0/FUNC ops inserted. */
183 int rfuncs_inserted;
184 } reciprocal_stats;
186 static struct
188 /* Number of cexpi calls inserted. */
189 int inserted;
190 } sincos_stats;
192 static struct
194 /* Number of widening multiplication ops inserted. */
195 int widen_mults_inserted;
197 /* Number of integer multiply-and-accumulate ops inserted. */
198 int maccs_inserted;
200 /* Number of fp fused multiply-add ops inserted. */
201 int fmas_inserted;
203 /* Number of divmod calls inserted. */
204 int divmod_calls_inserted;
205 } widen_mul_stats;
207 /* The instance of "struct occurrence" representing the highest
208 interesting block in the dominator tree. */
209 static struct occurrence *occ_head;
211 /* Allocation pool for getting instances of "struct occurrence". */
212 static object_allocator<occurrence> *occ_pool;
214 void* occurrence::operator new (size_t n)
216 gcc_assert (n == sizeof(occurrence));
217 return occ_pool->allocate_raw ();
220 void occurrence::operator delete (void *occ, size_t n)
222 gcc_assert (n == sizeof(occurrence));
223 occ_pool->remove_raw (occ);
226 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
227 list of "struct occurrence"s, one per basic block, having IDOM as
228 their common dominator.
230 We try to insert NEW_OCC as deep as possible in the tree, and we also
231 insert any other block that is a common dominator for BB and one
232 block already in the tree. */
234 static void
235 insert_bb (struct occurrence *new_occ, basic_block idom,
236 struct occurrence **p_head)
238 struct occurrence *occ, **p_occ;
240 for (p_occ = p_head; (occ = *p_occ) != NULL; )
242 basic_block bb = new_occ->bb, occ_bb = occ->bb;
243 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
244 if (dom == bb)
246 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
247 from its list. */
248 *p_occ = occ->next;
249 occ->next = new_occ->children;
250 new_occ->children = occ;
252 /* Try the next block (it may as well be dominated by BB). */
255 else if (dom == occ_bb)
257 /* OCC_BB dominates BB. Tail recurse to look deeper. */
258 insert_bb (new_occ, dom, &occ->children);
259 return;
262 else if (dom != idom)
264 gcc_assert (!dom->aux);
266 /* There is a dominator between IDOM and BB, add it and make
267 two children out of NEW_OCC and OCC. First, remove OCC from
268 its list. */
269 *p_occ = occ->next;
270 new_occ->next = occ;
271 occ->next = NULL;
273 /* None of the previous blocks has DOM as a dominator: if we tail
274 recursed, we would reexamine them uselessly. Just switch BB with
275 DOM, and go on looking for blocks dominated by DOM. */
276 new_occ = new occurrence (dom, new_occ);
279 else
281 /* Nothing special, go on with the next element. */
282 p_occ = &occ->next;
286 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
287 new_occ->next = *p_head;
288 *p_head = new_occ;
291 /* Register that we found a division in BB.
292 IMPORTANCE is a measure of how much weighting to give
293 that division. Use IMPORTANCE = 2 to register a single
294 division. If the division is going to be found multiple
295 times use 1 (as it is with squares). */
297 static inline void
298 register_division_in (basic_block bb, int importance)
300 struct occurrence *occ;
302 occ = (struct occurrence *) bb->aux;
303 if (!occ)
305 occ = new occurrence (bb, NULL);
306 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
309 occ->bb_has_division = true;
310 occ->num_divisions += importance;
314 /* Compute the number of divisions that postdominate each block in OCC and
315 its children. */
317 static void
318 compute_merit (struct occurrence *occ)
320 struct occurrence *occ_child;
321 basic_block dom = occ->bb;
323 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
325 basic_block bb;
326 if (occ_child->children)
327 compute_merit (occ_child);
329 if (flag_exceptions)
330 bb = single_noncomplex_succ (dom);
331 else
332 bb = dom;
334 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
335 occ->num_divisions += occ_child->num_divisions;
340 /* Return whether USE_STMT is a floating-point division by DEF. */
341 static inline bool
342 is_division_by (gimple *use_stmt, tree def)
344 return is_gimple_assign (use_stmt)
345 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
346 && gimple_assign_rhs2 (use_stmt) == def
347 /* Do not recognize x / x as valid division, as we are getting
 348 confused later by replacing all immediate uses of x in such
349 a stmt. */
350 && gimple_assign_rhs1 (use_stmt) != def
351 && !stmt_can_throw_internal (cfun, use_stmt);
354 /* Return TRUE if USE_STMT is a multiplication of DEF by A. */
355 static inline bool
356 is_mult_by (gimple *use_stmt, tree def, tree a)
358 if (gimple_code (use_stmt) == GIMPLE_ASSIGN
359 && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
361 tree op0 = gimple_assign_rhs1 (use_stmt);
362 tree op1 = gimple_assign_rhs2 (use_stmt);
364 return (op0 == def && op1 == a)
365 || (op0 == a && op1 == def);
367 return 0;
370 /* Return whether USE_STMT is DEF * DEF. */
371 static inline bool
372 is_square_of (gimple *use_stmt, tree def)
374 return is_mult_by (use_stmt, def, def);
377 /* Return whether USE_STMT is a floating-point division by
378 DEF * DEF. */
379 static inline bool
380 is_division_by_square (gimple *use_stmt, tree def)
382 if (gimple_code (use_stmt) == GIMPLE_ASSIGN
383 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
384 && gimple_assign_rhs1 (use_stmt) != gimple_assign_rhs2 (use_stmt)
385 && !stmt_can_throw_internal (cfun, use_stmt))
387 tree denominator = gimple_assign_rhs2 (use_stmt);
388 if (TREE_CODE (denominator) == SSA_NAME)
389 return is_square_of (SSA_NAME_DEF_STMT (denominator), def);
391 return 0;
394 /* Walk the subset of the dominator tree rooted at OCC, setting the
395 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
396 the given basic block. The field may be left NULL, of course,
397 if it is not possible or profitable to do the optimization.
399 DEF_BSI is an iterator pointing at the statement defining DEF.
400 If RECIP_DEF is set, a dominator already has a computation that can
401 be used.
403 If should_insert_square_recip is set, then this also inserts
404 the square of the reciprocal immediately after the definition
405 of the reciprocal. */
407 static void
408 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
409 tree def, tree recip_def, tree square_recip_def,
410 int should_insert_square_recip, int threshold)
412 tree type;
413 gassign *new_stmt, *new_square_stmt;
414 gimple_stmt_iterator gsi;
415 struct occurrence *occ_child;
417 if (!recip_def
418 && (occ->bb_has_division || !flag_trapping_math)
419 /* Divide by two as all divisions are counted twice in
420 the costing loop. */
421 && occ->num_divisions / 2 >= threshold)
423 /* Make a variable with the replacement and substitute it. */
424 type = TREE_TYPE (def);
425 recip_def = create_tmp_reg (type, "reciptmp");
426 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
427 build_one_cst (type), def);
429 if (should_insert_square_recip)
431 square_recip_def = create_tmp_reg (type, "powmult_reciptmp");
432 new_square_stmt = gimple_build_assign (square_recip_def, MULT_EXPR,
433 recip_def, recip_def);
436 if (occ->bb_has_division)
438 /* Case 1: insert before an existing division. */
439 gsi = gsi_after_labels (occ->bb);
440 while (!gsi_end_p (gsi)
441 && (!is_division_by (gsi_stmt (gsi), def))
442 && (!is_division_by_square (gsi_stmt (gsi), def)))
443 gsi_next (&gsi);
445 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
446 if (should_insert_square_recip)
447 gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
449 else if (def_gsi && occ->bb == gsi_bb (*def_gsi))
451 /* Case 2: insert right after the definition. Note that this will
452 never happen if the definition statement can throw, because in
453 that case the sole successor of the statement's basic block will
454 dominate all the uses as well. */
455 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
456 if (should_insert_square_recip)
457 gsi_insert_after (def_gsi, new_square_stmt, GSI_NEW_STMT);
459 else
461 /* Case 3: insert in a basic block not containing defs/uses. */
462 gsi = gsi_after_labels (occ->bb);
463 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
464 if (should_insert_square_recip)
465 gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
468 reciprocal_stats.rdivs_inserted++;
470 occ->recip_def_stmt = new_stmt;
473 occ->recip_def = recip_def;
474 occ->square_recip_def = square_recip_def;
475 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
476 insert_reciprocals (def_gsi, occ_child, def, recip_def,
477 square_recip_def, should_insert_square_recip,
478 threshold);
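/* As an illustrative sketch (hand-written, not literal pass output), with
   threshold 2 and two divisions by d_3 in the same block, case 1 above
   turns

     x_1 = a_2 / d_3;
     y_4 = b_5 / d_3;

   into

     reciptmp_6 = 1.0 / d_3;
     x_1 = a_2 * reciptmp_6;
     y_4 = b_5 * reciptmp_6;

   with the reciprocal inserted right before the first division; the
   divisions themselves are rewritten later by replace_reciprocal.  The
   SSA version numbers here are made up for the example.  */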
481 /* Replace occurrences of expr / (x * x) with expr * ((1 / x) * (1 / x)).
482 Take as argument the use for (x * x). */
483 static inline void
484 replace_reciprocal_squares (use_operand_p use_p)
486 gimple *use_stmt = USE_STMT (use_p);
487 basic_block bb = gimple_bb (use_stmt);
488 struct occurrence *occ = (struct occurrence *) bb->aux;
490 if (optimize_bb_for_speed_p (bb) && occ->square_recip_def
491 && occ->recip_def)
493 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
494 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
495 gimple_assign_set_rhs2 (use_stmt, occ->square_recip_def);
496 SET_USE (use_p, occ->square_recip_def);
497 fold_stmt_inplace (&gsi);
498 update_stmt (use_stmt);
503 /* Replace the division at USE_P with a multiplication by the reciprocal, if
504 possible. */
506 static inline void
507 replace_reciprocal (use_operand_p use_p)
509 gimple *use_stmt = USE_STMT (use_p);
510 basic_block bb = gimple_bb (use_stmt);
511 struct occurrence *occ = (struct occurrence *) bb->aux;
513 if (optimize_bb_for_speed_p (bb)
514 && occ->recip_def && use_stmt != occ->recip_def_stmt)
516 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
517 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
518 SET_USE (use_p, occ->recip_def);
519 fold_stmt_inplace (&gsi);
520 update_stmt (use_stmt);
525 /* Free OCC and return one more "struct occurrence" to be freed. */
527 static struct occurrence *
528 free_bb (struct occurrence *occ)
530 struct occurrence *child, *next;
532 /* First get the two pointers hanging off OCC. */
533 next = occ->next;
534 child = occ->children;
535 delete occ;
537 /* Now ensure that we don't recurse unless it is necessary. */
538 if (!child)
539 return next;
540 else
542 while (next)
543 next = free_bb (next);
545 return child;
549 /* Transform sequences like
550 t = sqrt (a)
551 x = 1.0 / t;
552 r1 = x * x;
553 r2 = a * x;
554 into:
555 t = sqrt (a)
556 r1 = 1.0 / a;
557 r2 = t;
558 x = r1 * r2;
559 depending on the uses of x, r1, r2. This removes one multiplication and
560 allows the sqrt and division operations to execute in parallel.
561 DEF_GSI is the gsi of the initial division by sqrt that defines
562 DEF (x in the example above). */
564 static void
565 optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
567 gimple *use_stmt;
568 imm_use_iterator use_iter;
569 gimple *stmt = gsi_stmt (*def_gsi);
570 tree x = def;
571 tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
572 tree div_rhs1 = gimple_assign_rhs1 (stmt);
574 if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
575 || TREE_CODE (div_rhs1) != REAL_CST
576 || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
577 return;
579 gcall *sqrt_stmt
580 = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
582 if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
583 return;
585 switch (gimple_call_combined_fn (sqrt_stmt))
587 CASE_CFN_SQRT:
588 CASE_CFN_SQRT_FN:
589 break;
591 default:
592 return;
594 tree a = gimple_call_arg (sqrt_stmt, 0);
596 /* We have 'a' and 'x'. Now analyze the uses of 'x'. */
598 /* Statements that use x in x * x. */
599 auto_vec<gimple *> sqr_stmts;
600 /* Statements that use x in a * x. */
601 auto_vec<gimple *> mult_stmts;
602 bool has_other_use = false;
603 bool mult_on_main_path = false;
605 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
607 if (is_gimple_debug (use_stmt))
608 continue;
609 if (is_square_of (use_stmt, x))
611 sqr_stmts.safe_push (use_stmt);
612 if (gimple_bb (use_stmt) == gimple_bb (stmt))
613 mult_on_main_path = true;
615 else if (is_mult_by (use_stmt, x, a))
617 mult_stmts.safe_push (use_stmt);
618 if (gimple_bb (use_stmt) == gimple_bb (stmt))
619 mult_on_main_path = true;
621 else
622 has_other_use = true;
625 /* In the x * x and a * x cases we just rewire stmt operands or
626 remove multiplications. In the has_other_use case we introduce
627 a multiplication so make sure we don't introduce a multiplication
628 on a path where there was none. */
629 if (has_other_use && !mult_on_main_path)
630 return;
632 if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
633 return;
635 /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
636 to be able to compose it from the sqr and mult cases. */
637 if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
638 return;
640 if (dump_file)
642 fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
643 print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
644 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
645 fprintf (dump_file, "\n");
648 bool delete_div = !has_other_use;
649 tree sqr_ssa_name = NULL_TREE;
650 if (!sqr_stmts.is_empty ())
652 /* r1 = x * x. Transform the original
653 x = 1.0 / t
654 into
655 tmp1 = 1.0 / a
656 r1 = tmp1. */
658 sqr_ssa_name
659 = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
661 if (dump_file)
663 fprintf (dump_file, "Replacing original division\n");
664 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
665 fprintf (dump_file, "with new division\n");
667 stmt
668 = gimple_build_assign (sqr_ssa_name, gimple_assign_rhs_code (stmt),
669 gimple_assign_rhs1 (stmt), a);
670 gsi_insert_before (def_gsi, stmt, GSI_SAME_STMT);
671 gsi_remove (def_gsi, true);
672 *def_gsi = gsi_for_stmt (stmt);
673 fold_stmt_inplace (def_gsi);
674 update_stmt (stmt);
676 if (dump_file)
677 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
679 delete_div = false;
680 gimple *sqr_stmt;
681 unsigned int i;
682 FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
684 gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
685 gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
686 update_stmt (sqr_stmt);
689 if (!mult_stmts.is_empty ())
691 /* r2 = a * x. Transform this into:
692 r2 = t (The original sqrt (a)). */
693 unsigned int i;
694 gimple *mult_stmt = NULL;
695 FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
697 gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
699 if (dump_file)
701 fprintf (dump_file, "Replacing squaring multiplication\n");
702 print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
703 fprintf (dump_file, "with assignment\n");
705 gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
706 fold_stmt_inplace (&gsi2);
707 update_stmt (mult_stmt);
708 if (dump_file)
709 print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
713 if (has_other_use)
715 /* Using the two temporaries tmp1, tmp2 from above
716 the original x is now:
717 x = tmp1 * tmp2. */
718 gcc_assert (orig_sqrt_ssa_name);
719 gcc_assert (sqr_ssa_name);
721 gimple *new_stmt
722 = gimple_build_assign (x, MULT_EXPR,
723 orig_sqrt_ssa_name, sqr_ssa_name);
724 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
725 update_stmt (stmt);
727 else if (delete_div)
729 /* Remove the original division. */
730 gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
731 gsi_remove (&gsi2, true);
732 release_defs (stmt);
734 else
735 release_ssa_name (x);
738 /* Look for floating-point divisions among DEF's uses, and try to
739 replace them by multiplications with the reciprocal. Add
740 as many statements computing the reciprocal as needed.
742 DEF must be a GIMPLE register of a floating-point type. */
744 static void
745 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
747 use_operand_p use_p, square_use_p;
748 imm_use_iterator use_iter, square_use_iter;
749 tree square_def;
750 struct occurrence *occ;
751 int count = 0;
752 int threshold;
753 int square_recip_count = 0;
754 int sqrt_recip_count = 0;
756 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
757 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
759 /* If DEF is a square (x * x), count the number of divisions by x.
760 If there are more divisions by x than by (DEF * DEF), prefer to optimize
761 the reciprocal of x instead of DEF. This improves cases like:
762 def = x * x
763 t0 = a / def
764 t1 = b / def
765 t2 = c / x
766 Reciprocal optimization of x results in 1 division rather than 2 or 3. */
767 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
769 if (is_gimple_assign (def_stmt)
770 && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
771 && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
772 && gimple_assign_rhs1 (def_stmt) == gimple_assign_rhs2 (def_stmt))
774 tree op0 = gimple_assign_rhs1 (def_stmt);
776 FOR_EACH_IMM_USE_FAST (use_p, use_iter, op0)
778 gimple *use_stmt = USE_STMT (use_p);
779 if (is_division_by (use_stmt, op0))
780 sqrt_recip_count++;
784 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
786 gimple *use_stmt = USE_STMT (use_p);
787 if (is_division_by (use_stmt, def))
789 register_division_in (gimple_bb (use_stmt), 2);
790 count++;
793 if (is_square_of (use_stmt, def))
795 square_def = gimple_assign_lhs (use_stmt);
796 FOR_EACH_IMM_USE_FAST (square_use_p, square_use_iter, square_def)
798 gimple *square_use_stmt = USE_STMT (square_use_p);
799 if (is_division_by (square_use_stmt, square_def))
801 /* This is executed twice for each division by a square. */
802 register_division_in (gimple_bb (square_use_stmt), 1);
803 square_recip_count++;
809 /* Square reciprocals were counted twice above. */
810 square_recip_count /= 2;
812 /* If it is more profitable to optimize 1 / x, don't optimize 1 / (x * x). */
813 if (sqrt_recip_count > square_recip_count)
814 goto out;
816 /* Do the expensive part only if we can hope to optimize something. */
817 if (count + square_recip_count >= threshold && count >= 1)
819 gimple *use_stmt;
820 for (occ = occ_head; occ; occ = occ->next)
822 compute_merit (occ);
823 insert_reciprocals (def_gsi, occ, def, NULL, NULL,
824 square_recip_count, threshold);
827 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
829 if (is_division_by (use_stmt, def))
831 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
832 replace_reciprocal (use_p);
834 else if (square_recip_count > 0 && is_square_of (use_stmt, def))
836 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
838 /* Find all uses of the square that are divisions and
839 * replace them by multiplications with the inverse. */
840 imm_use_iterator square_iterator;
841 gimple *powmult_use_stmt = USE_STMT (use_p);
842 tree powmult_def_name = gimple_assign_lhs (powmult_use_stmt);
844 FOR_EACH_IMM_USE_STMT (powmult_use_stmt,
845 square_iterator, powmult_def_name)
846 FOR_EACH_IMM_USE_ON_STMT (square_use_p, square_iterator)
848 gimple *powmult_use_stmt = USE_STMT (square_use_p);
849 if (is_division_by (powmult_use_stmt, powmult_def_name))
850 replace_reciprocal_squares (square_use_p);
857 out:
858 for (occ = occ_head; occ; )
859 occ = free_bb (occ);
861 occ_head = NULL;
864 /* Return an internal function that implements the reciprocal of CALL,
865 or IFN_LAST if there is no such function that the target supports. */
867 internal_fn
868 internal_fn_reciprocal (gcall *call)
870 internal_fn ifn;
872 switch (gimple_call_combined_fn (call))
874 CASE_CFN_SQRT:
875 CASE_CFN_SQRT_FN:
876 ifn = IFN_RSQRT;
877 break;
879 default:
880 return IFN_LAST;
883 tree_pair types = direct_internal_fn_types (ifn, call);
884 if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
885 return IFN_LAST;
887 return ifn;
890 /* Go through all the floating-point SSA_NAMEs, and call
891 execute_cse_reciprocals_1 on each of them. */
892 namespace {
894 const pass_data pass_data_cse_reciprocals =
896 GIMPLE_PASS, /* type */
897 "recip", /* name */
898 OPTGROUP_NONE, /* optinfo_flags */
899 TV_TREE_RECIP, /* tv_id */
900 PROP_ssa, /* properties_required */
901 0, /* properties_provided */
902 0, /* properties_destroyed */
903 0, /* todo_flags_start */
904 TODO_update_ssa, /* todo_flags_finish */
907 class pass_cse_reciprocals : public gimple_opt_pass
909 public:
910 pass_cse_reciprocals (gcc::context *ctxt)
911 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
914 /* opt_pass methods: */
915 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
916 virtual unsigned int execute (function *);
918 }; // class pass_cse_reciprocals
920 unsigned int
921 pass_cse_reciprocals::execute (function *fun)
923 basic_block bb;
924 tree arg;
926 occ_pool = new object_allocator<occurrence> ("dominators for recip");
928 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
929 calculate_dominance_info (CDI_DOMINATORS);
930 calculate_dominance_info (CDI_POST_DOMINATORS);
932 if (flag_checking)
933 FOR_EACH_BB_FN (bb, fun)
934 gcc_assert (!bb->aux);
936 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
937 if (FLOAT_TYPE_P (TREE_TYPE (arg))
938 && is_gimple_reg (arg))
940 tree name = ssa_default_def (fun, arg);
941 if (name)
942 execute_cse_reciprocals_1 (NULL, name);
945 FOR_EACH_BB_FN (bb, fun)
947 tree def;
949 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
950 gsi_next (&gsi))
952 gphi *phi = gsi.phi ();
953 def = PHI_RESULT (phi);
954 if (! virtual_operand_p (def)
955 && FLOAT_TYPE_P (TREE_TYPE (def)))
956 execute_cse_reciprocals_1 (NULL, def);
959 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
960 gsi_next (&gsi))
962 gimple *stmt = gsi_stmt (gsi);
964 if (gimple_has_lhs (stmt)
965 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
966 && FLOAT_TYPE_P (TREE_TYPE (def))
967 && TREE_CODE (def) == SSA_NAME)
969 execute_cse_reciprocals_1 (&gsi, def);
970 stmt = gsi_stmt (gsi);
971 if (flag_unsafe_math_optimizations
972 && is_gimple_assign (stmt)
973 && gimple_assign_lhs (stmt) == def
974 && !stmt_can_throw_internal (cfun, stmt)
975 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
976 optimize_recip_sqrt (&gsi, def);
980 if (optimize_bb_for_size_p (bb))
981 continue;
983 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
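/* For example (a sketch of the intended effect, assuming the target
   provides a reciprocal square root), when t_1 = sqrtf (b_2) has
   x_3 = a_4 / t_1 as its only non-debug use, the call is rewritten to
   t_1 = .RSQRT (b_2) (or a target-specific reciprocal builtin) and the
   division becomes x_3 = a_4 * t_1.  SSA numbers are illustrative.  */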
984 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
985 gsi_next (&gsi))
987 gimple *stmt = gsi_stmt (gsi);
989 if (is_gimple_assign (stmt)
990 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
992 tree arg1 = gimple_assign_rhs2 (stmt);
993 gimple *stmt1;
995 if (TREE_CODE (arg1) != SSA_NAME)
996 continue;
998 stmt1 = SSA_NAME_DEF_STMT (arg1);
1000 if (is_gimple_call (stmt1)
1001 && gimple_call_lhs (stmt1))
1003 bool fail;
1004 imm_use_iterator ui;
1005 use_operand_p use_p;
1006 tree fndecl = NULL_TREE;
1008 gcall *call = as_a <gcall *> (stmt1);
1009 internal_fn ifn = internal_fn_reciprocal (call);
1010 if (ifn == IFN_LAST)
1012 fndecl = gimple_call_fndecl (call);
1013 if (!fndecl
1014 || !fndecl_built_in_p (fndecl, BUILT_IN_MD))
1015 continue;
1016 fndecl = targetm.builtin_reciprocal (fndecl);
1017 if (!fndecl)
1018 continue;
1021 /* Check that all uses of the SSA name are divisions,
1022 otherwise replacing the defining statement will do
1023 the wrong thing. */
1024 fail = false;
1025 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
1027 gimple *stmt2 = USE_STMT (use_p);
1028 if (is_gimple_debug (stmt2))
1029 continue;
1030 if (!is_gimple_assign (stmt2)
1031 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
1032 || gimple_assign_rhs1 (stmt2) == arg1
1033 || gimple_assign_rhs2 (stmt2) != arg1)
1035 fail = true;
1036 break;
1039 if (fail)
1040 continue;
1042 gimple_replace_ssa_lhs (call, arg1);
1043 if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
1045 auto_vec<tree, 4> args;
1046 for (unsigned int i = 0;
1047 i < gimple_call_num_args (call); i++)
1048 args.safe_push (gimple_call_arg (call, i));
1049 gcall *stmt2;
1050 if (ifn == IFN_LAST)
1051 stmt2 = gimple_build_call_vec (fndecl, args);
1052 else
1053 stmt2 = gimple_build_call_internal_vec (ifn, args);
1054 gimple_call_set_lhs (stmt2, arg1);
1055 gimple_move_vops (stmt2, call);
1056 gimple_call_set_nothrow (stmt2,
1057 gimple_call_nothrow_p (call));
1058 gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
1059 gsi_replace (&gsi2, stmt2, true);
1061 else
1063 if (ifn == IFN_LAST)
1064 gimple_call_set_fndecl (call, fndecl);
1065 else
1066 gimple_call_set_internal_fn (call, ifn);
1067 update_stmt (call);
1069 reciprocal_stats.rfuncs_inserted++;
1071 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
1073 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
1074 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
1075 fold_stmt_inplace (&gsi);
1076 update_stmt (stmt);
1083 statistics_counter_event (fun, "reciprocal divs inserted",
1084 reciprocal_stats.rdivs_inserted);
1085 statistics_counter_event (fun, "reciprocal functions inserted",
1086 reciprocal_stats.rfuncs_inserted);
1088 free_dominance_info (CDI_DOMINATORS);
1089 free_dominance_info (CDI_POST_DOMINATORS);
1090 delete occ_pool;
1091 return 0;
1094 } // anon namespace
1096 gimple_opt_pass *
1097 make_pass_cse_reciprocals (gcc::context *ctxt)
1099 return new pass_cse_reciprocals (ctxt);
1102 /* Records an occurrence at statement USE_STMT in the vector of trees
 1103 STMTS if it is dominated by *TOP_BB, dominates it, or *TOP_BB is
 1104 not yet initialized. Returns true if the occurrence was pushed on
1105 the vector. Adjusts *TOP_BB to be the basic block dominating all
1106 statements in the vector. */
1108 static bool
1109 maybe_record_sincos (vec<gimple *> *stmts,
1110 basic_block *top_bb, gimple *use_stmt)
1112 basic_block use_bb = gimple_bb (use_stmt);
1113 if (*top_bb
1114 && (*top_bb == use_bb
1115 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
1116 stmts->safe_push (use_stmt);
1117 else if (!*top_bb
1118 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
1120 stmts->safe_push (use_stmt);
1121 *top_bb = use_bb;
1123 else
1124 return false;
1126 return true;
1129 /* Look for sin, cos and cexpi calls with the same argument NAME and
1130 create a single call to cexpi CSEing the result in this case.
1131 We first walk over all immediate uses of the argument collecting
1132 statements that we can CSE in a vector and in a second pass replace
1133 the statement rhs with a REALPART or IMAGPART expression on the
1134 result of the cexpi call we insert before the use statement that
1135 dominates all other candidates. */
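/* As an illustration (a sketch of the effect, not literal dump output):

     c_1 = __builtin_cos (x_2);
     s_3 = __builtin_sin (x_2);

   becomes

     sincostmp_4 = __builtin_cexpi (x_2);
     c_1 = REALPART_EXPR <sincostmp_4>;
     s_3 = IMAGPART_EXPR <sincostmp_4>;

   so only one call remains; later expansion turns the cexpi into a
   sincos call or separate sin/cos as appropriate for the target.  The
   SSA version numbers are made up for the example.  */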
1137 static bool
1138 execute_cse_sincos_1 (tree name)
1140 gimple_stmt_iterator gsi;
1141 imm_use_iterator use_iter;
1142 tree fndecl, res, type;
1143 gimple *def_stmt, *use_stmt, *stmt;
1144 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
1145 auto_vec<gimple *> stmts;
1146 basic_block top_bb = NULL;
1147 int i;
1148 bool cfg_changed = false;
1150 type = TREE_TYPE (name);
1151 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
1153 if (gimple_code (use_stmt) != GIMPLE_CALL
1154 || !gimple_call_lhs (use_stmt))
1155 continue;
1157 switch (gimple_call_combined_fn (use_stmt))
1159 CASE_CFN_COS:
1160 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
1161 break;
1163 CASE_CFN_SIN:
1164 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
1165 break;
1167 CASE_CFN_CEXPI:
1168 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
1169 break;
1171 default:;
1175 if (seen_cos + seen_sin + seen_cexpi <= 1)
1176 return false;
1178 /* Simply insert cexpi at the beginning of top_bb but not earlier than
1179 the name def statement. */
1180 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
1181 if (!fndecl)
1182 return false;
1183 stmt = gimple_build_call (fndecl, 1, name);
1184 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
1185 gimple_call_set_lhs (stmt, res);
1187 def_stmt = SSA_NAME_DEF_STMT (name);
1188 if (!SSA_NAME_IS_DEFAULT_DEF (name)
1189 && gimple_code (def_stmt) != GIMPLE_PHI
1190 && gimple_bb (def_stmt) == top_bb)
1192 gsi = gsi_for_stmt (def_stmt);
1193 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
1195 else
1197 gsi = gsi_after_labels (top_bb);
1198 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1200 sincos_stats.inserted++;
1202 /* And adjust the recorded old call sites. */
1203 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
1205 tree rhs = NULL;
1207 switch (gimple_call_combined_fn (use_stmt))
1209 CASE_CFN_COS:
1210 rhs = fold_build1 (REALPART_EXPR, type, res);
1211 break;
1213 CASE_CFN_SIN:
1214 rhs = fold_build1 (IMAGPART_EXPR, type, res);
1215 break;
1217 CASE_CFN_CEXPI:
1218 rhs = res;
1219 break;
1221 default:;
1222 gcc_unreachable ();
1225 /* Replace call with a copy. */
1226 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
1228 gsi = gsi_for_stmt (use_stmt);
1229 gsi_replace (&gsi, stmt, true);
1230 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
1231 cfg_changed = true;
1234 return cfg_changed;
1237 /* To evaluate powi(x,n), the floating point value x raised to the
1238 constant integer exponent n, we use a hybrid algorithm that
1239 combines the "window method" with look-up tables. For an
1240 introduction to exponentiation algorithms and "addition chains",
1241 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
1242 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
1243 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
1244 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
1246 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
1247 multiplications to inline before calling the system library's pow
1248 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
1249 so this default never requires calling pow, powf or powl. */
1251 #ifndef POWI_MAX_MULTS
1252 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
1253 #endif
1255 /* The size of the "optimal power tree" lookup table. All
1256 exponents less than this value are simply looked up in the
1257 powi_table below. This threshold is also used to size the
1258 cache of pseudo registers that hold intermediate results. */
1259 #define POWI_TABLE_SIZE 256
1261 /* The size, in bits of the window, used in the "window method"
1262 exponentiation algorithm. This is equivalent to a radix of
1263 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
1264 #define POWI_WINDOW_SIZE 3
1266 /* The following table is an efficient representation of an
1267 "optimal power tree". For each value, i, the corresponding
 1268 value, j, in the table states that an optimal evaluation
1269 sequence for calculating pow(x,i) can be found by evaluating
1270 pow(x,j)*pow(x,i-j). An optimal power tree for the first
1271 100 integers is given in Knuth's "Seminumerical algorithms". */
1273 static const unsigned char powi_table[POWI_TABLE_SIZE] =
1275 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
1276 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
1277 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
1278 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
1279 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
1280 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
1281 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
1282 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
1283 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
1284 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
1285 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
1286 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
1287 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
1288 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
1289 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
1290 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
1291 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
1292 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
1293 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
1294 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
1295 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
1296 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
1297 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
1298 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
1299 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
1300 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
1301 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
1302 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
1303 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
1304 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
1305 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
1306 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
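/* For instance (a worked example, not part of the original sources), for
   i = 5 the table entry is 3, so powi (x, 5) is evaluated as
   powi (x, 2) * powi (x, 3); entry 3 is 2 and entry 2 is 1, so the
   evaluation is x2 = x*x, x3 = x*x2, x5 = x2*x3, i.e. three
   multiplications instead of the four needed by a naive left-to-right
   product.  */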
1310 /* Return the number of multiplications required to calculate
1311 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
1312 subroutine of powi_cost. CACHE is an array indicating
1313 which exponents have already been calculated. */
1315 static int
1316 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
1318 /* If we've already calculated this exponent, then this evaluation
1319 doesn't require any additional multiplications. */
1320 if (cache[n])
1321 return 0;
1323 cache[n] = true;
1324 return powi_lookup_cost (n - powi_table[n], cache)
1325 + powi_lookup_cost (powi_table[n], cache) + 1;
1328 /* Return the number of multiplications required to calculate
1329 powi(x,n) for an arbitrary x, given the exponent N. This
1330 function needs to be kept in sync with powi_as_mults below. */
1332 static int
1333 powi_cost (HOST_WIDE_INT n)
1335 bool cache[POWI_TABLE_SIZE];
1336 unsigned HOST_WIDE_INT digit;
1337 unsigned HOST_WIDE_INT val;
1338 int result;
1340 if (n == 0)
1341 return 0;
1343 /* Ignore the reciprocal when calculating the cost. */
1344 val = (n < 0) ? -n : n;
1346 /* Initialize the exponent cache. */
1347 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
1348 cache[1] = true;
1350 result = 0;
1352 while (val >= POWI_TABLE_SIZE)
1354 if (val & 1)
1356 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
1357 result += powi_lookup_cost (digit, cache)
1358 + POWI_WINDOW_SIZE + 1;
1359 val >>= POWI_WINDOW_SIZE;
1361 else
1363 val >>= 1;
1364 result++;
1368 return result + powi_lookup_cost (val, cache);
1371 /* Recursive subroutine of powi_as_mults. This function takes the
1372 array, CACHE, of already calculated exponents and an exponent N and
1373 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
1375 static tree
1376 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
1377 HOST_WIDE_INT n, tree *cache)
1379 tree op0, op1, ssa_target;
1380 unsigned HOST_WIDE_INT digit;
1381 gassign *mult_stmt;
1383 if (n < POWI_TABLE_SIZE && cache[n])
1384 return cache[n];
1386 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1388 if (n < POWI_TABLE_SIZE)
1390 cache[n] = ssa_target;
1391 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1392 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1394 else if (n & 1)
1396 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1397 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1398 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1400 else
1402 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1403 op1 = op0;
1406 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1407 gimple_set_location (mult_stmt, loc);
1408 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1410 return ssa_target;
1413 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1414 This function needs to be kept in sync with powi_cost above. */
1416 static tree
1417 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1418 tree arg0, HOST_WIDE_INT n)
1420 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1421 gassign *div_stmt;
1422 tree target;
1424 if (n == 0)
1425 return build_real (type, dconst1);
1427 memset (cache, 0, sizeof (cache));
1428 cache[1] = arg0;
1430 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1431 if (n >= 0)
1432 return result;
1434 /* If the original exponent was negative, reciprocate the result. */
1435 target = make_temp_ssa_name (type, NULL, "powmult");
1436 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1437 build_real (type, dconst1), result);
1438 gimple_set_location (div_stmt, loc);
1439 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1441 return target;
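/* A sketch of the statements this produces for powi (x_1, -5), with
   made-up SSA numbering:

     powmult_2 = x_1 * x_1;
     powmult_3 = x_1 * powmult_2;
     powmult_4 = powmult_2 * powmult_3;
     powmult_5 = 1.0 / powmult_4;

   i.e. x**5 built via the powi_table chain above, followed by one
   reciprocal for the negative exponent.  */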
1444 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1445 location info LOC. If the arguments are appropriate, create an
1446 equivalent sequence of statements prior to GSI using an optimal
 1447 number of multiplications, and return an expression holding the
1448 result. */
1450 static tree
1451 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1452 tree arg0, HOST_WIDE_INT n)
1454 /* Avoid largest negative number. */
1455 if (n != -n
1456 && ((n >= -1 && n <= 2)
1457 || (optimize_function_for_speed_p (cfun)
1458 && powi_cost (n) <= POWI_MAX_MULTS)))
1459 return powi_as_mults (gsi, loc, arg0, n);
1461 return NULL_TREE;
1464 /* Build a gimple call statement that calls FN with argument ARG.
1465 Set the lhs of the call statement to a fresh SSA name. Insert the
1466 statement prior to GSI's current position, and return the fresh
1467 SSA name. */
1469 static tree
1470 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1471 tree fn, tree arg)
1473 gcall *call_stmt;
1474 tree ssa_target;
1476 call_stmt = gimple_build_call (fn, 1, arg);
1477 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1478 gimple_set_lhs (call_stmt, ssa_target);
1479 gimple_set_location (call_stmt, loc);
1480 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1482 return ssa_target;
1485 /* Build a gimple binary operation with the given CODE and arguments
1486 ARG0, ARG1, assigning the result to a new SSA name for variable
1487 TARGET. Insert the statement prior to GSI's current position, and
1488 return the fresh SSA name.*/
1490 static tree
1491 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1492 const char *name, enum tree_code code,
1493 tree arg0, tree arg1)
1495 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1496 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1497 gimple_set_location (stmt, loc);
1498 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1499 return result;
1502 /* Build a gimple reference operation with the given CODE and argument
1503 ARG, assigning the result to a new SSA name of TYPE with NAME.
1504 Insert the statement prior to GSI's current position, and return
1505 the fresh SSA name. */
1507 static inline tree
1508 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1509 const char *name, enum tree_code code, tree arg0)
1511 tree result = make_temp_ssa_name (type, NULL, name);
1512 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1513 gimple_set_location (stmt, loc);
1514 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1515 return result;
1518 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1519 prior to GSI's current position, and return the fresh SSA name. */
1521 static tree
1522 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1523 tree type, tree val)
1525 tree result = make_ssa_name (type);
1526 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1527 gimple_set_location (stmt, loc);
1528 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1529 return result;
1532 struct pow_synth_sqrt_info
1534 bool *factors;
1535 unsigned int deepest;
1536 unsigned int num_mults;
1539 /* Return true iff the real value C can be represented as a
1540 sum of powers of 0.5 up to N. That is:
1541 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1542 Record in INFO the various parameters of the synthesis algorithm such
1543 as the factors a[i], the maximum 0.5 power and the number of
1544 multiplications that will be required. */
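/* For example (a worked case, not from the sources): C = 0.625 with
   N >= 3 gives factors[0] = 1, factors[1] = 0, factors[2] = 1,
   deepest = 3 and num_mults = 1, since 0.625 = 0.5 + 0.125, which
   corresponds to sqrt (x) * sqrt (sqrt (sqrt (x))).  */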
1546 bool
1547 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1548 struct pow_synth_sqrt_info *info)
1550 REAL_VALUE_TYPE factor = dconsthalf;
1551 REAL_VALUE_TYPE remainder = c;
1553 info->deepest = 0;
1554 info->num_mults = 0;
1555 memset (info->factors, 0, n * sizeof (bool));
1557 for (unsigned i = 0; i < n; i++)
1559 REAL_VALUE_TYPE res;
1561 /* If something inexact happened bail out now. */
1562 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1563 return false;
1565 /* We have hit zero. The number is representable as a sum
1566 of powers of 0.5. */
1567 if (real_equal (&res, &dconst0))
1569 info->factors[i] = true;
1570 info->deepest = i + 1;
1571 return true;
1573 else if (!REAL_VALUE_NEGATIVE (res))
1575 remainder = res;
1576 info->factors[i] = true;
1577 info->num_mults++;
1579 else
1580 info->factors[i] = false;
1582 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1584 return false;
1587 /* Return the tree corresponding to FN being applied
1588 to ARG N times at GSI and LOC.
1589 Look up previous results from CACHE if need be.
1590 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1592 static tree
1593 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1594 tree fn, location_t loc, tree *cache)
1596 tree res = cache[n];
1597 if (!res)
1599 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1600 res = build_and_insert_call (gsi, loc, fn, prev);
1601 cache[n] = res;
1604 return res;
1607 /* Print to STREAM the repeated application of function FNAME to ARG
1608 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1609 "foo (foo (x))". */
1611 static void
1612 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1613 unsigned int n)
1615 if (n == 0)
1616 fprintf (stream, "%s", arg);
1617 else
1619 fprintf (stream, "%s (", fname);
1620 print_nested_fn (stream, fname, arg, n - 1);
1621 fprintf (stream, ")");
1625 /* Print to STREAM the fractional sequence of sqrt chains
1626 applied to ARG, described by INFO. Used for the dump file. */
1628 static void
1629 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1630 struct pow_synth_sqrt_info *info)
1632 for (unsigned int i = 0; i < info->deepest; i++)
1634 bool is_set = info->factors[i];
1635 if (is_set)
1637 print_nested_fn (stream, "sqrt", arg, i + 1);
1638 if (i != info->deepest - 1)
1639 fprintf (stream, " * ");
1644 /* Print to STREAM a representation of raising ARG to an integer
1645 power N. Used for the dump file. */
1647 static void
1648 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1650 if (n > 1)
1651 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1652 else if (n == 1)
1653 fprintf (stream, "%s", arg);
1656 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1657 square roots. Place at GSI and LOC. Limit the maximum depth
1658 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1659 result of the expanded sequence or NULL_TREE if the expansion failed.
1661 This routine assumes that ARG1 is a real number with a fractional part
1662 (the integer exponent case will have been handled earlier in
1663 gimple_expand_builtin_pow).
1665 For ARG1 > 0.0:
1666 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1667 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1668 FRAC_PART == ARG1 - WHOLE_PART:
1669 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1670 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1671 if it can be expressed as such, that is if FRAC_PART satisfies:
1672 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1673 where integer a[i] is either 0 or 1.
1675 Example:
1676 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1677 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1679 For ARG1 < 0.0 there are two approaches:
1680 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1681 is calculated as above.
1683 Example:
1684 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1685 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1687 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1688 FRAC_PART := ARG1 - WHOLE_PART
1689 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1690 Example:
1691 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1692 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1694 For ARG1 < 0.0 we choose between (A) and (B) depending on
1695 how many multiplications we'd have to do.
1696 So, for the example in (B): POW (x, -5.875), if we were to
1697 follow algorithm (A) we would produce:
 1698 1.0 / (POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X))))
1699 which contains more multiplications than approach (B).
1701 Hopefully, this approach will eliminate potentially expensive POW library
1702 calls when unsafe floating point math is enabled and allow the compiler to
1703 further optimise the multiplies, square roots and divides produced by this
1704 function. */
1706 static tree
1707 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1708 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1710 tree type = TREE_TYPE (arg0);
1711 machine_mode mode = TYPE_MODE (type);
1712 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1713 bool one_over = true;
1715 if (!sqrtfn)
1716 return NULL_TREE;
1718 if (TREE_CODE (arg1) != REAL_CST)
1719 return NULL_TREE;
1721 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1723 gcc_assert (max_depth > 0);
1724 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1726 struct pow_synth_sqrt_info synth_info;
1727 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1728 synth_info.deepest = 0;
1729 synth_info.num_mults = 0;
1731 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1732 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1734 /* The whole and fractional parts of exp. */
1735 REAL_VALUE_TYPE whole_part;
1736 REAL_VALUE_TYPE frac_part;
1738 real_floor (&whole_part, mode, &exp);
1739 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1742 REAL_VALUE_TYPE ceil_whole = dconst0;
1743 REAL_VALUE_TYPE ceil_fract = dconst0;
1745 if (neg_exp)
1747 real_ceil (&ceil_whole, mode, &exp);
1748 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1751 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1752 return NULL_TREE;
1754 /* Check whether it's more profitable to not use 1.0 / ... */
1755 if (neg_exp)
1757 struct pow_synth_sqrt_info alt_synth_info;
1758 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1759 alt_synth_info.deepest = 0;
1760 alt_synth_info.num_mults = 0;
1762 if (representable_as_half_series_p (ceil_fract, max_depth,
1763 &alt_synth_info)
1764 && alt_synth_info.deepest <= synth_info.deepest
1765 && alt_synth_info.num_mults < synth_info.num_mults)
1767 whole_part = ceil_whole;
1768 frac_part = ceil_fract;
1769 synth_info.deepest = alt_synth_info.deepest;
1770 synth_info.num_mults = alt_synth_info.num_mults;
1771 memcpy (synth_info.factors, alt_synth_info.factors,
1772 (max_depth + 1) * sizeof (bool));
1773 one_over = false;
1777 HOST_WIDE_INT n = real_to_integer (&whole_part);
1778 REAL_VALUE_TYPE cint;
1779 real_from_integer (&cint, VOIDmode, n, SIGNED);
1781 if (!real_identical (&whole_part, &cint))
1782 return NULL_TREE;
1784 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1785 return NULL_TREE;
1787 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1789 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1791 /* Calculate the integer part of the exponent. */
1792 if (n > 1)
1794 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1795 if (!integer_res)
1796 return NULL_TREE;
1799 if (dump_file)
1801 char string[64];
1803 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1804 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1806 if (neg_exp)
1808 if (one_over)
1810 fprintf (dump_file, "1.0 / (");
1811 dump_integer_part (dump_file, "x", n);
1812 if (n > 0)
1813 fprintf (dump_file, " * ");
1814 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1815 fprintf (dump_file, ")");
1817 else
1819 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1820 fprintf (dump_file, " / (");
1821 dump_integer_part (dump_file, "x", n);
1822 fprintf (dump_file, ")");
1825 else
1827 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1828 if (n > 0)
1829 fprintf (dump_file, " * ");
1830 dump_integer_part (dump_file, "x", n);
1833 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1837 tree fract_res = NULL_TREE;
1838 cache[0] = arg0;
1840 /* Calculate the fractional part of the exponent. */
1841 for (unsigned i = 0; i < synth_info.deepest; i++)
1843 if (synth_info.factors[i])
1845 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1847 if (!fract_res)
1848 fract_res = sqrt_chain;
1850 else
1851 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1852 fract_res, sqrt_chain);
1856 tree res = NULL_TREE;
1858 if (neg_exp)
1860 if (one_over)
1862 if (n > 0)
1863 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1864 fract_res, integer_res);
1865 else
1866 res = fract_res;
1868 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1869 build_real (type, dconst1), res);
1871 else
1873 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1874 fract_res, integer_res);
1877 else
1878 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1879 fract_res, integer_res);
1880 return res;
1883 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1884 with location info LOC. If possible, create an equivalent and
1885 less expensive sequence of statements prior to GSI, and return an
 1886 expression holding the result. */
1888 static tree
1889 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1890 tree arg0, tree arg1)
1892 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1893 REAL_VALUE_TYPE c2, dconst3;
1894 HOST_WIDE_INT n;
1895 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1896 machine_mode mode;
1897 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1898 bool hw_sqrt_exists, c_is_int, c2_is_int;
1900 dconst1_4 = dconst1;
1901 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1903 /* If the exponent isn't a constant, there's nothing of interest
1904 to be done. */
1905 if (TREE_CODE (arg1) != REAL_CST)
1906 return NULL_TREE;
1908 /* Don't perform the operation if flag_signaling_nans is on
1909 and the operand is a signaling NaN. */
1910 if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1911 && ((TREE_CODE (arg0) == REAL_CST
1912 && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1913 || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1914 return NULL_TREE;
1916 /* If the exponent is equivalent to an integer, expand to an optimal
1917 multiplication sequence when profitable. */
1918 c = TREE_REAL_CST (arg1);
1919 n = real_to_integer (&c);
1920 real_from_integer (&cint, VOIDmode, n, SIGNED);
1921 c_is_int = real_identical (&c, &cint);
1923 if (c_is_int
1924 && ((n >= -1 && n <= 2)
1925 || (flag_unsafe_math_optimizations
1926 && speed_p
1927 && powi_cost (n) <= POWI_MAX_MULTS)))
1928 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1930 /* Attempt various optimizations using sqrt and cbrt. */
1931 type = TREE_TYPE (arg0);
1932 mode = TYPE_MODE (type);
1933 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1935 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1936 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1937 sqrt(-0) = -0. */
1938 if (sqrtfn
1939 && real_equal (&c, &dconsthalf)
1940 && !HONOR_SIGNED_ZEROS (mode))
1941 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1943 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1945 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1946 optimizations since 1./3. is not exactly representable. If x
1947 is negative and finite, the correct value of pow(x,1./3.) is
1948 a NaN with the "invalid" exception raised, because the value
1949 of 1./3. actually has an even denominator. The correct value
1950 of cbrt(x) is a negative real value. */
1951 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1952 dconst1_3 = real_value_truncate (mode, dconst_third ());
1954 if (flag_unsafe_math_optimizations
1955 && cbrtfn
1956 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1957 && real_equal (&c, &dconst1_3))
1958 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1960 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1961 if we don't have a hardware sqrt insn. */
1962 dconst1_6 = dconst1_3;
1963 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1965 if (flag_unsafe_math_optimizations
1966 && sqrtfn
1967 && cbrtfn
1968 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1969 && speed_p
1970 && hw_sqrt_exists
1971 && real_equal (&c, &dconst1_6))
1973 /* sqrt(x) */
1974 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1976 /* cbrt(sqrt(x)) */
1977 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1981 /* Attempt to expand the POW as a product of square root chains.
1982 Expand the 0.25 case even when optimizing for size. */
1983 if (flag_unsafe_math_optimizations
1984 && sqrtfn
1985 && hw_sqrt_exists
1986 && (speed_p || real_equal (&c, &dconst1_4))
1987 && !HONOR_SIGNED_ZEROS (mode))
1989 unsigned int max_depth = speed_p
1990 ? param_max_pow_sqrt_depth
1991 : 2;
1993 tree expand_with_sqrts
1994 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1996 if (expand_with_sqrts)
1997 return expand_with_sqrts;
2000 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
2001 n = real_to_integer (&c2);
2002 real_from_integer (&cint, VOIDmode, n, SIGNED);
2003 c2_is_int = real_identical (&c2, &cint);
2005 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
2007 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
2008 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
2010 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
2011 different from pow(x, 1./3.) due to rounding and behavior with
2012 negative x, we need to constrain this transformation to unsafe
2013 math and positive x or finite math. */
2014 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
2015 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
2016 real_round (&c2, mode, &c2);
2017 n = real_to_integer (&c2);
2018 real_from_integer (&cint, VOIDmode, n, SIGNED);
2019 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
2020 real_convert (&c2, mode, &c2);
2022 if (flag_unsafe_math_optimizations
2023 && cbrtfn
2024 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
2025 && real_identical (&c2, &c)
2026 && !c2_is_int
2027 && optimize_function_for_speed_p (cfun)
2028 && powi_cost (n / 3) <= POWI_MAX_MULTS)
2030 tree powi_x_ndiv3 = NULL_TREE;
2032 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
2033 possible or profitable, give up. Skip the degenerate case when
2034 abs(n) < 3, where the result is always 1. */
2035 if (absu_hwi (n) >= 3)
2037 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
2038 abs_hwi (n / 3));
2039 if (!powi_x_ndiv3)
2040 return NULL_TREE;
2043 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
2044 as that creates an unnecessary variable. Instead, just produce
2045 either cbrt(x) or cbrt(x) * cbrt(x). */
2046 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
2048 if (absu_hwi (n) % 3 == 1)
2049 powi_cbrt_x = cbrt_x;
2050 else
2051 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2052 cbrt_x, cbrt_x);
2054 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
2055 if (absu_hwi (n) < 3)
2056 result = powi_cbrt_x;
2057 else
2058 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2059 powi_x_ndiv3, powi_cbrt_x);
2061 /* If n is negative, reciprocate the result. */
2062 if (n < 0)
2063 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
2064 build_real (type, dconst1), result);
2066 return result;
2069 /* No optimizations succeeded. */
2070 return NULL_TREE;
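/* A few of the replacements performed above, sketched at the source level
   (hypothetical inputs, for illustration only):

     pow (x, 0.5)    ->  sqrt (x)           safe unless signed zeros matter
     pow (x, 1./3.)  ->  cbrt (x)           needs -funsafe-math-optimizations
     pow (x, 1./6.)  ->  cbrt (sqrt (x))    also needs a hardware sqrt insn
     pow (x, 7.0)    ->  a powi-style multiply chain when that is cheap.  */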
2073 /* ARG is the argument to a cabs builtin call in GSI with location info
2074 LOC. Create a sequence of statements prior to GSI that calculates
2075 sqrt(R*R + I*I), where R and I are the real and imaginary components
2076 of ARG, respectively. Return an expression holding the result. */
2078 static tree
2079 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
2081 tree real_part, imag_part, addend1, addend2, sum, result;
2082 tree type = TREE_TYPE (TREE_TYPE (arg));
2083 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
2084 machine_mode mode = TYPE_MODE (type);
2086 if (!flag_unsafe_math_optimizations
2087 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
2088 || !sqrtfn
2089 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
2090 return NULL_TREE;
2092 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
2093 REALPART_EXPR, arg);
2094 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2095 real_part, real_part);
2096 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
2097 IMAGPART_EXPR, arg);
2098 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2099 imag_part, imag_part);
2100 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
2101 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
2103 return result;
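/* For example (illustrative only), with -funsafe-math-optimizations and a
   hardware sqrt, a call such as

     d = cabs (z);

   is replaced by the straight-line sequence

     r = __real__ z;
     i = __imag__ z;
     d = sqrt (r * r + i * i);  */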
2106 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
2107 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
2108 an optimal number of multiplies, when n is a constant. */
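/* A sketch of both transformations on hypothetical input.  The calls

     s = sin (a);
     c = cos (a);

   are rewritten to share one cexpi call:

     ci = __builtin_cexpi (a);
     s = __imag__ ci;
     c = __real__ ci;

   and a constant-exponent powi such as __builtin_powi (x, 5) becomes an
   addition-chain of multiplies:

     t1 = x * x;        x^2
     t2 = t1 * t1;      x^4
     r  = t2 * x;       x^5  */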
2110 namespace {
2112 const pass_data pass_data_cse_sincos =
2114 GIMPLE_PASS, /* type */
2115 "sincos", /* name */
2116 OPTGROUP_NONE, /* optinfo_flags */
2117 TV_TREE_SINCOS, /* tv_id */
2118 PROP_ssa, /* properties_required */
2119 PROP_gimple_opt_math, /* properties_provided */
2120 0, /* properties_destroyed */
2121 0, /* todo_flags_start */
2122 TODO_update_ssa, /* todo_flags_finish */
2125 class pass_cse_sincos : public gimple_opt_pass
2127 public:
2128 pass_cse_sincos (gcc::context *ctxt)
2129 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
2132 /* opt_pass methods: */
2133 virtual bool gate (function *)
2135 /* We no longer require either sincos or cexp, since powi expansion
2136 piggybacks on this pass. */
2137 return optimize;
2140 virtual unsigned int execute (function *);
2142 }; // class pass_cse_sincos
2144 unsigned int
2145 pass_cse_sincos::execute (function *fun)
2147 basic_block bb;
2148 bool cfg_changed = false;
2150 calculate_dominance_info (CDI_DOMINATORS);
2151 memset (&sincos_stats, 0, sizeof (sincos_stats));
2153 FOR_EACH_BB_FN (bb, fun)
2155 gimple_stmt_iterator gsi;
2156 bool cleanup_eh = false;
2158 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2160 gimple *stmt = gsi_stmt (gsi);
2162 /* Only the last stmt in a bb could throw, no need to call
2163 gimple_purge_dead_eh_edges if we change something in the middle
2164 of a basic block. */
2165 cleanup_eh = false;
2167 if (is_gimple_call (stmt)
2168 && gimple_call_lhs (stmt))
2170 tree arg, arg0, arg1, result;
2171 HOST_WIDE_INT n;
2172 location_t loc;
2174 switch (gimple_call_combined_fn (stmt))
2176 CASE_CFN_COS:
2177 CASE_CFN_SIN:
2178 CASE_CFN_CEXPI:
2179 /* Make sure we have either sincos or cexp. */
2180 if (!targetm.libc_has_function (function_c99_math_complex)
2181 && !targetm.libc_has_function (function_sincos))
2182 break;
2184 arg = gimple_call_arg (stmt, 0);
2185 if (TREE_CODE (arg) == SSA_NAME)
2186 cfg_changed |= execute_cse_sincos_1 (arg);
2187 break;
2189 CASE_CFN_POW:
2190 arg0 = gimple_call_arg (stmt, 0);
2191 arg1 = gimple_call_arg (stmt, 1);
2193 loc = gimple_location (stmt);
2194 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
2196 if (result)
2198 tree lhs = gimple_get_lhs (stmt);
2199 gassign *new_stmt = gimple_build_assign (lhs, result);
2200 gimple_set_location (new_stmt, loc);
2201 unlink_stmt_vdef (stmt);
2202 gsi_replace (&gsi, new_stmt, true);
2203 cleanup_eh = true;
2204 if (gimple_vdef (stmt))
2205 release_ssa_name (gimple_vdef (stmt));
2207 break;
2209 CASE_CFN_POWI:
2210 arg0 = gimple_call_arg (stmt, 0);
2211 arg1 = gimple_call_arg (stmt, 1);
2212 loc = gimple_location (stmt);
2214 if (real_minus_onep (arg0))
2216 tree t0, t1, cond, one, minus_one;
2217 gassign *stmt;
2219 t0 = TREE_TYPE (arg0);
2220 t1 = TREE_TYPE (arg1);
2221 one = build_real (t0, dconst1);
2222 minus_one = build_real (t0, dconstm1);
2224 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
2225 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
2226 arg1, build_int_cst (t1, 1));
2227 gimple_set_location (stmt, loc);
2228 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2230 result = make_temp_ssa_name (t0, NULL, "powi");
2231 stmt = gimple_build_assign (result, COND_EXPR, cond,
2232 minus_one, one);
2233 gimple_set_location (stmt, loc);
2234 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2236 else
2238 if (!tree_fits_shwi_p (arg1))
2239 break;
2241 n = tree_to_shwi (arg1);
2242 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
2245 if (result)
2247 tree lhs = gimple_get_lhs (stmt);
2248 gassign *new_stmt = gimple_build_assign (lhs, result);
2249 gimple_set_location (new_stmt, loc);
2250 unlink_stmt_vdef (stmt);
2251 gsi_replace (&gsi, new_stmt, true);
2252 cleanup_eh = true;
2253 if (gimple_vdef (stmt))
2254 release_ssa_name (gimple_vdef (stmt));
2256 break;
2258 CASE_CFN_CABS:
2259 arg0 = gimple_call_arg (stmt, 0);
2260 loc = gimple_location (stmt);
2261 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
2263 if (result)
2265 tree lhs = gimple_get_lhs (stmt);
2266 gassign *new_stmt = gimple_build_assign (lhs, result);
2267 gimple_set_location (new_stmt, loc);
2268 unlink_stmt_vdef (stmt);
2269 gsi_replace (&gsi, new_stmt, true);
2270 cleanup_eh = true;
2271 if (gimple_vdef (stmt))
2272 release_ssa_name (gimple_vdef (stmt));
2274 break;
2276 default:;
2280 if (cleanup_eh)
2281 cfg_changed |= gimple_purge_dead_eh_edges (bb);
2284 statistics_counter_event (fun, "sincos statements inserted",
2285 sincos_stats.inserted);
2287 return cfg_changed ? TODO_cleanup_cfg : 0;
2290 } // anon namespace
2292 gimple_opt_pass *
2293 make_pass_cse_sincos (gcc::context *ctxt)
2295 return new pass_cse_sincos (ctxt);
2298 /* Return true if stmt is a type conversion operation that can be stripped
2299 when used in a widening multiply operation. */
2300 static bool
2301 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2303 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2305 if (TREE_CODE (result_type) == INTEGER_TYPE)
2307 tree op_type;
2308 tree inner_op_type;
2310 if (!CONVERT_EXPR_CODE_P (rhs_code))
2311 return false;
2313 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2315 /* If the type of OP has the same precision as the result, then
2316 we can strip this conversion. The multiply operation will be
2317 selected to create the correct extension as a by-product. */
2318 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2319 return true;
2321 /* We can also strip a conversion if it preserves the signedness of
2322 the operation and doesn't narrow the range. */
2323 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2325 /* If the inner-most type is unsigned, then we can strip any
2326 intermediate widening operation. If it's signed, then the
2327 intermediate widening operation must also be signed. */
2328 if ((TYPE_UNSIGNED (inner_op_type)
2329 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2330 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2331 return true;
2333 return false;
2336 return rhs_code == FIXED_CONVERT_EXPR;
2339 /* Return true if RHS is a suitable operand for a widening multiplication,
2340 assuming a target type of TYPE.
2341 There are two cases:
2343 - RHS makes some value at least twice as wide. Store that value
2344 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2346 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2347 and store NULL in *TYPE_OUT. */
2349 static bool
2350 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2351 tree *new_rhs_out)
2353 gimple *stmt;
2354 tree type1, rhs1;
2356 if (TREE_CODE (rhs) == SSA_NAME)
2358 stmt = SSA_NAME_DEF_STMT (rhs);
2359 if (is_gimple_assign (stmt))
2361 if (! widening_mult_conversion_strippable_p (type, stmt))
2362 rhs1 = rhs;
2363 else
2365 rhs1 = gimple_assign_rhs1 (stmt);
2367 if (TREE_CODE (rhs1) == INTEGER_CST)
2369 *new_rhs_out = rhs1;
2370 *type_out = NULL;
2371 return true;
2375 else
2376 rhs1 = rhs;
2378 type1 = TREE_TYPE (rhs1);
2380 if (TREE_CODE (type1) != TREE_CODE (type)
2381 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2382 return false;
2384 *new_rhs_out = rhs1;
2385 *type_out = type1;
2386 return true;
2389 if (TREE_CODE (rhs) == INTEGER_CST)
2391 *new_rhs_out = rhs;
2392 *type_out = NULL;
2393 return true;
2396 return false;
2399 /* Return true if STMT performs a widening multiplication, assuming the
2400 output type is TYPE. If so, store the unwidened types of the operands
2401 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2402 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2403 and *TYPE2_OUT would give the operands of the multiplication. */
2405 static bool
2406 is_widening_mult_p (gimple *stmt,
2407 tree *type1_out, tree *rhs1_out,
2408 tree *type2_out, tree *rhs2_out)
2410 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2412 if (TREE_CODE (type) == INTEGER_TYPE)
2414 if (TYPE_OVERFLOW_TRAPS (type))
2415 return false;
2417 else if (TREE_CODE (type) != FIXED_POINT_TYPE)
2418 return false;
2420 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2421 rhs1_out))
2422 return false;
2424 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2425 rhs2_out))
2426 return false;
2428 if (*type1_out == NULL)
2430 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2431 return false;
2432 *type1_out = *type2_out;
2435 if (*type2_out == NULL)
2437 if (!int_fits_type_p (*rhs2_out, *type1_out))
2438 return false;
2439 *type2_out = *type1_out;
2442 /* Ensure that the larger of the two operands comes first. */
2443 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2445 std::swap (*type1_out, *type2_out);
2446 std::swap (*rhs1_out, *rhs2_out);
2449 return true;
2452 /* Check to see if the CALL statement is an invocation of copysign
2453 with 1. being the first argument. */
2454 static bool
2455 is_copysign_call_with_1 (gimple *call)
2457 gcall *c = dyn_cast <gcall *> (call);
2458 if (! c)
2459 return false;
2461 enum combined_fn code = gimple_call_combined_fn (c);
2463 if (code == CFN_LAST)
2464 return false;
2466 if (builtin_fn_p (code))
2468 switch (as_builtin_fn (code))
2470 CASE_FLT_FN (BUILT_IN_COPYSIGN):
2471 CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
2472 return real_onep (gimple_call_arg (c, 0));
2473 default:
2474 return false;
2478 if (internal_fn_p (code))
2480 switch (as_internal_fn (code))
2482 case IFN_COPYSIGN:
2483 return real_onep (gimple_call_arg (c, 0));
2484 default:
2485 return false;
2489 return false;
2492 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
2493 This is only done when the xorsign optab is defined.  If the
2494 pattern does not match or if expansion fails, FALSE is
2495 returned, otherwise TRUE is returned. */
2496 static bool
2497 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
2499 tree treeop0, treeop1, lhs, type;
2500 location_t loc = gimple_location (stmt);
2501 lhs = gimple_assign_lhs (stmt);
2502 treeop0 = gimple_assign_rhs1 (stmt);
2503 treeop1 = gimple_assign_rhs2 (stmt);
2504 type = TREE_TYPE (lhs);
2505 machine_mode mode = TYPE_MODE (type);
2507 if (HONOR_SNANS (type))
2508 return false;
2510 if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
2512 gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
2513 if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
2515 call0 = SSA_NAME_DEF_STMT (treeop1);
2516 if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
2517 return false;
2519 treeop1 = treeop0;
2521 if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
2522 return false;
2524 gcall *c = as_a<gcall*> (call0);
2525 treeop0 = gimple_call_arg (c, 1);
2527 gcall *call_stmt
2528 = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
2529 gimple_set_lhs (call_stmt, lhs);
2530 gimple_set_location (call_stmt, loc);
2531 gsi_replace (gsi, call_stmt, true);
2532 return true;
2535 return false;
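/* Illustrative sketch (hypothetical input): on a target providing the
   xorsign optab,

     r = x * __builtin_copysign (1.0, y);

   is replaced (shown here roughly in GIMPLE dump syntax) by the
   internal-function call

     r = .XORSIGN (x, y);

   which merely XORs the sign bit of y into x instead of multiplying.  */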
2538 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2539 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2540 value is true iff we converted the statement. */
2542 static bool
2543 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2545 tree lhs, rhs1, rhs2, type, type1, type2;
2546 enum insn_code handler;
2547 scalar_int_mode to_mode, from_mode, actual_mode;
2548 optab op;
2549 int actual_precision;
2550 location_t loc = gimple_location (stmt);
2551 bool from_unsigned1, from_unsigned2;
2553 lhs = gimple_assign_lhs (stmt);
2554 type = TREE_TYPE (lhs);
2555 if (TREE_CODE (type) != INTEGER_TYPE)
2556 return false;
2558 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2559 return false;
2561 to_mode = SCALAR_INT_TYPE_MODE (type);
2562 from_mode = SCALAR_INT_TYPE_MODE (type1);
2563 if (to_mode == from_mode)
2564 return false;
2566 from_unsigned1 = TYPE_UNSIGNED (type1);
2567 from_unsigned2 = TYPE_UNSIGNED (type2);
2569 if (from_unsigned1 && from_unsigned2)
2570 op = umul_widen_optab;
2571 else if (!from_unsigned1 && !from_unsigned2)
2572 op = smul_widen_optab;
2573 else
2574 op = usmul_widen_optab;
2576 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2577 &actual_mode);
2579 if (handler == CODE_FOR_nothing)
2581 if (op != smul_widen_optab)
2583 /* We can use a signed multiply with unsigned types as long as
2584 there is a wider mode to use, or it is the smaller of the two
2585 types that is unsigned. Note that type1 >= type2, always. */
2586 if ((TYPE_UNSIGNED (type1)
2587 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2588 || (TYPE_UNSIGNED (type2)
2589 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2591 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2592 || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2593 return false;
2596 op = smul_widen_optab;
2597 handler = find_widening_optab_handler_and_mode (op, to_mode,
2598 from_mode,
2599 &actual_mode);
2601 if (handler == CODE_FOR_nothing)
2602 return false;
2604 from_unsigned1 = from_unsigned2 = false;
2606 else
2607 return false;
2610 /* Ensure that the inputs to the handler are in the correct precision
2611 for the opcode. This will be the full mode size. */
2612 actual_precision = GET_MODE_PRECISION (actual_mode);
2613 if (2 * actual_precision > TYPE_PRECISION (type))
2614 return false;
2615 if (actual_precision != TYPE_PRECISION (type1)
2616 || from_unsigned1 != TYPE_UNSIGNED (type1))
2617 rhs1 = build_and_insert_cast (gsi, loc,
2618 build_nonstandard_integer_type
2619 (actual_precision, from_unsigned1), rhs1);
2620 if (actual_precision != TYPE_PRECISION (type2)
2621 || from_unsigned2 != TYPE_UNSIGNED (type2))
2622 rhs2 = build_and_insert_cast (gsi, loc,
2623 build_nonstandard_integer_type
2624 (actual_precision, from_unsigned2), rhs2);
2626 /* Handle constants. */
2627 if (TREE_CODE (rhs1) == INTEGER_CST)
2628 rhs1 = fold_convert (type1, rhs1);
2629 if (TREE_CODE (rhs2) == INTEGER_CST)
2630 rhs2 = fold_convert (type2, rhs2);
2632 gimple_assign_set_rhs1 (stmt, rhs1);
2633 gimple_assign_set_rhs2 (stmt, rhs2);
2634 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2635 update_stmt (stmt);
2636 widen_mul_stats.widen_mults_inserted++;
2637 return true;
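/* A source-level sketch (assumed example): on a target with a 32x32->64 bit
   multiply instruction,

     int64_t prod = (int64_t) a * (int64_t) b;     a, b are int32_t

   keeps only the narrow operands and becomes

     prod = a w* b;                                WIDEN_MULT_EXPR

   so that expansion can select the single widening multiply.  */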
2640 /* Process a single gimple statement STMT, which is found at the
2641 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2642 rhs (given by CODE), and try to convert it into a
2643 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2644 is true iff we converted the statement. */
2646 static bool
2647 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
2648 enum tree_code code)
2650 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
2651 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
2652 tree type, type1, type2, optype;
2653 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2654 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2655 optab this_optab;
2656 enum tree_code wmult_code;
2657 enum insn_code handler;
2658 scalar_mode to_mode, from_mode, actual_mode;
2659 location_t loc = gimple_location (stmt);
2660 int actual_precision;
2661 bool from_unsigned1, from_unsigned2;
2663 lhs = gimple_assign_lhs (stmt);
2664 type = TREE_TYPE (lhs);
2665 if (TREE_CODE (type) != INTEGER_TYPE
2666 && TREE_CODE (type) != FIXED_POINT_TYPE)
2667 return false;
2669 if (code == MINUS_EXPR)
2670 wmult_code = WIDEN_MULT_MINUS_EXPR;
2671 else
2672 wmult_code = WIDEN_MULT_PLUS_EXPR;
2674 rhs1 = gimple_assign_rhs1 (stmt);
2675 rhs2 = gimple_assign_rhs2 (stmt);
2677 if (TREE_CODE (rhs1) == SSA_NAME)
2679 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2680 if (is_gimple_assign (rhs1_stmt))
2681 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2684 if (TREE_CODE (rhs2) == SSA_NAME)
2686 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2687 if (is_gimple_assign (rhs2_stmt))
2688 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2691 /* Allow for one conversion statement between the multiply
2692 and the addition/subtraction statement. If there is more than
2693 one conversion then we assume it would invalidate this
2694 transformation. If that's not the case then it should have
2695 been folded before now. */
2696 if (CONVERT_EXPR_CODE_P (rhs1_code))
2698 conv1_stmt = rhs1_stmt;
2699 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2700 if (TREE_CODE (rhs1) == SSA_NAME)
2702 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2703 if (is_gimple_assign (rhs1_stmt))
2704 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2706 else
2707 return false;
2709 if (CONVERT_EXPR_CODE_P (rhs2_code))
2711 conv2_stmt = rhs2_stmt;
2712 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2713 if (TREE_CODE (rhs2) == SSA_NAME)
2715 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2716 if (is_gimple_assign (rhs2_stmt))
2717 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2719 else
2720 return false;
2723 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2724 is_widening_mult_p, but we still need the rhs values it returns.
2726 It might also appear that it would be sufficient to use the existing
2727 operands of the widening multiply, but that would limit the choice of
2728 multiply-and-accumulate instructions.
2730 If the widened-multiplication result has more than one use, it is
2731 probably wiser not to do the conversion. Also restrict this operation
2732 to a single basic block to avoid moving the multiply to a different block
2733 with a higher execution frequency. */
2734 if (code == PLUS_EXPR
2735 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2737 if (!has_single_use (rhs1)
2738 || gimple_bb (rhs1_stmt) != gimple_bb (stmt)
2739 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2740 &type2, &mult_rhs2))
2741 return false;
2742 add_rhs = rhs2;
2743 conv_stmt = conv1_stmt;
2745 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2747 if (!has_single_use (rhs2)
2748 || gimple_bb (rhs2_stmt) != gimple_bb (stmt)
2749 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2750 &type2, &mult_rhs2))
2751 return false;
2752 add_rhs = rhs1;
2753 conv_stmt = conv2_stmt;
2755 else
2756 return false;
2758 to_mode = SCALAR_TYPE_MODE (type);
2759 from_mode = SCALAR_TYPE_MODE (type1);
2760 if (to_mode == from_mode)
2761 return false;
2763 from_unsigned1 = TYPE_UNSIGNED (type1);
2764 from_unsigned2 = TYPE_UNSIGNED (type2);
2765 optype = type1;
2767 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2768 if (from_unsigned1 != from_unsigned2)
2770 if (!INTEGRAL_TYPE_P (type))
2771 return false;
2772 /* We can use a signed multiply with unsigned types as long as
2773 there is a wider mode to use, or it is the smaller of the two
2774 types that is unsigned. Note that type1 >= type2, always. */
2775 if ((from_unsigned1
2776 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2777 || (from_unsigned2
2778 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2780 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2781 || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2782 return false;
2785 from_unsigned1 = from_unsigned2 = false;
2786 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2787 false);
2790 /* If there was a conversion between the multiply and addition
2791 then we need to make sure it fits a multiply-and-accumulate.
2792 There should be a single mode change which does not change the
2793 value. */
2794 if (conv_stmt)
2796 /* We use the original, unmodified data types for this. */
2797 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2798 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2799 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2800 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2802 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2804 /* Conversion is a truncate. */
2805 if (TYPE_PRECISION (to_type) < data_size)
2806 return false;
2808 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2810 /* Conversion is an extend. Check it's the right sort. */
2811 if (TYPE_UNSIGNED (from_type) != is_unsigned
2812 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2813 return false;
2815 /* else convert is a no-op for our purposes. */
2818 /* Verify that the machine can perform a widening multiply
2819 accumulate in this mode/signedness combination, otherwise
2820 this transformation is likely to pessimize code. */
2821 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2822 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2823 from_mode, &actual_mode);
2825 if (handler == CODE_FOR_nothing)
2826 return false;
2828 /* Ensure that the inputs to the handler are in the correct precision
2829 for the opcode. This will be the full mode size. */
2830 actual_precision = GET_MODE_PRECISION (actual_mode);
2831 if (actual_precision != TYPE_PRECISION (type1)
2832 || from_unsigned1 != TYPE_UNSIGNED (type1))
2833 mult_rhs1 = build_and_insert_cast (gsi, loc,
2834 build_nonstandard_integer_type
2835 (actual_precision, from_unsigned1),
2836 mult_rhs1);
2837 if (actual_precision != TYPE_PRECISION (type2)
2838 || from_unsigned2 != TYPE_UNSIGNED (type2))
2839 mult_rhs2 = build_and_insert_cast (gsi, loc,
2840 build_nonstandard_integer_type
2841 (actual_precision, from_unsigned2),
2842 mult_rhs2);
2844 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2845 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2847 /* Handle constants. */
2848 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2849 mult_rhs1 = fold_convert (type1, mult_rhs1);
2850 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2851 mult_rhs2 = fold_convert (type2, mult_rhs2);
2853 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2854 add_rhs);
2855 update_stmt (gsi_stmt (*gsi));
2856 widen_mul_stats.maccs_inserted++;
2857 return true;
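/* For instance (illustrative only), on a target with a widening
   multiply-accumulate instruction,

     acc = acc + (int64_t) a * (int64_t) b;        a, b are int32_t

   is rewritten into a single statement, roughly

     acc = WIDEN_MULT_PLUS_EXPR <a, b, acc>;  */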
2860 /* Given a result MUL_RESULT which is a result of a multiplication of OP1 and
2861 OP2 and which we know is used in statements that can be, together with the
2862 multiplication, converted to FMAs, perform the transformation. */
2864 static void
2865 convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
2867 tree type = TREE_TYPE (mul_result);
2868 gimple *use_stmt;
2869 imm_use_iterator imm_iter;
2870 gcall *fma_stmt;
2872 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2874 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2875 tree addop, mulop1 = op1, result = mul_result;
2876 bool negate_p = false;
2877 gimple_seq seq = NULL;
2879 if (is_gimple_debug (use_stmt))
2880 continue;
2882 if (is_gimple_assign (use_stmt)
2883 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
2885 result = gimple_assign_lhs (use_stmt);
2886 use_operand_p use_p;
2887 gimple *neguse_stmt;
2888 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2889 gsi_remove (&gsi, true);
2890 release_defs (use_stmt);
2892 use_stmt = neguse_stmt;
2893 gsi = gsi_for_stmt (use_stmt);
2894 negate_p = true;
2897 tree cond, else_value, ops[3];
2898 tree_code code;
2899 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code,
2900 ops, &else_value))
2901 gcc_unreachable ();
2902 addop = ops[0] == result ? ops[1] : ops[0];
2904 if (code == MINUS_EXPR)
2906 if (ops[0] == result)
2907 /* a * b - c -> a * b + (-c) */
2908 addop = gimple_build (&seq, NEGATE_EXPR, type, addop);
2909 else
2910 /* a - b * c -> (-b) * c + a */
2911 negate_p = !negate_p;
2914 if (negate_p)
2915 mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
2917 if (seq)
2918 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
2920 if (cond)
2921 fma_stmt = gimple_build_call_internal (IFN_COND_FMA, 5, cond, mulop1,
2922 op2, addop, else_value);
2923 else
2924 fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop);
2925 gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt));
2926 gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (cfun,
2927 use_stmt));
2928 gsi_replace (&gsi, fma_stmt, true);
2929 /* Follow all SSA edges so that we generate FMS, FNMA and FNMS
2930 regardless of where the negation occurs. */
2931 gimple *orig_stmt = gsi_stmt (gsi);
2932 if (fold_stmt (&gsi, follow_all_ssa_edges))
2934 if (maybe_clean_or_replace_eh_stmt (orig_stmt, gsi_stmt (gsi)))
2935 gcc_unreachable ();
2936 update_stmt (gsi_stmt (gsi));
2939 if (dump_file && (dump_flags & TDF_DETAILS))
2941 fprintf (dump_file, "Generated FMA ");
2942 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2943 fprintf (dump_file, "\n");
2946 /* If the FMA result is negated in a single use, fold the negation
2947 too. */
2948 orig_stmt = gsi_stmt (gsi);
2949 use_operand_p use_p;
2950 gimple *neg_stmt;
2951 if (is_gimple_call (orig_stmt)
2952 && gimple_call_internal_p (orig_stmt)
2953 && gimple_call_lhs (orig_stmt)
2954 && TREE_CODE (gimple_call_lhs (orig_stmt)) == SSA_NAME
2955 && single_imm_use (gimple_call_lhs (orig_stmt), &use_p, &neg_stmt)
2956 && is_gimple_assign (neg_stmt)
2957 && gimple_assign_rhs_code (neg_stmt) == NEGATE_EXPR
2958 && !stmt_could_throw_p (cfun, neg_stmt))
2960 gsi = gsi_for_stmt (neg_stmt);
2961 if (fold_stmt (&gsi, follow_all_ssa_edges))
2963 if (maybe_clean_or_replace_eh_stmt (neg_stmt, gsi_stmt (gsi)))
2964 gcc_unreachable ();
2965 update_stmt (gsi_stmt (gsi));
2966 if (dump_file && (dump_flags & TDF_DETAILS))
2968 fprintf (dump_file, "Folded FMA negation ");
2969 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2970 fprintf (dump_file, "\n");
2975 widen_mul_stats.fmas_inserted++;
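/* A sketch of the rewrites performed above, at the source level
   (hypothetical input):

     t = a * b;
     x = t + c;         ->   x = fma (a, b, c);
     y = t - c;         ->   y = fma (a, b, -c);      later folded to FMS
     z = c - t;         ->   z = fma (-a, b, c);      later folded to FNMA

   Every use of the multiplication must be converted so that the multiply
   itself becomes dead.  */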
2979 /* Data necessary to perform the actual transformation from a multiplication
2980 and an addition to an FMA once the decision to do so has been taken, and to
2981 then delete the multiplication statement from the function IL. */
2983 struct fma_transformation_info
2985 gimple *mul_stmt;
2986 tree mul_result;
2987 tree op1;
2988 tree op2;
2991 /* Structure containing the current state of FMA deferring, i.e. whether we are
2992 deferring, whether to continue deferring, and all data necessary to come
2993 back and perform all deferred transformations. */
2995 class fma_deferring_state
2997 public:
2998 /* Class constructor. Pass true as PERFORM_DEFERRING in order to actually
2999 do any deferring. */
3001 fma_deferring_state (bool perform_deferring)
3002 : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
3003 m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
3005 /* List of FMA candidates for which the transformation has been determined
3006 possible but which, at this point in the BB analysis, we do not consider
3007 beneficial. */
3008 auto_vec<fma_transformation_info, 8> m_candidates;
3010 /* Set of multiplication results that are part of already deferred FMA
3011 candidates. */
3012 hash_set<tree> m_mul_result_set;
3014 /* The PHI that supposedly feeds back the result of one FMA to another over
3015 the loop boundary. */
3016 gphi *m_initial_phi;
3018 /* Result of the last produced FMA candidate or NULL if there has not been
3019 one. */
3020 tree m_last_result;
3022 /* If true, deferring might still be profitable. If false, transform all
3023 candidates and no longer defer. */
3024 bool m_deferring_p;
3027 /* Transform all deferred FMA candidates and mark STATE as no longer
3028 deferring. */
3030 static void
3031 cancel_fma_deferring (fma_deferring_state *state)
3033 if (!state->m_deferring_p)
3034 return;
3036 for (unsigned i = 0; i < state->m_candidates.length (); i++)
3038 if (dump_file && (dump_flags & TDF_DETAILS))
3039 fprintf (dump_file, "Generating deferred FMA\n");
3041 const fma_transformation_info &fti = state->m_candidates[i];
3042 convert_mult_to_fma_1 (fti.mul_result, fti.op1, fti.op2);
3044 gimple_stmt_iterator gsi = gsi_for_stmt (fti.mul_stmt);
3045 gsi_remove (&gsi, true);
3046 release_defs (fti.mul_stmt);
3048 state->m_deferring_p = false;
3051 /* If OP is an SSA name defined by a PHI node, return the PHI statement.
3052 Otherwise return NULL. */
3054 static gphi *
3055 result_of_phi (tree op)
3057 if (TREE_CODE (op) != SSA_NAME)
3058 return NULL;
3060 return dyn_cast <gphi *> (SSA_NAME_DEF_STMT (op));
3063 /* After processing statements of a BB and recording STATE, return true if the
3064 initial phi is fed by the last FMA candidate result or one such result from
3065 previously processed BBs marked in LAST_RESULT_SET. */
3067 static bool
3068 last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
3069 hash_set<tree> *last_result_set)
3071 ssa_op_iter iter;
3072 use_operand_p use;
3073 FOR_EACH_PHI_ARG (use, state->m_initial_phi, iter, SSA_OP_USE)
3075 tree t = USE_FROM_PTR (use);
3076 if (t == state->m_last_result
3077 || last_result_set->contains (t))
3078 return true;
3081 return false;
3084 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3085 with uses in additions and subtractions to form fused multiply-add
3086 operations. Returns true if successful and MUL_STMT should be removed.
3087 If MUL_COND is nonnull, the multiplication in MUL_STMT is conditional
3088 on MUL_COND, otherwise it is unconditional.
3090 If STATE indicates that we are deferring FMA transformation, that means
3091 that we do not produce FMAs for basic blocks which look like:
3093 <bb 6>
3094 # accumulator_111 = PHI <0.0(5), accumulator_66(6)>
3095 _65 = _14 * _16;
3096 accumulator_66 = _65 + accumulator_111;
3098 or its unrolled version, i.e. with several FMA candidates that feed the result
3099 of one into the addend of another. Instead, we add them to a list in STATE
3100 and if we later discover an FMA candidate that is not part of such a chain,
3101 we go back and perform all deferred past candidates. */
3103 static bool
3104 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
3105 fma_deferring_state *state, tree mul_cond = NULL_TREE)
3107 tree mul_result = gimple_get_lhs (mul_stmt);
3108 tree type = TREE_TYPE (mul_result);
3109 gimple *use_stmt, *neguse_stmt;
3110 use_operand_p use_p;
3111 imm_use_iterator imm_iter;
3113 if (FLOAT_TYPE_P (type)
3114 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3115 return false;
3117 /* We don't want to do bitfield reduction ops. */
3118 if (INTEGRAL_TYPE_P (type)
3119 && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
3120 return false;
3122 /* If the target doesn't support it, don't generate it. We assume that
3123 if fma isn't available then fms, fnma or fnms are not either. */
3124 optimization_type opt_type = bb_optimization_type (gimple_bb (mul_stmt));
3125 if (!direct_internal_fn_supported_p (IFN_FMA, type, opt_type))
3126 return false;
3128 /* If the multiplication has zero uses, it is kept around probably because
3129 of -fnon-call-exceptions. Don't optimize it away in that case,
3130 that is DCE's job. */
3131 if (has_zero_uses (mul_result))
3132 return false;
3134 bool check_defer
3135 = (state->m_deferring_p
3136 && (tree_to_shwi (TYPE_SIZE (type))
3137 <= param_avoid_fma_max_bits));
3138 bool defer = check_defer;
3139 bool seen_negate_p = false;
3140 /* Make sure that the multiplication statement becomes dead after
3141 the transformation, thus that all uses are transformed to FMAs.
3142 This means we assume that an FMA operation has the same cost
3143 as an addition. */
3144 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3146 tree result = mul_result;
3147 bool negate_p = false;
3149 use_stmt = USE_STMT (use_p);
3151 if (is_gimple_debug (use_stmt))
3152 continue;
3154 /* For now restrict this operation to single basic blocks. In theory
3155 we would want to support sinking the multiplication in
3156 m = a*b;
3157 if ()
3158 ma = m + c;
3159 else
3160 d = m;
3161 to form a fma in the then block and sink the multiplication to the
3162 else block. */
3163 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3164 return false;
3166 /* A negate on the multiplication leads to FNMA. */
3167 if (is_gimple_assign (use_stmt)
3168 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
3170 ssa_op_iter iter;
3171 use_operand_p usep;
3173 /* If (due to earlier missed optimizations) we have two
3174 negates of the same value, treat them as equivalent
3175 to a single negate with multiple uses. */
3176 if (seen_negate_p)
3177 return false;
3179 result = gimple_assign_lhs (use_stmt);
3181 /* Make sure the negate statement becomes dead with this
3182 single transformation. */
3183 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3184 &use_p, &neguse_stmt))
3185 return false;
3187 /* Make sure the multiplication isn't also used on that stmt. */
3188 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3189 if (USE_FROM_PTR (usep) == mul_result)
3190 return false;
3192 /* Re-validate. */
3193 use_stmt = neguse_stmt;
3194 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3195 return false;
3197 negate_p = seen_negate_p = true;
3200 tree cond, else_value, ops[3];
3201 tree_code code;
3202 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code, ops,
3203 &else_value))
3204 return false;
3206 switch (code)
3208 case MINUS_EXPR:
3209 if (ops[1] == result)
3210 negate_p = !negate_p;
3211 break;
3212 case PLUS_EXPR:
3213 break;
3214 default:
3215 /* FMA can only be formed from PLUS and MINUS. */
3216 return false;
3219 if (mul_cond && cond != mul_cond)
3220 return false;
3222 if (cond)
3224 if (cond == result || else_value == result)
3225 return false;
3226 if (!direct_internal_fn_supported_p (IFN_COND_FMA, type, opt_type))
3227 return false;
3230 /* If the subtrahend (OPS[1]) is computed by a MULT_EXPR that
3231 we'll visit later, we might be able to get a more profitable
3232 match with fnma.
3233 OTOH, if we don't, a negate / fma pair has likely lower latency
3234 than a mult / subtract pair. */
3235 if (code == MINUS_EXPR
3236 && !negate_p
3237 && ops[0] == result
3238 && !direct_internal_fn_supported_p (IFN_FMS, type, opt_type)
3239 && direct_internal_fn_supported_p (IFN_FNMA, type, opt_type)
3240 && TREE_CODE (ops[1]) == SSA_NAME
3241 && has_single_use (ops[1]))
3243 gimple *stmt2 = SSA_NAME_DEF_STMT (ops[1]);
3244 if (is_gimple_assign (stmt2)
3245 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3246 return false;
3249 /* We can't handle a * b + a * b. */
3250 if (ops[0] == ops[1])
3251 return false;
3252 /* If deferring, make sure we are not looking at an instruction that
3253 wouldn't have existed if we were not. */
3254 if (state->m_deferring_p
3255 && (state->m_mul_result_set.contains (ops[0])
3256 || state->m_mul_result_set.contains (ops[1])))
3257 return false;
3259 if (check_defer)
3261 tree use_lhs = gimple_get_lhs (use_stmt);
3262 if (state->m_last_result)
3264 if (ops[1] == state->m_last_result
3265 || ops[0] == state->m_last_result)
3266 defer = true;
3267 else
3268 defer = false;
3270 else
3272 gcc_checking_assert (!state->m_initial_phi);
3273 gphi *phi;
3274 if (ops[0] == result)
3275 phi = result_of_phi (ops[1]);
3276 else
3278 gcc_assert (ops[1] == result);
3279 phi = result_of_phi (ops[0]);
3282 if (phi)
3284 state->m_initial_phi = phi;
3285 defer = true;
3287 else
3288 defer = false;
3291 state->m_last_result = use_lhs;
3292 check_defer = false;
3294 else
3295 defer = false;
3297 /* While it is possible to validate whether or not the exact form that
3298 we've recognized is available in the backend, the assumption is that
3299 if the deferring logic above did not trigger, the transformation is
3300 never a loss. For instance, suppose the target only has the plain FMA
3301 pattern available. Consider a*b-c -> fma(a,b,-c): we've exchanged
3302 MUL+SUB for FMA+NEG, which is still two operations. Consider
3303 -(a*b)-c -> fma(-a,b,-c): we still have 3 operations, but in the FMA
3304 form the two NEGs are independent and could be run in parallel. */
3307 if (defer)
3309 fma_transformation_info fti;
3310 fti.mul_stmt = mul_stmt;
3311 fti.mul_result = mul_result;
3312 fti.op1 = op1;
3313 fti.op2 = op2;
3314 state->m_candidates.safe_push (fti);
3315 state->m_mul_result_set.add (mul_result);
3317 if (dump_file && (dump_flags & TDF_DETAILS))
3319 fprintf (dump_file, "Deferred generating FMA for multiplication ");
3320 print_gimple_stmt (dump_file, mul_stmt, 0, TDF_NONE);
3321 fprintf (dump_file, "\n");
3324 return false;
3326 else
3328 if (state->m_deferring_p)
3329 cancel_fma_deferring (state);
3330 convert_mult_to_fma_1 (mul_result, op1, op2);
3331 return true;
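/* Deferring matters mostly for reduction loops such as (illustrative)

     for (i = 0; i < n; i++)
       acc = acc + a[i] * b[i];

   where each candidate FMA would feed the addend of the next one through
   the loop PHI.  On targets that set param_avoid_fma_max_bits for the mode,
   such chains are left as separate multiplies and adds so the loop-carried
   dependency only pays the add latency.  */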
3336 /* Helper function of match_uaddsub_overflow. Return 1
3337 if USE_STMT is unsigned overflow check ovf != 0 for
3338 STMT, -1 if USE_STMT is unsigned overflow check ovf == 0
3339 and 0 otherwise. */
3341 static int
3342 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3344 enum tree_code ccode = ERROR_MARK;
3345 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3346 if (gimple_code (use_stmt) == GIMPLE_COND)
3348 ccode = gimple_cond_code (use_stmt);
3349 crhs1 = gimple_cond_lhs (use_stmt);
3350 crhs2 = gimple_cond_rhs (use_stmt);
3352 else if (is_gimple_assign (use_stmt))
3354 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3356 ccode = gimple_assign_rhs_code (use_stmt);
3357 crhs1 = gimple_assign_rhs1 (use_stmt);
3358 crhs2 = gimple_assign_rhs2 (use_stmt);
3360 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3362 tree cond = gimple_assign_rhs1 (use_stmt);
3363 if (COMPARISON_CLASS_P (cond))
3365 ccode = TREE_CODE (cond);
3366 crhs1 = TREE_OPERAND (cond, 0);
3367 crhs2 = TREE_OPERAND (cond, 1);
3369 else
3370 return 0;
3372 else
3373 return 0;
3375 else
3376 return 0;
3378 if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3379 return 0;
3381 enum tree_code code = gimple_assign_rhs_code (stmt);
3382 tree lhs = gimple_assign_lhs (stmt);
3383 tree rhs1 = gimple_assign_rhs1 (stmt);
3384 tree rhs2 = gimple_assign_rhs2 (stmt);
3386 switch (ccode)
3388 case GT_EXPR:
3389 case LE_EXPR:
3390 /* r = a - b; r > a or r <= a
3391 r = a + b; a > r or a <= r or b > r or b <= r. */
3392 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3393 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3394 && crhs2 == lhs))
3395 return ccode == GT_EXPR ? 1 : -1;
3396 break;
3397 case LT_EXPR:
3398 case GE_EXPR:
3399 /* r = a - b; a < r or a >= r
3400 r = a + b; r < a or r >= a or r < b or r >= b. */
3401 if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3402 || (code == PLUS_EXPR && crhs1 == lhs
3403 && (crhs2 == rhs1 || crhs2 == rhs2)))
3404 return ccode == LT_EXPR ? 1 : -1;
3405 break;
3406 default:
3407 break;
3409 return 0;
3412 /* Recognize for unsigned x
3413 x = y - z;
3414 if (x > y)
3415 where there are other uses of x and replace it with
3416 _7 = SUB_OVERFLOW (y, z);
3417 x = REALPART_EXPR <_7>;
3418 _8 = IMAGPART_EXPR <_7>;
3419 if (_8)
3420 and similarly for addition. */
3422 static bool
3423 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3424 enum tree_code code)
3426 tree lhs = gimple_assign_lhs (stmt);
3427 tree type = TREE_TYPE (lhs);
3428 use_operand_p use_p;
3429 imm_use_iterator iter;
3430 bool use_seen = false;
3431 bool ovf_use_seen = false;
3432 gimple *use_stmt;
3434 gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3435 if (!INTEGRAL_TYPE_P (type)
3436 || !TYPE_UNSIGNED (type)
3437 || has_zero_uses (lhs)
3438 || has_single_use (lhs)
3439 || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3440 TYPE_MODE (type)) == CODE_FOR_nothing)
3441 return false;
3443 FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3445 use_stmt = USE_STMT (use_p);
3446 if (is_gimple_debug (use_stmt))
3447 continue;
3449 if (uaddsub_overflow_check_p (stmt, use_stmt))
3450 ovf_use_seen = true;
3451 else
3452 use_seen = true;
3453 if (ovf_use_seen && use_seen)
3454 break;
3457 if (!ovf_use_seen || !use_seen)
3458 return false;
3460 tree ctype = build_complex_type (type);
3461 tree rhs1 = gimple_assign_rhs1 (stmt);
3462 tree rhs2 = gimple_assign_rhs2 (stmt);
3463 gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3464 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3465 2, rhs1, rhs2);
3466 tree ctmp = make_ssa_name (ctype);
3467 gimple_call_set_lhs (g, ctmp);
3468 gsi_insert_before (gsi, g, GSI_SAME_STMT);
3469 gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3470 build1 (REALPART_EXPR, type, ctmp));
3471 gsi_replace (gsi, g2, true);
3472 tree ovf = make_ssa_name (type);
3473 g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3474 build1 (IMAGPART_EXPR, type, ctmp));
3475 gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3477 FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3479 if (is_gimple_debug (use_stmt))
3480 continue;
3482 int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3483 if (ovf_use == 0)
3484 continue;
3485 if (gimple_code (use_stmt) == GIMPLE_COND)
3487 gcond *cond_stmt = as_a <gcond *> (use_stmt);
3488 gimple_cond_set_lhs (cond_stmt, ovf);
3489 gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3490 gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3492 else
3494 gcc_checking_assert (is_gimple_assign (use_stmt));
3495 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3497 gimple_assign_set_rhs1 (use_stmt, ovf);
3498 gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3499 gimple_assign_set_rhs_code (use_stmt,
3500 ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3502 else
3504 gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3505 == COND_EXPR);
3506 tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3507 boolean_type_node, ovf,
3508 build_int_cst (type, 0));
3509 gimple_assign_set_rhs1 (use_stmt, cond);
3512 update_stmt (use_stmt);
3514 return true;
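/* At the source level the matched idiom looks like (hypothetical input):

     unsigned r = a - b;
     if (r > a)              borrow happened
       handle_overflow ();
     use (r);                r also has non-overflow uses

   and is rewritten to compute the difference and the overflow flag with one
   IFN_SUB_OVERFLOW, much as __builtin_sub_overflow (a, b, &r) would.  */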
3517 /* Return true if target has support for divmod. */
3519 static bool
3520 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3522 /* If target supports hardware divmod insn, use it for divmod. */
3523 if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3524 return true;
3526 /* Check if libfunc for divmod is available. */
3527 rtx libfunc = optab_libfunc (divmod_optab, mode);
3528 if (libfunc != NULL_RTX)
3530 /* If optab_handler exists for div_optab, perhaps in a wider mode,
3531 we don't want to use the libfunc even if it exists for the given mode. */
3532 machine_mode div_mode;
3533 FOR_EACH_MODE_FROM (div_mode, mode)
3534 if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3535 return false;
3537 return targetm.expand_divmod_libfunc != NULL;
3540 return false;
3543 /* Check if stmt is candidate for divmod transform. */
3545 static bool
3546 divmod_candidate_p (gassign *stmt)
3548 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3549 machine_mode mode = TYPE_MODE (type);
3550 optab divmod_optab, div_optab;
3552 if (TYPE_UNSIGNED (type))
3554 divmod_optab = udivmod_optab;
3555 div_optab = udiv_optab;
3557 else
3559 divmod_optab = sdivmod_optab;
3560 div_optab = sdiv_optab;
3563 tree op1 = gimple_assign_rhs1 (stmt);
3564 tree op2 = gimple_assign_rhs2 (stmt);
3566 /* Disable the transform if either is a constant, since division-by-constant
3567 may have specialized expansion. */
3568 if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3569 return false;
3571 /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3572 expand using the [su]divv optabs. */
3573 if (TYPE_OVERFLOW_TRAPS (type))
3574 return false;
3576 if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3577 return false;
3579 return true;
3582 /* This function looks for:
3583 t1 = a TRUNC_DIV_EXPR b;
3584 t2 = a TRUNC_MOD_EXPR b;
3585 and transforms it to the following sequence:
3586 complex_tmp = DIVMOD (a, b);
3587 t1 = REALPART_EXPR (complex_tmp);
3588 t2 = IMAGPART_EXPR (complex_tmp);
3589 For conditions enabling the transform see divmod_candidate_p().
3591 The pass has three parts:
3592 1) Find top_stmt which is trunc_div or trunc_mod stmt and dominates all
3593 other trunc_div_expr and trunc_mod_expr stmts.
3594 2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
3595 to stmts vector.
3596 3) Insert DIVMOD call just before top_stmt and update entries in
3597 stmts vector to use return value of DIVMOD (REALPART_EXPR for div,
3598 IMAGPART_EXPR for mod). */
3600 static bool
3601 convert_to_divmod (gassign *stmt)
3603 if (stmt_can_throw_internal (cfun, stmt)
3604 || !divmod_candidate_p (stmt))
3605 return false;
3607 tree op1 = gimple_assign_rhs1 (stmt);
3608 tree op2 = gimple_assign_rhs2 (stmt);
3610 imm_use_iterator use_iter;
3611 gimple *use_stmt;
3612 auto_vec<gimple *> stmts;
3614 gimple *top_stmt = stmt;
3615 basic_block top_bb = gimple_bb (stmt);
3617 /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
3618 at least stmt and possibly other trunc_div/trunc_mod stmts
3619 having the same operands as stmt. */
3621 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
3623 if (is_gimple_assign (use_stmt)
3624 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3625 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3626 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
3627 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
3629 if (stmt_can_throw_internal (cfun, use_stmt))
3630 continue;
3632 basic_block bb = gimple_bb (use_stmt);
3634 if (bb == top_bb)
3636 if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
3637 top_stmt = use_stmt;
3639 else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
3641 top_bb = bb;
3642 top_stmt = use_stmt;
3647 tree top_op1 = gimple_assign_rhs1 (top_stmt);
3648 tree top_op2 = gimple_assign_rhs2 (top_stmt);
3650 stmts.safe_push (top_stmt);
3651 bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
3653 /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
3654 to stmts vector. The 2nd loop will always add stmt to stmts vector, since
3655 gimple_bb (top_stmt) dominates gimple_bb (stmt), so the
3656 2nd loop ends up adding at least a single trunc_mod_expr stmt. */
3658 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
3660 if (is_gimple_assign (use_stmt)
3661 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3662 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3663 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
3664 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
3666 if (use_stmt == top_stmt
3667 || stmt_can_throw_internal (cfun, use_stmt)
3668 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
3669 continue;
3671 stmts.safe_push (use_stmt);
3672 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
3673 div_seen = true;
3677 if (!div_seen)
3678 return false;
3680 /* Part 3: Create libcall to internal fn DIVMOD:
3681 divmod_tmp = DIVMOD (op1, op2). */
3683 gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
3684 tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
3685 call_stmt, "divmod_tmp");
3686 gimple_call_set_lhs (call_stmt, res);
3687 /* We rejected throwing statements above. */
3688 gimple_call_set_nothrow (call_stmt, true);
3690 /* Insert the call before top_stmt. */
3691 gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
3692 gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
3694 widen_mul_stats.divmod_calls_inserted++;
3696 /* Update all statements in stmts vector:
3697 lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
3698 lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>. */
3700 for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
3702 tree new_rhs;
3704 switch (gimple_assign_rhs_code (use_stmt))
3706 case TRUNC_DIV_EXPR:
3707 new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
3708 break;
3710 case TRUNC_MOD_EXPR:
3711 new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
3712 break;
3714 default:
3715 gcc_unreachable ();
3718 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3719 gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
3720 update_stmt (use_stmt);
3723 return true;
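/* A C-level sketch (assumed example): when the target supports divmod,

     q = a / b;
     if (cond)
       r = a % b;

   both statements are rewritten, even across basic blocks, as long as the
   division dominates the modulo; a single DIVMOD (a, b) call is inserted
   before the dominating statement and q/r become its REALPART_EXPR and
   IMAGPART_EXPR projections.  */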
3726 /* Find integer multiplications where the operands are extended from
3727 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3728 where appropriate. */
3730 namespace {
3732 const pass_data pass_data_optimize_widening_mul =
3734 GIMPLE_PASS, /* type */
3735 "widening_mul", /* name */
3736 OPTGROUP_NONE, /* optinfo_flags */
3737 TV_TREE_WIDEN_MUL, /* tv_id */
3738 PROP_ssa, /* properties_required */
3739 0, /* properties_provided */
3740 0, /* properties_destroyed */
3741 0, /* todo_flags_start */
3742 TODO_update_ssa, /* todo_flags_finish */
3745 class pass_optimize_widening_mul : public gimple_opt_pass
3747 public:
3748 pass_optimize_widening_mul (gcc::context *ctxt)
3749 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3752 /* opt_pass methods: */
3753 virtual bool gate (function *)
3755 return flag_expensive_optimizations && optimize;
3758 virtual unsigned int execute (function *);
3760 }; // class pass_optimize_widening_mul
3762 /* Walker class to perform the transformation in reverse dominance order. */
3764 class math_opts_dom_walker : public dom_walker
3766 public:
3767 /* Constructor, CFG_CHANGED is a pointer to a boolean flag that will be set
3768 if walking modifies the CFG. */
3770 math_opts_dom_walker (bool *cfg_changed_p)
3771 : dom_walker (CDI_DOMINATORS), m_last_result_set (),
3772 m_cfg_changed_p (cfg_changed_p) {}
3774 /* The actual actions performed in the walk. */
3776 virtual void after_dom_children (basic_block);
3778 /* Set of results of chains of multiply and add statement combinations that
3779 were not transformed into FMAs because of active deferring. */
3780 hash_set<tree> m_last_result_set;
3782 /* Pointer to a flag of the user that needs to be set if CFG has been
3783 modified. */
3784 bool *m_cfg_changed_p;
3787 void
3788 math_opts_dom_walker::after_dom_children (basic_block bb)
3790 gimple_stmt_iterator gsi;
3792 fma_deferring_state fma_state (param_avoid_fma_max_bits > 0);
3794 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3796 gimple *stmt = gsi_stmt (gsi);
3797 enum tree_code code;
3799 if (is_gimple_assign (stmt))
3801 code = gimple_assign_rhs_code (stmt);
3802 switch (code)
3804 case MULT_EXPR:
3805 if (!convert_mult_to_widen (stmt, &gsi)
3806 && !convert_expand_mult_copysign (stmt, &gsi)
3807 && convert_mult_to_fma (stmt,
3808 gimple_assign_rhs1 (stmt),
3809 gimple_assign_rhs2 (stmt),
3810 &fma_state))
3812 gsi_remove (&gsi, true);
3813 release_defs (stmt);
3814 continue;
3816 break;
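/* Sketch of the code handled by the MULT_EXPR case above, with invented
   variable names:

     double t = a * b;
     double d = t + c;   // convert_mult_to_fma inspects the uses of t and
                         // may fuse the pair when the target has a fused
                         // multiply-add, after which the multiply is removed

     double s = x * __builtin_copysign (1.0, y);
                         // convert_expand_mult_copysign targets this shape;
                         // it can be implemented as a sign-bit xor on
                         // targets that support it

   convert_mult_to_widen covers the widening case described in the comment
   before this pass.  */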
3818 case PLUS_EXPR:
3819 case MINUS_EXPR:
3820 if (!convert_plusminus_to_widen (&gsi, stmt, code))
3821 match_uaddsub_overflow (&gsi, stmt, code);
3822 break;
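/* The PLUS/MINUS case feeds match_uaddsub_overflow, which recognizes
   open-coded unsigned overflow checks.  A rough sketch, with hypothetical
   names and an assumed handle_overflow callee:

     unsigned r = a + b;
     if (r < a)            // overflow test on the wrapped result
       handle_overflow ();

   This can be rewritten to use .ADD_OVERFLOW (a, b), whose complex result
   carries the sum in the real part and the overflow flag in the imaginary
   part, so the comparison becomes a read of that flag.  */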
3824 case TRUNC_MOD_EXPR:
3825 convert_to_divmod (as_a<gassign *> (stmt));
3826 break;
3828 default:;
3831 else if (is_gimple_call (stmt))
3833 switch (gimple_call_combined_fn (stmt))
3835 CASE_CFN_POW:
3836 if (gimple_call_lhs (stmt)
3837 && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3838 && real_equal (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3839 &dconst2)
3840 && convert_mult_to_fma (stmt,
3841 gimple_call_arg (stmt, 0),
3842 gimple_call_arg (stmt, 0),
3843 &fma_state))
3845 unlink_stmt_vdef (stmt);
3846 if (gsi_remove (&gsi, true)
3847 && gimple_purge_dead_eh_edges (bb))
3848 *m_cfg_changed_p = true;
3849 release_defs (stmt);
3850 continue;
3852 break;
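/* Sketch of the pow case above: a call with a literal exponent of exactly
   2.0 is effectively x * x, so it can take part in FMA formation too.
   With hypothetical variables:

     double t = pow (x, 2.0);
     double d = t + c;     // may be fused into a multiply-add of x, x, c

   Note the REAL_CST/dconst2 guard: only the constant 2.0 exponent is
   treated this way here.  */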
3854 case CFN_COND_MUL:
3855 if (convert_mult_to_fma (stmt,
3856 gimple_call_arg (stmt, 1),
3857 gimple_call_arg (stmt, 2),
3858 &fma_state,
3859 gimple_call_arg (stmt, 0)))
3862 gsi_remove (&gsi, true);
3863 release_defs (stmt);
3864 continue;
3866 break;
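/* CFN_COND_MUL is the conditional multiply used mainly by vectorized or
   predicated code, roughly "mask ? a * b : else_value".  Passing the mask
   as the conditional argument lets convert_mult_to_fma form a conditional
   fused multiply-add where the target supports one.  A hedged scalar
   sketch with invented names:

     t = mask ? a * b : else_val;
     d = mask ? t + c : else_val2;   // candidate for a conditional FMA  */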
3868 case CFN_LAST:
3869 cancel_fma_deferring (&fma_state);
3870 break;
3872 default:
3873 break;
3876 gsi_next (&gsi);
3878 if (fma_state.m_deferring_p
3879 && fma_state.m_initial_phi)
3881 gcc_checking_assert (fma_state.m_last_result);
3882 if (!last_fma_candidate_feeds_initial_phi (&fma_state,
3883 &m_last_result_set))
3884 cancel_fma_deferring (&fma_state);
3885 else
3886 m_last_result_set.add (fma_state.m_last_result);
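/* The deferring logic above targets reduction-style loops where a chain of
   fused multiply-adds would lengthen the loop-carried dependence through
   the loop-header PHI, e.g. (with invented names):

     for (i = 0; i < n; i++)
       acc = acc + a[i] * b[i];   // acc feeds the loop-header PHI

   When param_avoid_fma_max_bits is set for the target, such candidates are
   deferred.  If the deferred chain really feeds the initial PHI, its result
   is recorded in m_last_result_set and the statements stay as separate
   multiply and add; otherwise deferring is cancelled and the candidates are
   turned into FMAs after all.  */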
3891 unsigned int
3892 pass_optimize_widening_mul::execute (function *fun)
3894 bool cfg_changed = false;
3896 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3897 calculate_dominance_info (CDI_DOMINATORS);
3898 renumber_gimple_stmt_uids (cfun);
3900 math_opts_dom_walker (&cfg_changed).walk (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3902 statistics_counter_event (fun, "widening multiplications inserted",
3903 widen_mul_stats.widen_mults_inserted);
3904 statistics_counter_event (fun, "widening maccs inserted",
3905 widen_mul_stats.maccs_inserted);
3906 statistics_counter_event (fun, "fused multiply-adds inserted",
3907 widen_mul_stats.fmas_inserted);
3908 statistics_counter_event (fun, "divmod calls inserted",
3909 widen_mul_stats.divmod_calls_inserted);
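/* The counters reported above can be inspected with the pass-statistics
   dump (e.g. -fdump-statistics-stats), which lists per-pass counts such as
   "divmod calls inserted" for the compiled translation unit.  */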
3911 return cfg_changed ? TODO_cleanup_cfg : 0;
3914 } // anon namespace
3916 gimple_opt_pass *
3917 make_pass_optimize_widening_mul (gcc::context *ctxt)
3919 return new pass_optimize_widening_mul (ctxt);