gcc/tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
43 First of all, with some experiments it was found out that the
44 transformation is not always useful if there are only two divisions
45    by the same divisor.  This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has less than N divisions *post-dominating* it.
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
63    and walks it twice.  The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
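/* As a rough GIMPLE-level illustration (the SSA names are invented, and
   the exact placement depends on the dominator walk described above), a
   function containing

     x_5 = a_2 / d_1;
     y_6 = b_3 / d_1;
     z_7 = c_4 / d_1;

   is rewritten, once enough divisions by d_1 have been seen, into

     reciptmp_8 = 1.0e+0 / d_1;
     x_5 = a_2 * reciptmp_8;
     y_6 = b_3 * reciptmp_8;
     z_7 = c_4 * reciptmp_8;

   "reciptmp" is the temporary name actually used by insert_reciprocals
   below.  */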
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "params.h"
113 #include "case-cfn-macros.h"
115 /* This structure represents one basic block that either computes a
116    division, or is a common dominator for basic blocks that compute a
117 division. */
118 struct occurrence {
119 /* The basic block represented by this structure. */
120 basic_block bb;
122 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
123 inserted in BB. */
124 tree recip_def;
126 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
127 was inserted in BB. */
128 gimple *recip_def_stmt;
130 /* Pointer to a list of "struct occurrence"s for blocks dominated
131 by BB. */
132 struct occurrence *children;
134   /* Pointer to the next "struct occurrence" in the list of blocks
135 sharing a common dominator. */
136 struct occurrence *next;
138 /* The number of divisions that are in BB before compute_merit. The
139 number of divisions that are in BB or post-dominate it after
140 compute_merit. */
141 int num_divisions;
143 /* True if the basic block has a division, false if it is a common
144 dominator for basic blocks that do. If it is false and trapping
145 math is active, BB is not a candidate for inserting a reciprocal. */
146 bool bb_has_division;
149 static struct
151 /* Number of 1.0/X ops inserted. */
152 int rdivs_inserted;
154 /* Number of 1.0/FUNC ops inserted. */
155 int rfuncs_inserted;
156 } reciprocal_stats;
158 static struct
160 /* Number of cexpi calls inserted. */
161 int inserted;
162 } sincos_stats;
164 static struct
166 /* Number of hand-written 16-bit nop / bswaps found. */
167 int found_16bit;
169 /* Number of hand-written 32-bit nop / bswaps found. */
170 int found_32bit;
172 /* Number of hand-written 64-bit nop / bswaps found. */
173 int found_64bit;
174 } nop_stats, bswap_stats;
176 static struct
178 /* Number of widening multiplication ops inserted. */
179 int widen_mults_inserted;
181 /* Number of integer multiply-and-accumulate ops inserted. */
182 int maccs_inserted;
184 /* Number of fp fused multiply-add ops inserted. */
185 int fmas_inserted;
186 } widen_mul_stats;
188 /* The instance of "struct occurrence" representing the highest
189 interesting block in the dominator tree. */
190 static struct occurrence *occ_head;
192 /* Allocation pool for getting instances of "struct occurrence". */
193 static object_allocator<occurrence> *occ_pool;
197 /* Allocate and return a new struct occurrence for basic block BB,
198 whose children list is headed by CHILDREN. */
199 static struct occurrence *
200 occ_new (basic_block bb, struct occurrence *children)
202 struct occurrence *occ;
204 bb->aux = occ = occ_pool->allocate ();
205 memset (occ, 0, sizeof (struct occurrence));
207 occ->bb = bb;
208 occ->children = children;
209 return occ;
213 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
214 list of "struct occurrence"s, one per basic block, having IDOM as
215 their common dominator.
217 We try to insert NEW_OCC as deep as possible in the tree, and we also
218 insert any other block that is a common dominator for BB and one
219 block already in the tree. */
221 static void
222 insert_bb (struct occurrence *new_occ, basic_block idom,
223 struct occurrence **p_head)
225 struct occurrence *occ, **p_occ;
227 for (p_occ = p_head; (occ = *p_occ) != NULL; )
229 basic_block bb = new_occ->bb, occ_bb = occ->bb;
230 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
231 if (dom == bb)
233 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
234 from its list. */
235 *p_occ = occ->next;
236 occ->next = new_occ->children;
237 new_occ->children = occ;
239 /* Try the next block (it may as well be dominated by BB). */
242 else if (dom == occ_bb)
244 /* OCC_BB dominates BB. Tail recurse to look deeper. */
245 insert_bb (new_occ, dom, &occ->children);
246 return;
249 else if (dom != idom)
251 gcc_assert (!dom->aux);
253 /* There is a dominator between IDOM and BB, add it and make
254 two children out of NEW_OCC and OCC. First, remove OCC from
255 its list. */
256 *p_occ = occ->next;
257 new_occ->next = occ;
258 occ->next = NULL;
260 /* None of the previous blocks has DOM as a dominator: if we tail
261 recursed, we would reexamine them uselessly. Just switch BB with
262 DOM, and go on looking for blocks dominated by DOM. */
263 new_occ = occ_new (dom, new_occ);
266 else
268 /* Nothing special, go on with the next element. */
269 p_occ = &occ->next;
273 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
274 new_occ->next = *p_head;
275 *p_head = new_occ;
278 /* Register that we found a division in BB. */
280 static inline void
281 register_division_in (basic_block bb)
283 struct occurrence *occ;
285 occ = (struct occurrence *) bb->aux;
286 if (!occ)
288 occ = occ_new (bb, NULL);
289 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
292 occ->bb_has_division = true;
293 occ->num_divisions++;
297 /* Compute the number of divisions that postdominate each block in OCC and
298 its children. */
300 static void
301 compute_merit (struct occurrence *occ)
303 struct occurrence *occ_child;
304 basic_block dom = occ->bb;
306 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
308 basic_block bb;
309 if (occ_child->children)
310 compute_merit (occ_child);
312 if (flag_exceptions)
313 bb = single_noncomplex_succ (dom);
314 else
315 bb = dom;
317 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
318 occ->num_divisions += occ_child->num_divisions;
323 /* Return whether USE_STMT is a floating-point division by DEF. */
324 static inline bool
325 is_division_by (gimple *use_stmt, tree def)
327 return is_gimple_assign (use_stmt)
328 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
329 && gimple_assign_rhs2 (use_stmt) == def
330 /* Do not recognize x / x as valid division, as we are getting
331 confused later by replacing all immediate uses x in such
332 a stmt. */
333 && gimple_assign_rhs1 (use_stmt) != def;
336 /* Walk the subset of the dominator tree rooted at OCC, setting the
337 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
338 the given basic block. The field may be left NULL, of course,
339 if it is not possible or profitable to do the optimization.
341 DEF_BSI is an iterator pointing at the statement defining DEF.
342 If RECIP_DEF is set, a dominator already has a computation that can
343 be used. */
345 static void
346 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
347 tree def, tree recip_def, int threshold)
349 tree type;
350 gassign *new_stmt;
351 gimple_stmt_iterator gsi;
352 struct occurrence *occ_child;
354 if (!recip_def
355 && (occ->bb_has_division || !flag_trapping_math)
356 && occ->num_divisions >= threshold)
358 /* Make a variable with the replacement and substitute it. */
359 type = TREE_TYPE (def);
360 recip_def = create_tmp_reg (type, "reciptmp");
361 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
362 build_one_cst (type), def);
364 if (occ->bb_has_division)
366 /* Case 1: insert before an existing division. */
367 gsi = gsi_after_labels (occ->bb);
368 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
369 gsi_next (&gsi);
371 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
373 else if (def_gsi && occ->bb == def_gsi->bb)
375 /* Case 2: insert right after the definition. Note that this will
376 never happen if the definition statement can throw, because in
377 that case the sole successor of the statement's basic block will
378 dominate all the uses as well. */
379 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
381 else
383 /* Case 3: insert in a basic block not containing defs/uses. */
384 gsi = gsi_after_labels (occ->bb);
385 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
388 reciprocal_stats.rdivs_inserted++;
390 occ->recip_def_stmt = new_stmt;
393 occ->recip_def = recip_def;
394 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
395 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
399 /* Replace the division at USE_P with a multiplication by the reciprocal, if
400 possible. */
402 static inline void
403 replace_reciprocal (use_operand_p use_p)
405 gimple *use_stmt = USE_STMT (use_p);
406 basic_block bb = gimple_bb (use_stmt);
407 struct occurrence *occ = (struct occurrence *) bb->aux;
409 if (optimize_bb_for_speed_p (bb)
410 && occ->recip_def && use_stmt != occ->recip_def_stmt)
412 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
413 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
414 SET_USE (use_p, occ->recip_def);
415 fold_stmt_inplace (&gsi);
416 update_stmt (use_stmt);
421 /* Free OCC and return one more "struct occurrence" to be freed. */
423 static struct occurrence *
424 free_bb (struct occurrence *occ)
426 struct occurrence *child, *next;
428 /* First get the two pointers hanging off OCC. */
429 next = occ->next;
430 child = occ->children;
431 occ->bb->aux = NULL;
432 occ_pool->remove (occ);
434 /* Now ensure that we don't recurse unless it is necessary. */
435 if (!child)
436 return next;
437 else
439 while (next)
440 next = free_bb (next);
442 return child;
447 /* Look for floating-point divisions among DEF's uses, and try to
448 replace them by multiplications with the reciprocal. Add
449 as many statements computing the reciprocal as needed.
451 DEF must be a GIMPLE register of a floating-point type. */
453 static void
454 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
456 use_operand_p use_p;
457 imm_use_iterator use_iter;
458 struct occurrence *occ;
459 int count = 0, threshold;
461 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
463 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
465 gimple *use_stmt = USE_STMT (use_p);
466 if (is_division_by (use_stmt, def))
468 register_division_in (gimple_bb (use_stmt));
469 count++;
473 /* Do the expensive part only if we can hope to optimize something. */
474 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
475 if (count >= threshold)
477 gimple *use_stmt;
478 for (occ = occ_head; occ; occ = occ->next)
480 compute_merit (occ);
481 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
484 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
486 if (is_division_by (use_stmt, def))
488 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
489 replace_reciprocal (use_p);
494 for (occ = occ_head; occ; )
495 occ = free_bb (occ);
497 occ_head = NULL;
500 /* Go through all the floating-point SSA_NAMEs, and call
501 execute_cse_reciprocals_1 on each of them. */
502 namespace {
504 const pass_data pass_data_cse_reciprocals =
506 GIMPLE_PASS, /* type */
507 "recip", /* name */
508 OPTGROUP_NONE, /* optinfo_flags */
509 TV_NONE, /* tv_id */
510 PROP_ssa, /* properties_required */
511 0, /* properties_provided */
512 0, /* properties_destroyed */
513 0, /* todo_flags_start */
514 TODO_update_ssa, /* todo_flags_finish */
517 class pass_cse_reciprocals : public gimple_opt_pass
519 public:
520 pass_cse_reciprocals (gcc::context *ctxt)
521 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
524 /* opt_pass methods: */
525 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
526 virtual unsigned int execute (function *);
528 }; // class pass_cse_reciprocals
530 unsigned int
531 pass_cse_reciprocals::execute (function *fun)
533 basic_block bb;
534 tree arg;
536 occ_pool = new object_allocator<occurrence> ("dominators for recip");
538 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
539 calculate_dominance_info (CDI_DOMINATORS);
540 calculate_dominance_info (CDI_POST_DOMINATORS);
542 if (flag_checking)
543 FOR_EACH_BB_FN (bb, fun)
544 gcc_assert (!bb->aux);
546 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
547 if (FLOAT_TYPE_P (TREE_TYPE (arg))
548 && is_gimple_reg (arg))
550 tree name = ssa_default_def (fun, arg);
551 if (name)
552 execute_cse_reciprocals_1 (NULL, name);
555 FOR_EACH_BB_FN (bb, fun)
557 tree def;
559 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
560 gsi_next (&gsi))
562 gphi *phi = gsi.phi ();
563 def = PHI_RESULT (phi);
564 if (! virtual_operand_p (def)
565 && FLOAT_TYPE_P (TREE_TYPE (def)))
566 execute_cse_reciprocals_1 (NULL, def);
569 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
570 gsi_next (&gsi))
572 gimple *stmt = gsi_stmt (gsi);
574 if (gimple_has_lhs (stmt)
575 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
576 && FLOAT_TYPE_P (TREE_TYPE (def))
577 && TREE_CODE (def) == SSA_NAME)
578 execute_cse_reciprocals_1 (&gsi, def);
581 if (optimize_bb_for_size_p (bb))
582 continue;
584 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
585 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
586 gsi_next (&gsi))
588 gimple *stmt = gsi_stmt (gsi);
589 tree fndecl;
591 if (is_gimple_assign (stmt)
592 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
594 tree arg1 = gimple_assign_rhs2 (stmt);
595 gimple *stmt1;
597 if (TREE_CODE (arg1) != SSA_NAME)
598 continue;
600 stmt1 = SSA_NAME_DEF_STMT (arg1);
602 if (is_gimple_call (stmt1)
603 && gimple_call_lhs (stmt1)
604 && (fndecl = gimple_call_fndecl (stmt1))
605 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
606 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
608 enum built_in_function code;
609 bool md_code, fail;
610 imm_use_iterator ui;
611 use_operand_p use_p;
613 code = DECL_FUNCTION_CODE (fndecl);
614 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
616 fndecl = targetm.builtin_reciprocal (code, md_code, false);
617 if (!fndecl)
618 continue;
620 /* Check that all uses of the SSA name are divisions,
621 otherwise replacing the defining statement will do
622 the wrong thing. */
623 fail = false;
624 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
626 gimple *stmt2 = USE_STMT (use_p);
627 if (is_gimple_debug (stmt2))
628 continue;
629 if (!is_gimple_assign (stmt2)
630 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
631 || gimple_assign_rhs1 (stmt2) == arg1
632 || gimple_assign_rhs2 (stmt2) != arg1)
634 fail = true;
635 break;
638 if (fail)
639 continue;
641 gimple_replace_ssa_lhs (stmt1, arg1);
642 gimple_call_set_fndecl (stmt1, fndecl);
643 update_stmt (stmt1);
644 reciprocal_stats.rfuncs_inserted++;
646 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
648 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
649 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
650 fold_stmt_inplace (&gsi);
651 update_stmt (stmt);
658 statistics_counter_event (fun, "reciprocal divs inserted",
659 reciprocal_stats.rdivs_inserted);
660 statistics_counter_event (fun, "reciprocal functions inserted",
661 reciprocal_stats.rfuncs_inserted);
663 free_dominance_info (CDI_DOMINATORS);
664 free_dominance_info (CDI_POST_DOMINATORS);
665 delete occ_pool;
666 return 0;
669 } // anon namespace
671 gimple_opt_pass *
672 make_pass_cse_reciprocals (gcc::context *ctxt)
674 return new pass_cse_reciprocals (ctxt);
677 /* Records an occurrence at statement USE_STMT in the vector of trees
678 STMTS if it is dominated by *TOP_BB or dominates it or this basic block
679 is not yet initialized. Returns true if the occurrence was pushed on
680 the vector. Adjusts *TOP_BB to be the basic block dominating all
681 statements in the vector. */
683 static bool
684 maybe_record_sincos (vec<gimple *> *stmts,
685 basic_block *top_bb, gimple *use_stmt)
687 basic_block use_bb = gimple_bb (use_stmt);
688 if (*top_bb
689 && (*top_bb == use_bb
690 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
691 stmts->safe_push (use_stmt);
692 else if (!*top_bb
693 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
695 stmts->safe_push (use_stmt);
696 *top_bb = use_bb;
698 else
699 return false;
701 return true;
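/* For illustration only (SSA names invented), a pair of calls sharing
   the same argument

     c_2 = __builtin_cos (x_1);
     s_3 = __builtin_sin (x_1);

   is rewritten by execute_cse_sincos_1 below into

     sincostmp_4 = __builtin_cexpi (x_1);
     c_2 = REALPART_EXPR <sincostmp_4>;
     s_3 = IMAGPART_EXPR <sincostmp_4>;

   so that the eventual expansion of cexpi (typically a single sincos or
   cexp call) is shared by both uses.  "sincostmp" is the temporary name
   used below.  */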
704 /* Look for sin, cos and cexpi calls with the same argument NAME and
705 create a single call to cexpi CSEing the result in this case.
706 We first walk over all immediate uses of the argument collecting
707 statements that we can CSE in a vector and in a second pass replace
708 the statement rhs with a REALPART or IMAGPART expression on the
709 result of the cexpi call we insert before the use statement that
710 dominates all other candidates. */
712 static bool
713 execute_cse_sincos_1 (tree name)
715 gimple_stmt_iterator gsi;
716 imm_use_iterator use_iter;
717 tree fndecl, res, type;
718 gimple *def_stmt, *use_stmt, *stmt;
719 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
720 auto_vec<gimple *> stmts;
721 basic_block top_bb = NULL;
722 int i;
723 bool cfg_changed = false;
725 type = TREE_TYPE (name);
726 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
728 if (gimple_code (use_stmt) != GIMPLE_CALL
729 || !gimple_call_lhs (use_stmt))
730 continue;
732 switch (gimple_call_combined_fn (use_stmt))
734 CASE_CFN_COS:
735 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
736 break;
738 CASE_CFN_SIN:
739 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
740 break;
742 CASE_CFN_CEXPI:
743 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
744 break;
746 default:;
750 if (seen_cos + seen_sin + seen_cexpi <= 1)
751 return false;
753 /* Simply insert cexpi at the beginning of top_bb but not earlier than
754 the name def statement. */
755 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
756 if (!fndecl)
757 return false;
758 stmt = gimple_build_call (fndecl, 1, name);
759 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
760 gimple_call_set_lhs (stmt, res);
762 def_stmt = SSA_NAME_DEF_STMT (name);
763 if (!SSA_NAME_IS_DEFAULT_DEF (name)
764 && gimple_code (def_stmt) != GIMPLE_PHI
765 && gimple_bb (def_stmt) == top_bb)
767 gsi = gsi_for_stmt (def_stmt);
768 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
770 else
772 gsi = gsi_after_labels (top_bb);
773 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
775 sincos_stats.inserted++;
777 /* And adjust the recorded old call sites. */
778 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
780 tree rhs = NULL;
782 switch (gimple_call_combined_fn (use_stmt))
784 CASE_CFN_COS:
785 rhs = fold_build1 (REALPART_EXPR, type, res);
786 break;
788 CASE_CFN_SIN:
789 rhs = fold_build1 (IMAGPART_EXPR, type, res);
790 break;
792 CASE_CFN_CEXPI:
793 rhs = res;
794 break;
796 default:;
797 gcc_unreachable ();
800 /* Replace call with a copy. */
801 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
803 gsi = gsi_for_stmt (use_stmt);
804 gsi_replace (&gsi, stmt, true);
805 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
806 cfg_changed = true;
809 return cfg_changed;
812 /* To evaluate powi(x,n), the floating point value x raised to the
813 constant integer exponent n, we use a hybrid algorithm that
814 combines the "window method" with look-up tables. For an
815 introduction to exponentiation algorithms and "addition chains",
816 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
817 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
818 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
819 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
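/* As a minimal sketch of the underlying idea (plain C, not the code this
   pass emits), the classic binary method evaluates x**n in O(log n)
   multiplications:

     double
     powi_binary_sketch (double x, unsigned long n)
     {
       double result = 1.0, base = x;
       while (n)
         {
           if (n & 1)
             result *= base;   /* Fold in the current bit of N.  */
           base *= base;       /* Square for the next bit.  */
           n >>= 1;
         }
       return result;
     }

   The window method implemented below improves on this by consuming
   POWI_WINDOW_SIZE bits of the exponent at a time and by reusing
   subresults recorded in powi_table.  */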
821 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
822 multiplications to inline before calling the system library's pow
823 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
824 so this default never requires calling pow, powf or powl. */
826 #ifndef POWI_MAX_MULTS
827 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
828 #endif
830 /* The size of the "optimal power tree" lookup table. All
831 exponents less than this value are simply looked up in the
832 powi_table below. This threshold is also used to size the
833 cache of pseudo registers that hold intermediate results. */
834 #define POWI_TABLE_SIZE 256
836 /* The size, in bits of the window, used in the "window method"
837 exponentiation algorithm. This is equivalent to a radix of
838 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
839 #define POWI_WINDOW_SIZE 3
841 /* The following table is an efficient representation of an
842 "optimal power tree". For each value, i, the corresponding
843    value, j, in the table states that an optimal evaluation
844 sequence for calculating pow(x,i) can be found by evaluating
845 pow(x,j)*pow(x,i-j). An optimal power tree for the first
846 100 integers is given in Knuth's "Seminumerical algorithms". */
848 static const unsigned char powi_table[POWI_TABLE_SIZE] =
850 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
851 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
852 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
853 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
854 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
855 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
856 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
857 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
858 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
859 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
860 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
861 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
862 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
863 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
864 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
865 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
866 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
867 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
868 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
869 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
870 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
871 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
872 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
873 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
874 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
875 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
876 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
877 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
878 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
879 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
880 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
881 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
885 /* Return the number of multiplications required to calculate
886 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
887 subroutine of powi_cost. CACHE is an array indicating
888 which exponents have already been calculated. */
890 static int
891 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
893 /* If we've already calculated this exponent, then this evaluation
894 doesn't require any additional multiplications. */
895 if (cache[n])
896 return 0;
898 cache[n] = true;
899 return powi_lookup_cost (n - powi_table[n], cache)
900 + powi_lookup_cost (powi_table[n], cache) + 1;
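/* A worked example of the table above: powi_table[5] is 3, so x**5 is
   evaluated as x**3 * x**2; those in turn need x**2 = x * x and
   x**3 = x**2 * x, so powi_lookup_cost (5, cache) returns 3 when only
   the exponent 1 is initially marked in CACHE.  */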
903 /* Return the number of multiplications required to calculate
904 powi(x,n) for an arbitrary x, given the exponent N. This
905 function needs to be kept in sync with powi_as_mults below. */
907 static int
908 powi_cost (HOST_WIDE_INT n)
910 bool cache[POWI_TABLE_SIZE];
911 unsigned HOST_WIDE_INT digit;
912 unsigned HOST_WIDE_INT val;
913 int result;
915 if (n == 0)
916 return 0;
918 /* Ignore the reciprocal when calculating the cost. */
919 val = (n < 0) ? -n : n;
921 /* Initialize the exponent cache. */
922 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
923 cache[1] = true;
925 result = 0;
927 while (val >= POWI_TABLE_SIZE)
929 if (val & 1)
931 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
932 result += powi_lookup_cost (digit, cache)
933 + POWI_WINDOW_SIZE + 1;
934 val >>= POWI_WINDOW_SIZE;
936 else
938 val >>= 1;
939 result++;
943 return result + powi_lookup_cost (val, cache);
946 /* Recursive subroutine of powi_as_mults. This function takes the
947 array, CACHE, of already calculated exponents and an exponent N and
948 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
950 static tree
951 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
952 HOST_WIDE_INT n, tree *cache)
954 tree op0, op1, ssa_target;
955 unsigned HOST_WIDE_INT digit;
956 gassign *mult_stmt;
958 if (n < POWI_TABLE_SIZE && cache[n])
959 return cache[n];
961 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
963 if (n < POWI_TABLE_SIZE)
965 cache[n] = ssa_target;
966 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
967 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
969 else if (n & 1)
971 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
972 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
973 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
975 else
977 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
978 op1 = op0;
981 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
982 gimple_set_location (mult_stmt, loc);
983 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
985 return ssa_target;
988 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
989 This function needs to be kept in sync with powi_cost above. */
991 static tree
992 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
993 tree arg0, HOST_WIDE_INT n)
995 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
996 gassign *div_stmt;
997 tree target;
999 if (n == 0)
1000 return build_real (type, dconst1);
1002 memset (cache, 0, sizeof (cache));
1003 cache[1] = arg0;
1005 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1006 if (n >= 0)
1007 return result;
1009 /* If the original exponent was negative, reciprocate the result. */
1010 target = make_temp_ssa_name (type, NULL, "powmult");
1011 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1012 build_real (type, dconst1), result);
1013 gimple_set_location (div_stmt, loc);
1014 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1016 return target;
1019 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1020 location info LOC. If the arguments are appropriate, create an
1021 equivalent sequence of statements prior to GSI using an optimal
1022    number of multiplications, and return an expression holding the
1023 result. */
1025 static tree
1026 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1027 tree arg0, HOST_WIDE_INT n)
1029 /* Avoid largest negative number. */
1030 if (n != -n
1031 && ((n >= -1 && n <= 2)
1032 || (optimize_function_for_speed_p (cfun)
1033 && powi_cost (n) <= POWI_MAX_MULTS)))
1034 return powi_as_mults (gsi, loc, arg0, n);
1036 return NULL_TREE;
1039 /* Build a gimple call statement that calls FN with argument ARG.
1040 Set the lhs of the call statement to a fresh SSA name. Insert the
1041 statement prior to GSI's current position, and return the fresh
1042 SSA name. */
1044 static tree
1045 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1046 tree fn, tree arg)
1048 gcall *call_stmt;
1049 tree ssa_target;
1051 call_stmt = gimple_build_call (fn, 1, arg);
1052 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1053 gimple_set_lhs (call_stmt, ssa_target);
1054 gimple_set_location (call_stmt, loc);
1055 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1057 return ssa_target;
1060 /* Build a gimple binary operation with the given CODE and arguments
1061 ARG0, ARG1, assigning the result to a new SSA name for variable
1062 TARGET. Insert the statement prior to GSI's current position, and
1063    return the fresh SSA name. */
1065 static tree
1066 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1067 const char *name, enum tree_code code,
1068 tree arg0, tree arg1)
1070 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1071 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1072 gimple_set_location (stmt, loc);
1073 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1074 return result;
1077 /* Build a gimple reference operation with the given CODE and argument
1078 ARG, assigning the result to a new SSA name of TYPE with NAME.
1079 Insert the statement prior to GSI's current position, and return
1080 the fresh SSA name. */
1082 static inline tree
1083 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1084 const char *name, enum tree_code code, tree arg0)
1086 tree result = make_temp_ssa_name (type, NULL, name);
1087 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1088 gimple_set_location (stmt, loc);
1089 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1090 return result;
1093 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1094 prior to GSI's current position, and return the fresh SSA name. */
1096 static tree
1097 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1098 tree type, tree val)
1100 tree result = make_ssa_name (type);
1101 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1102 gimple_set_location (stmt, loc);
1103 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1104 return result;
1107 struct pow_synth_sqrt_info
1109 bool *factors;
1110 unsigned int deepest;
1111 unsigned int num_mults;
1114 /* Return true iff the real value C can be represented as a
1115 sum of powers of 0.5 up to N. That is:
1116 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1117 Record in INFO the various parameters of the synthesis algorithm such
1118 as the factors a[i], the maximum 0.5 power and the number of
1119 multiplications that will be required. */
1121 bool
1122 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1123 struct pow_synth_sqrt_info *info)
1125 REAL_VALUE_TYPE factor = dconsthalf;
1126 REAL_VALUE_TYPE remainder = c;
1128 info->deepest = 0;
1129 info->num_mults = 0;
1130 memset (info->factors, 0, n * sizeof (bool));
1132 for (unsigned i = 0; i < n; i++)
1134 REAL_VALUE_TYPE res;
1136 /* If something inexact happened bail out now. */
1137 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1138 return false;
1140 /* We have hit zero. The number is representable as a sum
1141 of powers of 0.5. */
1142 if (real_equal (&res, &dconst0))
1144 info->factors[i] = true;
1145 info->deepest = i + 1;
1146 return true;
1148 else if (!REAL_VALUE_NEGATIVE (res))
1150 remainder = res;
1151 info->factors[i] = true;
1152 info->num_mults++;
1154 else
1155 info->factors[i] = false;
1157 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1159 return false;
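/* For instance, for C == 0.625 and N >= 3 the loop above finds
   0.625 == 0.5 + 0.125: factors[0] and factors[2] are set, DEEPEST
   becomes 3 and NUM_MULTS is 1, so pow (x, 0.625) can later be
   synthesized as sqrt (x) * sqrt (sqrt (sqrt (x))).  */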
1162 /* Return the tree corresponding to FN being applied
1163 to ARG N times at GSI and LOC.
1164 Look up previous results from CACHE if need be.
1165 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1167 static tree
1168 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1169 tree fn, location_t loc, tree *cache)
1171 tree res = cache[n];
1172 if (!res)
1174 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1175 res = build_and_insert_call (gsi, loc, fn, prev);
1176 cache[n] = res;
1179 return res;
1182 /* Print to STREAM the repeated application of function FNAME to ARG
1183 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1184 "foo (foo (x))". */
1186 static void
1187 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1188 unsigned int n)
1190 if (n == 0)
1191 fprintf (stream, "%s", arg);
1192 else
1194 fprintf (stream, "%s (", fname);
1195 print_nested_fn (stream, fname, arg, n - 1);
1196 fprintf (stream, ")");
1200 /* Print to STREAM the fractional sequence of sqrt chains
1201 applied to ARG, described by INFO. Used for the dump file. */
1203 static void
1204 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1205 struct pow_synth_sqrt_info *info)
1207 for (unsigned int i = 0; i < info->deepest; i++)
1209 bool is_set = info->factors[i];
1210 if (is_set)
1212 print_nested_fn (stream, "sqrt", arg, i + 1);
1213 if (i != info->deepest - 1)
1214 fprintf (stream, " * ");
1219 /* Print to STREAM a representation of raising ARG to an integer
1220 power N. Used for the dump file. */
1222 static void
1223 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1225 if (n > 1)
1226 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1227 else if (n == 1)
1228 fprintf (stream, "%s", arg);
1231 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1232 square roots. Place at GSI and LOC. Limit the maximum depth
1233 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1234 result of the expanded sequence or NULL_TREE if the expansion failed.
1236 This routine assumes that ARG1 is a real number with a fractional part
1237 (the integer exponent case will have been handled earlier in
1238 gimple_expand_builtin_pow).
1240 For ARG1 > 0.0:
1241 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1242 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1243 FRAC_PART == ARG1 - WHOLE_PART:
1244 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1245 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1246 if it can be expressed as such, that is if FRAC_PART satisfies:
1247 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1248 where integer a[i] is either 0 or 1.
1250 Example:
1251 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1252 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1254 For ARG1 < 0.0 there are two approaches:
1255 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1256 is calculated as above.
1258 Example:
1259 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1260 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1262 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1263 FRAC_PART := ARG1 - WHOLE_PART
1264 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1265 Example:
1266 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1267 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1269 For ARG1 < 0.0 we choose between (A) and (B) depending on
1270 how many multiplications we'd have to do.
1271 So, for the example in (B): POW (x, -5.875), if we were to
1272 follow algorithm (A) we would produce:
1273 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1274 which contains more multiplications than approach (B).
1276 Hopefully, this approach will eliminate potentially expensive POW library
1277 calls when unsafe floating point math is enabled and allow the compiler to
1278 further optimise the multiplies, square roots and divides produced by this
1279 function. */
1281 static tree
1282 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1283 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1285 tree type = TREE_TYPE (arg0);
1286 machine_mode mode = TYPE_MODE (type);
1287 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1288 bool one_over = true;
1290 if (!sqrtfn)
1291 return NULL_TREE;
1293 if (TREE_CODE (arg1) != REAL_CST)
1294 return NULL_TREE;
1296 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1298 gcc_assert (max_depth > 0);
1299 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1301 struct pow_synth_sqrt_info synth_info;
1302 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1303 synth_info.deepest = 0;
1304 synth_info.num_mults = 0;
1306 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1307 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1309 /* The whole and fractional parts of exp. */
1310 REAL_VALUE_TYPE whole_part;
1311 REAL_VALUE_TYPE frac_part;
1313 real_floor (&whole_part, mode, &exp);
1314 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1317 REAL_VALUE_TYPE ceil_whole = dconst0;
1318 REAL_VALUE_TYPE ceil_fract = dconst0;
1320 if (neg_exp)
1322 real_ceil (&ceil_whole, mode, &exp);
1323 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1326 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1327 return NULL_TREE;
1329 /* Check whether it's more profitable to not use 1.0 / ... */
1330 if (neg_exp)
1332 struct pow_synth_sqrt_info alt_synth_info;
1333 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1334 alt_synth_info.deepest = 0;
1335 alt_synth_info.num_mults = 0;
1337 if (representable_as_half_series_p (ceil_fract, max_depth,
1338 &alt_synth_info)
1339 && alt_synth_info.deepest <= synth_info.deepest
1340 && alt_synth_info.num_mults < synth_info.num_mults)
1342 whole_part = ceil_whole;
1343 frac_part = ceil_fract;
1344 synth_info.deepest = alt_synth_info.deepest;
1345 synth_info.num_mults = alt_synth_info.num_mults;
1346 memcpy (synth_info.factors, alt_synth_info.factors,
1347 (max_depth + 1) * sizeof (bool));
1348 one_over = false;
1352 HOST_WIDE_INT n = real_to_integer (&whole_part);
1353 REAL_VALUE_TYPE cint;
1354 real_from_integer (&cint, VOIDmode, n, SIGNED);
1356 if (!real_identical (&whole_part, &cint))
1357 return NULL_TREE;
1359 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1360 return NULL_TREE;
1362 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1364 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1366 /* Calculate the integer part of the exponent. */
1367 if (n > 1)
1369 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1370 if (!integer_res)
1371 return NULL_TREE;
1374 if (dump_file)
1376 char string[64];
1378 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1379 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1381 if (neg_exp)
1383 if (one_over)
1385 fprintf (dump_file, "1.0 / (");
1386 dump_integer_part (dump_file, "x", n);
1387 if (n > 0)
1388 fprintf (dump_file, " * ");
1389 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1390 fprintf (dump_file, ")");
1392 else
1394 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1395 fprintf (dump_file, " / (");
1396 dump_integer_part (dump_file, "x", n);
1397 fprintf (dump_file, ")");
1400 else
1402 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1403 if (n > 0)
1404 fprintf (dump_file, " * ");
1405 dump_integer_part (dump_file, "x", n);
1408 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1412 tree fract_res = NULL_TREE;
1413 cache[0] = arg0;
1415 /* Calculate the fractional part of the exponent. */
1416 for (unsigned i = 0; i < synth_info.deepest; i++)
1418 if (synth_info.factors[i])
1420 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1422 if (!fract_res)
1423 fract_res = sqrt_chain;
1425 else
1426 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1427 fract_res, sqrt_chain);
1431 tree res = NULL_TREE;
1433 if (neg_exp)
1435 if (one_over)
1437 if (n > 0)
1438 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1439 fract_res, integer_res);
1440 else
1441 res = fract_res;
1443 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1444 build_real (type, dconst1), res);
1446 else
1448 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1449 fract_res, integer_res);
1452 else
1453 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1454 fract_res, integer_res);
1455 return res;
1458 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1459 with location info LOC. If possible, create an equivalent and
1460 less expensive sequence of statements prior to GSI, and return an
1461    expression holding the result. */
1463 static tree
1464 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1465 tree arg0, tree arg1)
1467 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1468 REAL_VALUE_TYPE c2, dconst3;
1469 HOST_WIDE_INT n;
1470 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1471 machine_mode mode;
1472 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1473 bool hw_sqrt_exists, c_is_int, c2_is_int;
1475 dconst1_4 = dconst1;
1476 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1478 /* If the exponent isn't a constant, there's nothing of interest
1479 to be done. */
1480 if (TREE_CODE (arg1) != REAL_CST)
1481 return NULL_TREE;
1483 /* If the exponent is equivalent to an integer, expand to an optimal
1484 multiplication sequence when profitable. */
1485 c = TREE_REAL_CST (arg1);
1486 n = real_to_integer (&c);
1487 real_from_integer (&cint, VOIDmode, n, SIGNED);
1488 c_is_int = real_identical (&c, &cint);
1490 if (c_is_int
1491 && ((n >= -1 && n <= 2)
1492 || (flag_unsafe_math_optimizations
1493 && speed_p
1494 && powi_cost (n) <= POWI_MAX_MULTS)))
1495 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1497 /* Attempt various optimizations using sqrt and cbrt. */
1498 type = TREE_TYPE (arg0);
1499 mode = TYPE_MODE (type);
1500 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1502 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1503 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1504 sqrt(-0) = -0. */
1505 if (sqrtfn
1506 && real_equal (&c, &dconsthalf)
1507 && !HONOR_SIGNED_ZEROS (mode))
1508 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1510 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1512 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1513 optimizations since 1./3. is not exactly representable. If x
1514 is negative and finite, the correct value of pow(x,1./3.) is
1515 a NaN with the "invalid" exception raised, because the value
1516 of 1./3. actually has an even denominator. The correct value
1517 of cbrt(x) is a negative real value. */
1518 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1519 dconst1_3 = real_value_truncate (mode, dconst_third ());
1521 if (flag_unsafe_math_optimizations
1522 && cbrtfn
1523 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1524 && real_equal (&c, &dconst1_3))
1525 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1527 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1528 if we don't have a hardware sqrt insn. */
1529 dconst1_6 = dconst1_3;
1530 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1532 if (flag_unsafe_math_optimizations
1533 && sqrtfn
1534 && cbrtfn
1535 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1536 && speed_p
1537 && hw_sqrt_exists
1538 && real_equal (&c, &dconst1_6))
1540 /* sqrt(x) */
1541 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1543 /* cbrt(sqrt(x)) */
1544 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1548 /* Attempt to expand the POW as a product of square root chains.
1549      Expand the 0.25 case even when optimising for size. */
1550 if (flag_unsafe_math_optimizations
1551 && sqrtfn
1552 && hw_sqrt_exists
1553 && (speed_p || real_equal (&c, &dconst1_4))
1554 && !HONOR_SIGNED_ZEROS (mode))
1556 unsigned int max_depth = speed_p
1557 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1558 : 2;
1560 tree expand_with_sqrts
1561 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1563 if (expand_with_sqrts)
1564 return expand_with_sqrts;
1567 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1568 n = real_to_integer (&c2);
1569 real_from_integer (&cint, VOIDmode, n, SIGNED);
1570 c2_is_int = real_identical (&c2, &cint);
1572 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1574 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1575 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1577 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1578 different from pow(x, 1./3.) due to rounding and behavior with
1579 negative x, we need to constrain this transformation to unsafe
1580 math and positive x or finite math. */
1581 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1582 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1583 real_round (&c2, mode, &c2);
1584 n = real_to_integer (&c2);
1585 real_from_integer (&cint, VOIDmode, n, SIGNED);
1586 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1587 real_convert (&c2, mode, &c2);
1589 if (flag_unsafe_math_optimizations
1590 && cbrtfn
1591 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1592 && real_identical (&c2, &c)
1593 && !c2_is_int
1594 && optimize_function_for_speed_p (cfun)
1595 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1597 tree powi_x_ndiv3 = NULL_TREE;
1599 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1600 possible or profitable, give up. Skip the degenerate case when
1601 abs(n) < 3, where the result is always 1. */
1602 if (absu_hwi (n) >= 3)
1604 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1605 abs_hwi (n / 3));
1606 if (!powi_x_ndiv3)
1607 return NULL_TREE;
1610 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1611 as that creates an unnecessary variable. Instead, just produce
1612 either cbrt(x) or cbrt(x) * cbrt(x). */
1613 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1615 if (absu_hwi (n) % 3 == 1)
1616 powi_cbrt_x = cbrt_x;
1617 else
1618 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1619 cbrt_x, cbrt_x);
1621 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1622 if (absu_hwi (n) < 3)
1623 result = powi_cbrt_x;
1624 else
1625 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1626 powi_x_ndiv3, powi_cbrt_x);
1628 /* If n is negative, reciprocate the result. */
1629 if (n < 0)
1630 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1631 build_real (type, dconst1), result);
1633 return result;
1636 /* No optimizations succeeded. */
1637 return NULL_TREE;
1640 /* ARG is the argument to a cabs builtin call in GSI with location info
1641 LOC. Create a sequence of statements prior to GSI that calculates
1642 sqrt(R*R + I*I), where R and I are the real and imaginary components
1643 of ARG, respectively. Return an expression holding the result. */
1645 static tree
1646 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1648 tree real_part, imag_part, addend1, addend2, sum, result;
1649 tree type = TREE_TYPE (TREE_TYPE (arg));
1650 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1651 machine_mode mode = TYPE_MODE (type);
1653 if (!flag_unsafe_math_optimizations
1654 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1655 || !sqrtfn
1656 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1657 return NULL_TREE;
1659 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1660 REALPART_EXPR, arg);
1661 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1662 real_part, real_part);
1663 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1664 IMAGPART_EXPR, arg);
1665 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1666 imag_part, imag_part);
1667 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1668 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1670 return result;
1673 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1674 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1675 an optimal number of multiplies, when n is a constant. */
1677 namespace {
1679 const pass_data pass_data_cse_sincos =
1681 GIMPLE_PASS, /* type */
1682 "sincos", /* name */
1683 OPTGROUP_NONE, /* optinfo_flags */
1684 TV_NONE, /* tv_id */
1685 PROP_ssa, /* properties_required */
1686 PROP_gimple_opt_math, /* properties_provided */
1687 0, /* properties_destroyed */
1688 0, /* todo_flags_start */
1689 TODO_update_ssa, /* todo_flags_finish */
1692 class pass_cse_sincos : public gimple_opt_pass
1694 public:
1695 pass_cse_sincos (gcc::context *ctxt)
1696 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1699 /* opt_pass methods: */
1700 virtual bool gate (function *)
1702 /* We no longer require either sincos or cexp, since powi expansion
1703 piggybacks on this pass. */
1704 return optimize;
1707 virtual unsigned int execute (function *);
1709 }; // class pass_cse_sincos
1711 unsigned int
1712 pass_cse_sincos::execute (function *fun)
1714 basic_block bb;
1715 bool cfg_changed = false;
1717 calculate_dominance_info (CDI_DOMINATORS);
1718 memset (&sincos_stats, 0, sizeof (sincos_stats));
1720 FOR_EACH_BB_FN (bb, fun)
1722 gimple_stmt_iterator gsi;
1723 bool cleanup_eh = false;
1725 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1727 gimple *stmt = gsi_stmt (gsi);
1729    /* Only the last stmt in a bb could throw, so there is no need to call
1730 gimple_purge_dead_eh_edges if we change something in the middle
1731 of a basic block. */
1732 cleanup_eh = false;
1734 if (is_gimple_call (stmt)
1735 && gimple_call_lhs (stmt))
1737 tree arg, arg0, arg1, result;
1738 HOST_WIDE_INT n;
1739 location_t loc;
1741 switch (gimple_call_combined_fn (stmt))
1743 CASE_CFN_COS:
1744 CASE_CFN_SIN:
1745 CASE_CFN_CEXPI:
1746 /* Make sure we have either sincos or cexp. */
1747 if (!targetm.libc_has_function (function_c99_math_complex)
1748 && !targetm.libc_has_function (function_sincos))
1749 break;
1751 arg = gimple_call_arg (stmt, 0);
1752 if (TREE_CODE (arg) == SSA_NAME)
1753 cfg_changed |= execute_cse_sincos_1 (arg);
1754 break;
1756 CASE_CFN_POW:
1757 arg0 = gimple_call_arg (stmt, 0);
1758 arg1 = gimple_call_arg (stmt, 1);
1760 loc = gimple_location (stmt);
1761 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1763 if (result)
1765 tree lhs = gimple_get_lhs (stmt);
1766 gassign *new_stmt = gimple_build_assign (lhs, result);
1767 gimple_set_location (new_stmt, loc);
1768 unlink_stmt_vdef (stmt);
1769 gsi_replace (&gsi, new_stmt, true);
1770 cleanup_eh = true;
1771 if (gimple_vdef (stmt))
1772 release_ssa_name (gimple_vdef (stmt));
1774 break;
1776 CASE_CFN_POWI:
1777 arg0 = gimple_call_arg (stmt, 0);
1778 arg1 = gimple_call_arg (stmt, 1);
1779 loc = gimple_location (stmt);
1781 if (real_minus_onep (arg0))
1783 tree t0, t1, cond, one, minus_one;
1784 gassign *stmt;
1786 t0 = TREE_TYPE (arg0);
1787 t1 = TREE_TYPE (arg1);
1788 one = build_real (t0, dconst1);
1789 minus_one = build_real (t0, dconstm1);
1791 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1792 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1793 arg1, build_int_cst (t1, 1));
1794 gimple_set_location (stmt, loc);
1795 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1797 result = make_temp_ssa_name (t0, NULL, "powi");
1798 stmt = gimple_build_assign (result, COND_EXPR, cond,
1799 minus_one, one);
1800 gimple_set_location (stmt, loc);
1801 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1803 else
1805 if (!tree_fits_shwi_p (arg1))
1806 break;
1808 n = tree_to_shwi (arg1);
1809 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1812 if (result)
1814 tree lhs = gimple_get_lhs (stmt);
1815 gassign *new_stmt = gimple_build_assign (lhs, result);
1816 gimple_set_location (new_stmt, loc);
1817 unlink_stmt_vdef (stmt);
1818 gsi_replace (&gsi, new_stmt, true);
1819 cleanup_eh = true;
1820 if (gimple_vdef (stmt))
1821 release_ssa_name (gimple_vdef (stmt));
1823 break;
1825 CASE_CFN_CABS:
1826 arg0 = gimple_call_arg (stmt, 0);
1827 loc = gimple_location (stmt);
1828 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1830 if (result)
1832 tree lhs = gimple_get_lhs (stmt);
1833 gassign *new_stmt = gimple_build_assign (lhs, result);
1834 gimple_set_location (new_stmt, loc);
1835 unlink_stmt_vdef (stmt);
1836 gsi_replace (&gsi, new_stmt, true);
1837 cleanup_eh = true;
1838 if (gimple_vdef (stmt))
1839 release_ssa_name (gimple_vdef (stmt));
1841 break;
1843 default:;
1847 if (cleanup_eh)
1848 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1851 statistics_counter_event (fun, "sincos statements inserted",
1852 sincos_stats.inserted);
1854 return cfg_changed ? TODO_cleanup_cfg : 0;
1857 } // anon namespace
1859 gimple_opt_pass *
1860 make_pass_cse_sincos (gcc::context *ctxt)
1862 return new pass_cse_sincos (ctxt);
1865 /* A symbolic number is used to detect byte permutation and selection
1866 patterns. Therefore the field N contains an artificial number
1867 consisting of octet sized markers:
1869 0 - target byte has the value 0
1870    FF - target byte has an unknown value (e.g. due to sign extension)
1871 1..size - marker value is the target byte index minus one.
1873 To detect permutations on memory sources (arrays and structures), a symbolic
1874    number is also associated with a base address (the array or structure the load is
1875 made from), an offset from the base address and a range which gives the
1876 difference between the highest and lowest accessed memory location to make
1877 such a symbolic number. The range is thus different from size which reflects
1878 the size of the type of current expression. Note that for non memory source,
1879 range holds the same value as size.
1881 For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1882 a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1883 still have a size of 2 but this time a range of 1. */
1885 struct symbolic_number {
1886 uint64_t n;
1887 tree type;
1888 tree base_addr;
1889 tree offset;
1890 HOST_WIDE_INT bytepos;
1891 tree alias_set;
1892 tree vuse;
1893 unsigned HOST_WIDE_INT range;
1896 #define BITS_PER_MARKER 8
1897 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1898 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1899 #define HEAD_MARKER(n, size) \
1900 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1902 /* The number which the find_bswap_or_nop_1 result should match in
1903 order to have a nop. The number is masked according to the size of
1904 the symbolic number before using it. */
1905 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1906 (uint64_t)0x08070605 << 32 | 0x04030201)
1908 /* The number which the find_bswap_or_nop_1 result should match in
1909 order to have a byte swap. The number is masked according to the
1910 size of the symbolic number before using it. */
1911 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1912 (uint64_t)0x01020304 << 32 | 0x05060708)
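/* As a hypothetical illustration (not taken from the sources or testsuite),
   a manual 32-bit byte swap of the kind this machinery recognizes.  Starting
   from the identity number CMPNOP, the masks, shifts and ORs below permute
   the byte markers so that the resulting symbolic number matches CMPXCHG,
   i.e. a full byte swap:

     unsigned int
     example_manual_bswap32 (unsigned int x)
     {
       return ((x & 0x000000ffU) << 24)
              | ((x & 0x0000ff00U) << 8)
              | ((x & 0x00ff0000U) >> 8)
              | ((x & 0xff000000U) >> 24);
     }
*/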
1914 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1915 number N. Return false if the requested operation is not permitted
1916 on a symbolic number. */
1918 static inline bool
1919 do_shift_rotate (enum tree_code code,
1920 struct symbolic_number *n,
1921 int count)
1923 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1924 unsigned head_marker;
1926 if (count % BITS_PER_UNIT != 0)
1927 return false;
1928 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1930 /* Zero out the extra bits of N in order to avoid them being shifted
1931 into the significant bits. */
1932 if (size < 64 / BITS_PER_MARKER)
1933 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1935 switch (code)
1937 case LSHIFT_EXPR:
1938 n->n <<= count;
1939 break;
1940 case RSHIFT_EXPR:
1941 head_marker = HEAD_MARKER (n->n, size);
1942 n->n >>= count;
1943 /* Arithmetic shift of signed type: result is dependent on the value. */
1944 if (!TYPE_UNSIGNED (n->type) && head_marker)
1945 for (i = 0; i < count / BITS_PER_MARKER; i++)
1946 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1947 << ((size - 1 - i) * BITS_PER_MARKER);
1948 break;
1949 case LROTATE_EXPR:
1950 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1951 break;
1952 case RROTATE_EXPR:
1953 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1954 break;
1955 default:
1956 return false;
1958 /* Zero unused bits for size. */
1959 if (size < 64 / BITS_PER_MARKER)
1960 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1961 return true;
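/* A small worked example of the marker arithmetic above (hypothetical
   values): for a 4-byte symbolic number 0x04030201, a source-level
   "x << 8" shifts every marker up by one byte position, giving
   0x03020100, i.e. the lowest target byte is now known to be zero.  */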
1964 /* Perform sanity checking for the symbolic number N and the gimple
1965 statement STMT. */
1967 static inline bool
1968 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
1970 tree lhs_type;
1972 lhs_type = gimple_expr_type (stmt);
1974 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1975 return false;
1977 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1978 return false;
1980 return true;
1983 /* Initialize the symbolic number N for the bswap pass from the base element
1984 SRC manipulated by the bitwise OR expression. */
1986 static bool
1987 init_symbolic_number (struct symbolic_number *n, tree src)
1989 int size;
1991 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1993 /* Set up the symbolic number N by setting each byte to a value between 1 and
1994 the byte size of rhs1. The highest order byte is set to that byte size and
1995 the lowest order byte to 1. */
1996 n->type = TREE_TYPE (src);
1997 size = TYPE_PRECISION (n->type);
1998 if (size % BITS_PER_UNIT != 0)
1999 return false;
2000 size /= BITS_PER_UNIT;
2001 if (size > 64 / BITS_PER_MARKER)
2002 return false;
2003 n->range = size;
2004 n->n = CMPNOP;
2006 if (size < 64 / BITS_PER_MARKER)
2007 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2009 return true;
2012 /* Check whether STMT might be a byte swap or a nop from a memory source and
2013 return the answer. If so, REF is that memory source and the base of the memory
2014 area accessed and the offset of the access from that base are recorded in N. */
2016 bool
2017 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2019 /* Leaf node is an array or component ref. Memorize its base and
2020 offset from base to compare to other such leaf nodes. */
2021 HOST_WIDE_INT bitsize, bitpos;
2022 machine_mode mode;
2023 int unsignedp, reversep, volatilep;
2024 tree offset, base_addr;
2026 /* Not prepared to handle PDP endian. */
2027 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2028 return false;
2030 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2031 return false;
2033 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2034 &unsignedp, &reversep, &volatilep, false);
2036 if (TREE_CODE (base_addr) == MEM_REF)
2038 offset_int bit_offset = 0;
2039 tree off = TREE_OPERAND (base_addr, 1);
2041 if (!integer_zerop (off))
2043 offset_int boff, coff = mem_ref_offset (base_addr);
2044 boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
2045 bit_offset += boff;
2048 base_addr = TREE_OPERAND (base_addr, 0);
2050 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2051 if (wi::neg_p (bit_offset))
2053 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2054 offset_int tem = bit_offset.and_not (mask);
2055 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2056 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2057 bit_offset -= tem;
2058 tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
2059 if (offset)
2060 offset = size_binop (PLUS_EXPR, offset,
2061 wide_int_to_tree (sizetype, tem));
2062 else
2063 offset = wide_int_to_tree (sizetype, tem);
2066 bitpos += bit_offset.to_shwi ();
2069 if (bitpos % BITS_PER_UNIT)
2070 return false;
2071 if (bitsize % BITS_PER_UNIT)
2072 return false;
2073 if (reversep)
2074 return false;
2076 if (!init_symbolic_number (n, ref))
2077 return false;
2078 n->base_addr = base_addr;
2079 n->offset = offset;
2080 n->bytepos = bitpos / BITS_PER_UNIT;
2081 n->alias_set = reference_alias_ptr_type (ref);
2082 n->vuse = gimple_vuse (stmt);
2083 return true;
2086 /* Compute the symbolic number N representing the result of a bitwise OR on the
2087 2 symbolic numbers N1 and N2 whose source statements are respectively
2088 SOURCE_STMT1 and SOURCE_STMT2. */
2090 static gimple *
2091 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2092 gimple *source_stmt2, struct symbolic_number *n2,
2093 struct symbolic_number *n)
2095 int i, size;
2096 uint64_t mask;
2097 gimple *source_stmt;
2098 struct symbolic_number *n_start;
2100 /* Sources are different, cancel bswap if they are not memory locations with
2101 the same base (array, structure, ...). */
2102 if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
2104 uint64_t inc;
2105 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2106 struct symbolic_number *toinc_n_ptr, *n_end;
2108 if (!n1->base_addr || !n2->base_addr
2109 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2110 return NULL;
2112 if (!n1->offset != !n2->offset
2113 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2114 return NULL;
2116 if (n1->bytepos < n2->bytepos)
2118 n_start = n1;
2119 start_sub = n2->bytepos - n1->bytepos;
2120 source_stmt = source_stmt1;
2122 else
2124 n_start = n2;
2125 start_sub = n1->bytepos - n2->bytepos;
2126 source_stmt = source_stmt2;
2129 /* Find the highest address at which a load is performed and
2130 compute related info. */
2131 end1 = n1->bytepos + (n1->range - 1);
2132 end2 = n2->bytepos + (n2->range - 1);
2133 if (end1 < end2)
2135 end = end2;
2136 end_sub = end2 - end1;
2138 else
2140 end = end1;
2141 end_sub = end1 - end2;
2143 n_end = (end2 > end1) ? n2 : n1;
2145 /* Find symbolic number whose lsb is the most significant. */
2146 if (BYTES_BIG_ENDIAN)
2147 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2148 else
2149 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2151 n->range = end - n_start->bytepos + 1;
2153 /* Check that the range of memory covered can be represented by
2154 a symbolic number. */
2155 if (n->range > 64 / BITS_PER_MARKER)
2156 return NULL;
2158 /* Reinterpret byte marks in symbolic number holding the value of
2159 bigger weight according to target endianness. */
2160 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2161 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2162 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2164 unsigned marker
2165 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2166 if (marker && marker != MARKER_BYTE_UNKNOWN)
2167 toinc_n_ptr->n += inc;
2170 else
2172 n->range = n1->range;
2173 n_start = n1;
2174 source_stmt = source_stmt1;
2177 if (!n1->alias_set
2178 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2179 n->alias_set = n1->alias_set;
2180 else
2181 n->alias_set = ptr_type_node;
2182 n->vuse = n_start->vuse;
2183 n->base_addr = n_start->base_addr;
2184 n->offset = n_start->offset;
2185 n->bytepos = n_start->bytepos;
2186 n->type = n_start->type;
2187 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2189 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2191 uint64_t masked1, masked2;
2193 masked1 = n1->n & mask;
2194 masked2 = n2->n & mask;
2195 if (masked1 && masked2 && masked1 != masked2)
2196 return NULL;
2198 n->n = n1->n | n2->n;
2200 return source_stmt;
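/* A hypothetical illustration of the merge above: with "unsigned char
   data[2]", the two operands of

     (unsigned int) data[0] | ((unsigned int) data[1] << 8)

   share the same base address and have adjacent byte positions, so their
   symbolic numbers are combined into a single number whose range covers
   two bytes instead of cancelling the bswap/nop detection.  */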
2203 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2204 the operation given by the rhs of STMT on the result. If the operation
2205 can be executed successfully, the function returns a gimple stmt whose
2206 rhs's first tree is the expression of the source operand, and NULL
2207 otherwise. */
2209 static gimple *
2210 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2212 enum tree_code code;
2213 tree rhs1, rhs2 = NULL;
2214 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2215 enum gimple_rhs_class rhs_class;
2217 if (!limit || !is_gimple_assign (stmt))
2218 return NULL;
2220 rhs1 = gimple_assign_rhs1 (stmt);
2222 if (find_bswap_or_nop_load (stmt, rhs1, n))
2223 return stmt;
2225 if (TREE_CODE (rhs1) != SSA_NAME)
2226 return NULL;
2228 code = gimple_assign_rhs_code (stmt);
2229 rhs_class = gimple_assign_rhs_class (stmt);
2230 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2232 if (rhs_class == GIMPLE_BINARY_RHS)
2233 rhs2 = gimple_assign_rhs2 (stmt);
2235 /* Handle unary rhs and binary rhs with integer constants as second
2236 operand. */
2238 if (rhs_class == GIMPLE_UNARY_RHS
2239 || (rhs_class == GIMPLE_BINARY_RHS
2240 && TREE_CODE (rhs2) == INTEGER_CST))
2242 if (code != BIT_AND_EXPR
2243 && code != LSHIFT_EXPR
2244 && code != RSHIFT_EXPR
2245 && code != LROTATE_EXPR
2246 && code != RROTATE_EXPR
2247 && !CONVERT_EXPR_CODE_P (code))
2248 return NULL;
2250 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2252 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2253 we have to initialize the symbolic number. */
2254 if (!source_stmt1)
2256 if (gimple_assign_load_p (stmt)
2257 || !init_symbolic_number (n, rhs1))
2258 return NULL;
2259 source_stmt1 = stmt;
2262 switch (code)
2264 case BIT_AND_EXPR:
2266 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2267 uint64_t val = int_cst_value (rhs2), mask = 0;
2268 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2270 /* Only constants masking full bytes are allowed. */
2271 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2272 if ((val & tmp) != 0 && (val & tmp) != tmp)
2273 return NULL;
2274 else if (val & tmp)
2275 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2277 n->n &= mask;
2279 break;
2280 case LSHIFT_EXPR:
2281 case RSHIFT_EXPR:
2282 case LROTATE_EXPR:
2283 case RROTATE_EXPR:
2284 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2285 return NULL;
2286 break;
2287 CASE_CONVERT:
2289 int i, type_size, old_type_size;
2290 tree type;
2292 type = gimple_expr_type (stmt);
2293 type_size = TYPE_PRECISION (type);
2294 if (type_size % BITS_PER_UNIT != 0)
2295 return NULL;
2296 type_size /= BITS_PER_UNIT;
2297 if (type_size > 64 / BITS_PER_MARKER)
2298 return NULL;
2300 /* Sign extension: result is dependent on the value. */
2301 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2302 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2303 && HEAD_MARKER (n->n, old_type_size))
2304 for (i = 0; i < type_size - old_type_size; i++)
2305 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2306 << ((type_size - 1 - i) * BITS_PER_MARKER);
2308 if (type_size < 64 / BITS_PER_MARKER)
2310 /* If STMT casts to a smaller type mask out the bits not
2311 belonging to the target type. */
2312 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2314 n->type = type;
2315 if (!n->base_addr)
2316 n->range = type_size;
2318 break;
2319 default:
2320 return NULL;
2322 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2325 /* Handle binary rhs. */
2327 if (rhs_class == GIMPLE_BINARY_RHS)
2329 struct symbolic_number n1, n2;
2330 gimple *source_stmt, *source_stmt2;
2332 if (code != BIT_IOR_EXPR)
2333 return NULL;
2335 if (TREE_CODE (rhs2) != SSA_NAME)
2336 return NULL;
2338 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2340 switch (code)
2342 case BIT_IOR_EXPR:
2343 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2345 if (!source_stmt1)
2346 return NULL;
2348 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2350 if (!source_stmt2)
2351 return NULL;
2353 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2354 return NULL;
2356 if (!n1.vuse != !n2.vuse
2357 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2358 return NULL;
2360 source_stmt
2361 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2363 if (!source_stmt)
2364 return NULL;
2366 if (!verify_symbolic_number_p (n, stmt))
2367 return NULL;
2369 break;
2370 default:
2371 return NULL;
2373 return source_stmt;
2375 return NULL;
2378 /* Check if STMT completes a bswap implementation or a read in a given
2379 endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2380 accordingly. It also sets N to represent the kind of operations
2381 performed: size of the resulting expression and whether it works on
2382 a memory source, and if so the alias set and vuse. Finally, the
2383 function returns a stmt whose rhs's first tree is the source
2384 expression. */
2386 static gimple *
2387 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2389 /* The number which the find_bswap_or_nop_1 result should match in order
2390 to have a full byte swap. The number is shifted to the right
2391 according to the size of the symbolic number before using it. */
2392 uint64_t cmpxchg = CMPXCHG;
2393 uint64_t cmpnop = CMPNOP;
2395 gimple *source_stmt;
2396 int limit;
2398 /* The last parameter determines the search depth limit. It usually
2399 correlates directly to the number n of bytes to be touched. We
2400 increase that number by log2(n) + 1 here in order to also
2401 cover signed -> unsigned conversions of the src operand as can be seen
2402 in libgcc, and for an initial shift/and operation of the src operand. */
2403 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2404 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2405 source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2407 if (!source_stmt)
2408 return NULL;
2410 /* Find real size of result (highest non-zero byte). */
2411 if (n->base_addr)
2413 int rsize;
2414 uint64_t tmpn;
2416 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2417 n->range = rsize;
2420 /* Zero out the extra bits of N and CMP*. */
2421 if (n->range < (int) sizeof (int64_t))
2423 uint64_t mask;
2425 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2426 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2427 cmpnop &= mask;
2430 /* A complete byte swap should make the symbolic number start with
2431 the largest digit in the highest order byte. An unchanged symbolic
2432 number indicates a read with the same endianness as the target architecture. */
2433 if (n->n == cmpnop)
2434 *bswap = false;
2435 else if (n->n == cmpxchg)
2436 *bswap = true;
2437 else
2438 return NULL;
2440 /* Useless bit manipulation performed by code. */
2441 if (!n->base_addr && n->n == cmpnop)
2442 return NULL;
2444 n->range *= BITS_PER_UNIT;
2445 return source_stmt;
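/* Illustrative sketch (the function name is invented for the example): on a
   little-endian target, a byte-by-byte read such as

     unsigned int
     example_read_le32 (const unsigned char *p)
     {
       return (unsigned int) p[0]
              | ((unsigned int) p[1] << 8)
              | ((unsigned int) p[2] << 16)
              | ((unsigned int) p[3] << 24);
     }

   produces a symbolic number equal to CMPNOP, so *BSWAP is set to false and
   bswap_replace can use a plain 32-bit load; on a big-endian target the same
   code matches CMPXCHG instead and, provided a bswap optab is available, is
   replaced by a load followed by a byte swap.  */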
2448 namespace {
2450 const pass_data pass_data_optimize_bswap =
2452 GIMPLE_PASS, /* type */
2453 "bswap", /* name */
2454 OPTGROUP_NONE, /* optinfo_flags */
2455 TV_NONE, /* tv_id */
2456 PROP_ssa, /* properties_required */
2457 0, /* properties_provided */
2458 0, /* properties_destroyed */
2459 0, /* todo_flags_start */
2460 0, /* todo_flags_finish */
2463 class pass_optimize_bswap : public gimple_opt_pass
2465 public:
2466 pass_optimize_bswap (gcc::context *ctxt)
2467 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2470 /* opt_pass methods: */
2471 virtual bool gate (function *)
2473 return flag_expensive_optimizations && optimize;
2476 virtual unsigned int execute (function *);
2478 }; // class pass_optimize_bswap
2480 /* Perform the bswap optimization: replace the expression computed in the rhs
2481 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2482 Which of these alternatives replaces the rhs is given by N->base_addr (non
2483 null if a load is needed) and BSWAP. The type, VUSE and alias set of the
2484 load to perform are also given in N while the builtin bswap to invoke is
2485 given in FNDECL. Finally, if a load is involved, SRC_STMT refers to one of
2486 the load statements used to construct the rhs in CUR_STMT and N->range gives
2487 the size of the rhs expression for maintaining some statistics.
2489 Note that if the replacement involves a load, CUR_STMT is moved just after
2490 SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2491 moving to a different basic block. */
2493 static bool
2494 bswap_replace (gimple *cur_stmt, gimple *src_stmt, tree fndecl,
2495 tree bswap_type, tree load_type, struct symbolic_number *n,
2496 bool bswap)
2498 gimple_stmt_iterator gsi;
2499 tree src, tmp, tgt;
2500 gimple *bswap_stmt;
2502 gsi = gsi_for_stmt (cur_stmt);
2503 src = gimple_assign_rhs1 (src_stmt);
2504 tgt = gimple_assign_lhs (cur_stmt);
2506 /* Need to load the value from memory first. */
2507 if (n->base_addr)
2509 gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2510 tree addr_expr, addr_tmp, val_expr, val_tmp;
2511 tree load_offset_ptr, aligned_load_type;
2512 gimple *addr_stmt, *load_stmt;
2513 unsigned align;
2514 HOST_WIDE_INT load_offset = 0;
2516 align = get_object_alignment (src);
2517 /* If the new access is smaller than the original one, we need
2518 to perform big endian adjustment. */
2519 if (BYTES_BIG_ENDIAN)
2521 HOST_WIDE_INT bitsize, bitpos;
2522 machine_mode mode;
2523 int unsignedp, reversep, volatilep;
2524 tree offset;
2526 get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2527 &unsignedp, &reversep, &volatilep, false);
2528 if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2530 load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2531 unsigned HOST_WIDE_INT l
2532 = (load_offset * BITS_PER_UNIT) & (align - 1);
2533 if (l)
2534 align = l & -l;
2538 if (bswap
2539 && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2540 && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2541 return false;
2543 /* Move cur_stmt just before one of the loads of the original
2544 expression to ensure it has the same VUSE. See PR61517 for what
2545 could go wrong. */
2546 gsi_move_before (&gsi, &gsi_ins);
2547 gsi = gsi_for_stmt (cur_stmt);
2549 /* Compute address to load from and cast according to the size
2550 of the load. */
2551 addr_expr = build_fold_addr_expr (unshare_expr (src));
2552 if (is_gimple_mem_ref_addr (addr_expr))
2553 addr_tmp = addr_expr;
2554 else
2556 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2557 "load_src");
2558 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2559 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2562 /* Perform the load. */
2563 aligned_load_type = load_type;
2564 if (align < TYPE_ALIGN (load_type))
2565 aligned_load_type = build_aligned_type (load_type, align);
2566 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2567 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2568 load_offset_ptr);
2570 if (!bswap)
2572 if (n->range == 16)
2573 nop_stats.found_16bit++;
2574 else if (n->range == 32)
2575 nop_stats.found_32bit++;
2576 else
2578 gcc_assert (n->range == 64);
2579 nop_stats.found_64bit++;
2582 /* Convert the result of load if necessary. */
2583 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2585 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2586 "load_dst");
2587 load_stmt = gimple_build_assign (val_tmp, val_expr);
2588 gimple_set_vuse (load_stmt, n->vuse);
2589 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2590 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2592 else
2594 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2595 gimple_set_vuse (cur_stmt, n->vuse);
2597 update_stmt (cur_stmt);
2599 if (dump_file)
2601 fprintf (dump_file,
2602 "%d bit load in target endianness found at: ",
2603 (int) n->range);
2604 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2606 return true;
2608 else
2610 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2611 load_stmt = gimple_build_assign (val_tmp, val_expr);
2612 gimple_set_vuse (load_stmt, n->vuse);
2613 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2615 src = val_tmp;
2618 if (n->range == 16)
2619 bswap_stats.found_16bit++;
2620 else if (n->range == 32)
2621 bswap_stats.found_32bit++;
2622 else
2624 gcc_assert (n->range == 64);
2625 bswap_stats.found_64bit++;
2628 tmp = src;
2630 /* Convert the src expression if necessary. */
2631 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2633 gimple *convert_stmt;
2635 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2636 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2637 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2640 /* The canonical form for a 16-bit bswap is a rotate expression. Only 16-bit
2641 values are handled this way, because rotating a 2N-bit value by N bits is
2642 generally not equivalent to a bswap. Consider for instance 0x01020304 r>> 16,
2643 which gives 0x03040102, while a bswap for that value is 0x04030201. */
2644 if (bswap && n->range == 16)
2646 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2647 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2648 bswap_stmt = gimple_build_assign (NULL, src);
2650 else
2651 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2653 tmp = tgt;
2655 /* Convert the result if necessary. */
2656 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2658 gimple *convert_stmt;
2660 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2661 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2662 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2665 gimple_set_lhs (bswap_stmt, tmp);
2667 if (dump_file)
2669 fprintf (dump_file, "%d bit bswap implementation found at: ",
2670 (int) n->range);
2671 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2674 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2675 gsi_remove (&gsi, true);
2676 return true;
2679 /* Find manual byte swap implementations as well as loads in a given
2680 endianness. Byte swaps are turned into a bswap builtin invocation
2681 while endian loads are converted to a bswap builtin invocation or a
2682 simple load according to the target endianness. */
2684 unsigned int
2685 pass_optimize_bswap::execute (function *fun)
2687 basic_block bb;
2688 bool bswap32_p, bswap64_p;
2689 bool changed = false;
2690 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2692 if (BITS_PER_UNIT != 8)
2693 return 0;
2695 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2696 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2697 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2698 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2699 || (bswap32_p && word_mode == SImode)));
2701 /* Determine the argument type of the builtins. The code later on
2702 assumes that the return and argument type are the same. */
2703 if (bswap32_p)
2705 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2706 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2709 if (bswap64_p)
2711 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2712 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2715 memset (&nop_stats, 0, sizeof (nop_stats));
2716 memset (&bswap_stats, 0, sizeof (bswap_stats));
2718 FOR_EACH_BB_FN (bb, fun)
2720 gimple_stmt_iterator gsi;
2722 /* We do a reverse scan for bswap patterns to make sure we get the
2723 widest match. As bswap pattern matching doesn't handle previously
2724 inserted smaller bswap replacements as sub-patterns, the wider
2725 variant wouldn't be detected. */
2726 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2728 gimple *src_stmt, *cur_stmt = gsi_stmt (gsi);
2729 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2730 enum tree_code code;
2731 struct symbolic_number n;
2732 bool bswap;
2734 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2735 might be moved to a different basic block by bswap_replace and gsi
2736 must not point to it if that's the case. Moving the gsi_prev
2737 there makes sure that gsi points to the statement previous to
2738 cur_stmt while still making sure that all statements are
2739 considered in this basic block. */
2740 gsi_prev (&gsi);
2742 if (!is_gimple_assign (cur_stmt))
2743 continue;
2745 code = gimple_assign_rhs_code (cur_stmt);
2746 switch (code)
2748 case LROTATE_EXPR:
2749 case RROTATE_EXPR:
2750 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2751 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2752 % BITS_PER_UNIT)
2753 continue;
2754 /* Fall through. */
2755 case BIT_IOR_EXPR:
2756 break;
2757 default:
2758 continue;
2761 src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2763 if (!src_stmt)
2764 continue;
2766 switch (n.range)
2768 case 16:
2769 /* Already in canonical form, nothing to do. */
2770 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2771 continue;
2772 load_type = bswap_type = uint16_type_node;
2773 break;
2774 case 32:
2775 load_type = uint32_type_node;
2776 if (bswap32_p)
2778 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2779 bswap_type = bswap32_type;
2781 break;
2782 case 64:
2783 load_type = uint64_type_node;
2784 if (bswap64_p)
2786 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2787 bswap_type = bswap64_type;
2789 break;
2790 default:
2791 continue;
2794 if (bswap && !fndecl && n.range != 16)
2795 continue;
2797 if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2798 &n, bswap))
2799 changed = true;
2803 statistics_counter_event (fun, "16-bit nop implementations found",
2804 nop_stats.found_16bit);
2805 statistics_counter_event (fun, "32-bit nop implementations found",
2806 nop_stats.found_32bit);
2807 statistics_counter_event (fun, "64-bit nop implementations found",
2808 nop_stats.found_64bit);
2809 statistics_counter_event (fun, "16-bit bswap implementations found",
2810 bswap_stats.found_16bit);
2811 statistics_counter_event (fun, "32-bit bswap implementations found",
2812 bswap_stats.found_32bit);
2813 statistics_counter_event (fun, "64-bit bswap implementations found",
2814 bswap_stats.found_64bit);
2816 return (changed ? TODO_update_ssa : 0);
2819 } // anon namespace
2821 gimple_opt_pass *
2822 make_pass_optimize_bswap (gcc::context *ctxt)
2824 return new pass_optimize_bswap (ctxt);
2827 /* Return true if stmt is a type conversion operation that can be stripped
2828 when used in a widening multiply operation. */
2829 static bool
2830 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2832 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2834 if (TREE_CODE (result_type) == INTEGER_TYPE)
2836 tree op_type;
2837 tree inner_op_type;
2839 if (!CONVERT_EXPR_CODE_P (rhs_code))
2840 return false;
2842 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2844 /* If the type of OP has the same precision as the result, then
2845 we can strip this conversion. The multiply operation will be
2846 selected to create the correct extension as a by-product. */
2847 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2848 return true;
2850 /* We can also strip a conversion if it preserves the signed-ness of
2851 the operation and doesn't narrow the range. */
2852 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2854 /* If the inner-most type is unsigned, then we can strip any
2855 intermediate widening operation. If it's signed, then the
2856 intermediate widening operation must also be signed. */
2857 if ((TYPE_UNSIGNED (inner_op_type)
2858 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2859 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2860 return true;
2862 return false;
2865 return rhs_code == FIXED_CONVERT_EXPR;
2868 /* Return true if RHS is a suitable operand for a widening multiplication,
2869 assuming a target type of TYPE.
2870 There are two cases:
2872 - RHS makes some value at least twice as wide. Store that value
2873 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2875 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2876 but leave *TYPE_OUT untouched. */
2878 static bool
2879 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2880 tree *new_rhs_out)
2882 gimple *stmt;
2883 tree type1, rhs1;
2885 if (TREE_CODE (rhs) == SSA_NAME)
2887 stmt = SSA_NAME_DEF_STMT (rhs);
2888 if (is_gimple_assign (stmt))
2890 if (! widening_mult_conversion_strippable_p (type, stmt))
2891 rhs1 = rhs;
2892 else
2894 rhs1 = gimple_assign_rhs1 (stmt);
2896 if (TREE_CODE (rhs1) == INTEGER_CST)
2898 *new_rhs_out = rhs1;
2899 *type_out = NULL;
2900 return true;
2904 else
2905 rhs1 = rhs;
2907 type1 = TREE_TYPE (rhs1);
2909 if (TREE_CODE (type1) != TREE_CODE (type)
2910 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2911 return false;
2913 *new_rhs_out = rhs1;
2914 *type_out = type1;
2915 return true;
2918 if (TREE_CODE (rhs) == INTEGER_CST)
2920 *new_rhs_out = rhs;
2921 *type_out = NULL;
2922 return true;
2925 return false;
2928 /* Return true if STMT performs a widening multiplication, assuming the
2929 output type is TYPE. If so, store the unwidened types of the operands
2930 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2931 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2932 and *TYPE2_OUT would give the operands of the multiplication. */
2934 static bool
2935 is_widening_mult_p (gimple *stmt,
2936 tree *type1_out, tree *rhs1_out,
2937 tree *type2_out, tree *rhs2_out)
2939 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2941 if (TREE_CODE (type) != INTEGER_TYPE
2942 && TREE_CODE (type) != FIXED_POINT_TYPE)
2943 return false;
2945 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2946 rhs1_out))
2947 return false;
2949 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2950 rhs2_out))
2951 return false;
2953 if (*type1_out == NULL)
2955 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2956 return false;
2957 *type1_out = *type2_out;
2960 if (*type2_out == NULL)
2962 if (!int_fits_type_p (*rhs2_out, *type1_out))
2963 return false;
2964 *type2_out = *type1_out;
2967 /* Ensure that the larger of the two operands comes first. */
2968 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2970 std::swap (*type1_out, *type2_out);
2971 std::swap (*rhs1_out, *rhs2_out);
2974 return true;
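/* Hypothetical source-level illustration (names invented): both operands
   below are extensions from a type at most half as wide as the result, so
   is_widening_mult_p accepts the multiplication and convert_mult_to_widen
   can rewrite the MULT_EXPR as a WIDEN_MULT_EXPR on targets that provide a
   suitable widening multiply pattern:

     long long
     example_widen_mult (int a, int b)
     {
       return (long long) a * (long long) b;
     }
*/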
2977 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2978 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2979 value is true iff we converted the statement. */
2981 static bool
2982 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2984 tree lhs, rhs1, rhs2, type, type1, type2;
2985 enum insn_code handler;
2986 machine_mode to_mode, from_mode, actual_mode;
2987 optab op;
2988 int actual_precision;
2989 location_t loc = gimple_location (stmt);
2990 bool from_unsigned1, from_unsigned2;
2992 lhs = gimple_assign_lhs (stmt);
2993 type = TREE_TYPE (lhs);
2994 if (TREE_CODE (type) != INTEGER_TYPE)
2995 return false;
2997 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2998 return false;
3000 to_mode = TYPE_MODE (type);
3001 from_mode = TYPE_MODE (type1);
3002 from_unsigned1 = TYPE_UNSIGNED (type1);
3003 from_unsigned2 = TYPE_UNSIGNED (type2);
3005 if (from_unsigned1 && from_unsigned2)
3006 op = umul_widen_optab;
3007 else if (!from_unsigned1 && !from_unsigned2)
3008 op = smul_widen_optab;
3009 else
3010 op = usmul_widen_optab;
3012 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3013 0, &actual_mode);
3015 if (handler == CODE_FOR_nothing)
3017 if (op != smul_widen_optab)
3019 /* We can use a signed multiply with unsigned types as long as
3020 there is a wider mode to use, or it is the smaller of the two
3021 types that is unsigned. Note that type1 >= type2, always. */
3022 if ((TYPE_UNSIGNED (type1)
3023 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3024 || (TYPE_UNSIGNED (type2)
3025 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3027 from_mode = GET_MODE_WIDER_MODE (from_mode);
3028 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3029 return false;
3032 op = smul_widen_optab;
3033 handler = find_widening_optab_handler_and_mode (op, to_mode,
3034 from_mode, 0,
3035 &actual_mode);
3037 if (handler == CODE_FOR_nothing)
3038 return false;
3040 from_unsigned1 = from_unsigned2 = false;
3042 else
3043 return false;
3046 /* Ensure that the inputs to the handler are in the correct precision
3047 for the opcode. This will be the full mode size. */
3048 actual_precision = GET_MODE_PRECISION (actual_mode);
3049 if (2 * actual_precision > TYPE_PRECISION (type))
3050 return false;
3051 if (actual_precision != TYPE_PRECISION (type1)
3052 || from_unsigned1 != TYPE_UNSIGNED (type1))
3053 rhs1 = build_and_insert_cast (gsi, loc,
3054 build_nonstandard_integer_type
3055 (actual_precision, from_unsigned1), rhs1);
3056 if (actual_precision != TYPE_PRECISION (type2)
3057 || from_unsigned2 != TYPE_UNSIGNED (type2))
3058 rhs2 = build_and_insert_cast (gsi, loc,
3059 build_nonstandard_integer_type
3060 (actual_precision, from_unsigned2), rhs2);
3062 /* Handle constants. */
3063 if (TREE_CODE (rhs1) == INTEGER_CST)
3064 rhs1 = fold_convert (type1, rhs1);
3065 if (TREE_CODE (rhs2) == INTEGER_CST)
3066 rhs2 = fold_convert (type2, rhs2);
3068 gimple_assign_set_rhs1 (stmt, rhs1);
3069 gimple_assign_set_rhs2 (stmt, rhs2);
3070 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3071 update_stmt (stmt);
3072 widen_mul_stats.widen_mults_inserted++;
3073 return true;
3076 /* Process a single gimple statement STMT, which is found at the
3077 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3078 rhs (given by CODE), and try to convert it into a
3079 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3080 is true iff we converted the statement. */
3082 static bool
3083 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3084 enum tree_code code)
3086 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3087 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3088 tree type, type1, type2, optype;
3089 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3090 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3091 optab this_optab;
3092 enum tree_code wmult_code;
3093 enum insn_code handler;
3094 machine_mode to_mode, from_mode, actual_mode;
3095 location_t loc = gimple_location (stmt);
3096 int actual_precision;
3097 bool from_unsigned1, from_unsigned2;
3099 lhs = gimple_assign_lhs (stmt);
3100 type = TREE_TYPE (lhs);
3101 if (TREE_CODE (type) != INTEGER_TYPE
3102 && TREE_CODE (type) != FIXED_POINT_TYPE)
3103 return false;
3105 if (code == MINUS_EXPR)
3106 wmult_code = WIDEN_MULT_MINUS_EXPR;
3107 else
3108 wmult_code = WIDEN_MULT_PLUS_EXPR;
3110 rhs1 = gimple_assign_rhs1 (stmt);
3111 rhs2 = gimple_assign_rhs2 (stmt);
3113 if (TREE_CODE (rhs1) == SSA_NAME)
3115 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3116 if (is_gimple_assign (rhs1_stmt))
3117 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3120 if (TREE_CODE (rhs2) == SSA_NAME)
3122 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3123 if (is_gimple_assign (rhs2_stmt))
3124 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3127 /* Allow for one conversion statement between the multiply
3128 and addition/subtraction statement. If there are more than
3129 one conversions then we assume they would invalidate this
3130 transformation. If that's not the case then they should have
3131 been folded before now. */
3132 if (CONVERT_EXPR_CODE_P (rhs1_code))
3134 conv1_stmt = rhs1_stmt;
3135 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3136 if (TREE_CODE (rhs1) == SSA_NAME)
3138 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3139 if (is_gimple_assign (rhs1_stmt))
3140 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3142 else
3143 return false;
3145 if (CONVERT_EXPR_CODE_P (rhs2_code))
3147 conv2_stmt = rhs2_stmt;
3148 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3149 if (TREE_CODE (rhs2) == SSA_NAME)
3151 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3152 if (is_gimple_assign (rhs2_stmt))
3153 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3155 else
3156 return false;
3159 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3160 is_widening_mult_p, but we still need the rhs values returned.
3162 It might also appear that it would be sufficient to use the existing
3163 operands of the widening multiply, but that would limit the choice of
3164 multiply-and-accumulate instructions.
3166 If the widened-multiplication result has more than one use, it is
3167 probably wiser not to do the conversion. */
3168 if (code == PLUS_EXPR
3169 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3171 if (!has_single_use (rhs1)
3172 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3173 &type2, &mult_rhs2))
3174 return false;
3175 add_rhs = rhs2;
3176 conv_stmt = conv1_stmt;
3178 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3180 if (!has_single_use (rhs2)
3181 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3182 &type2, &mult_rhs2))
3183 return false;
3184 add_rhs = rhs1;
3185 conv_stmt = conv2_stmt;
3187 else
3188 return false;
3190 to_mode = TYPE_MODE (type);
3191 from_mode = TYPE_MODE (type1);
3192 from_unsigned1 = TYPE_UNSIGNED (type1);
3193 from_unsigned2 = TYPE_UNSIGNED (type2);
3194 optype = type1;
3196 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3197 if (from_unsigned1 != from_unsigned2)
3199 if (!INTEGRAL_TYPE_P (type))
3200 return false;
3201 /* We can use a signed multiply with unsigned types as long as
3202 there is a wider mode to use, or it is the smaller of the two
3203 types that is unsigned. Note that type1 >= type2, always. */
3204 if ((from_unsigned1
3205 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3206 || (from_unsigned2
3207 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3209 from_mode = GET_MODE_WIDER_MODE (from_mode);
3210 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3211 return false;
3214 from_unsigned1 = from_unsigned2 = false;
3215 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3216 false);
3219 /* If there was a conversion between the multiply and addition
3220 then we need to make sure it fits a multiply-and-accumulate.
3221 There should be a single mode change which does not change the
3222 value. */
3223 if (conv_stmt)
3225 /* We use the original, unmodified data types for this. */
3226 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3227 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3228 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3229 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3231 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3233 /* Conversion is a truncate. */
3234 if (TYPE_PRECISION (to_type) < data_size)
3235 return false;
3237 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3239 /* Conversion is an extend. Check it's the right sort. */
3240 if (TYPE_UNSIGNED (from_type) != is_unsigned
3241 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3242 return false;
3244 /* else convert is a no-op for our purposes. */
3247 /* Verify that the machine can perform a widening multiply
3248 accumulate in this mode/signedness combination, otherwise
3249 this transformation is likely to pessimize code. */
3250 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3251 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3252 from_mode, 0, &actual_mode);
3254 if (handler == CODE_FOR_nothing)
3255 return false;
3257 /* Ensure that the inputs to the handler are in the correct precision
3258 for the opcode. This will be the full mode size. */
3259 actual_precision = GET_MODE_PRECISION (actual_mode);
3260 if (actual_precision != TYPE_PRECISION (type1)
3261 || from_unsigned1 != TYPE_UNSIGNED (type1))
3262 mult_rhs1 = build_and_insert_cast (gsi, loc,
3263 build_nonstandard_integer_type
3264 (actual_precision, from_unsigned1),
3265 mult_rhs1);
3266 if (actual_precision != TYPE_PRECISION (type2)
3267 || from_unsigned2 != TYPE_UNSIGNED (type2))
3268 mult_rhs2 = build_and_insert_cast (gsi, loc,
3269 build_nonstandard_integer_type
3270 (actual_precision, from_unsigned2),
3271 mult_rhs2);
3273 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3274 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3276 /* Handle constants. */
3277 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3278 mult_rhs1 = fold_convert (type1, mult_rhs1);
3279 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3280 mult_rhs2 = fold_convert (type2, mult_rhs2);
3282 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3283 add_rhs);
3284 update_stmt (gsi_stmt (*gsi));
3285 widen_mul_stats.maccs_inserted++;
3286 return true;
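/* Hypothetical illustration (names invented): a widening multiplication
   feeding an addition, the shape convert_plusminus_to_widen looks for.  On
   targets with a widening multiply-accumulate pattern the PLUS_EXPR is
   rewritten into a WIDEN_MULT_PLUS_EXPR:

     long long
     example_widen_macc (long long acc, int a, int b)
     {
       return acc + (long long) a * b;
     }
*/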
3289 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3290 with uses in additions and subtractions to form fused multiply-add
3291 operations. Returns true if successful and MUL_STMT should be removed. */
3293 static bool
3294 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3296 tree mul_result = gimple_get_lhs (mul_stmt);
3297 tree type = TREE_TYPE (mul_result);
3298 gimple *use_stmt, *neguse_stmt;
3299 gassign *fma_stmt;
3300 use_operand_p use_p;
3301 imm_use_iterator imm_iter;
3303 if (FLOAT_TYPE_P (type)
3304 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3305 return false;
3307 /* We don't want to do bitfield reduction ops. */
3308 if (INTEGRAL_TYPE_P (type)
3309 && (TYPE_PRECISION (type)
3310 != GET_MODE_PRECISION (TYPE_MODE (type))))
3311 return false;
3313 /* If the target doesn't support it, don't generate it. We assume that
3314 if fma isn't available then fms, fnma or fnms are not either. */
3315 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3316 return false;
3318 /* If the multiplication has zero uses, it is probably kept around because
3319 of -fnon-call-exceptions. Don't optimize it away in that case;
3320 that is DCE's job. */
3321 if (has_zero_uses (mul_result))
3322 return false;
3324 /* Make sure that the multiplication statement becomes dead after
3325 the transformation, i.e. that all uses are transformed to FMAs.
3326 This means we assume that an FMA operation has the same cost
3327 as an addition. */
3328 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3330 enum tree_code use_code;
3331 tree result = mul_result;
3332 bool negate_p = false;
3334 use_stmt = USE_STMT (use_p);
3336 if (is_gimple_debug (use_stmt))
3337 continue;
3339 /* For now restrict these operations to single basic blocks. In theory
3340 we would want to support sinking the multiplication in
3341 m = a*b;
3342 if ()
3343 ma = m + c;
3344 else
3345 d = m;
3346 to form a fma in the then block and sink the multiplication to the
3347 else block. */
3348 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3349 return false;
3351 if (!is_gimple_assign (use_stmt))
3352 return false;
3354 use_code = gimple_assign_rhs_code (use_stmt);
3356 /* A negate on the multiplication leads to FNMA. */
3357 if (use_code == NEGATE_EXPR)
3359 ssa_op_iter iter;
3360 use_operand_p usep;
3362 result = gimple_assign_lhs (use_stmt);
3364 /* Make sure the negate statement becomes dead with this
3365 single transformation. */
3366 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3367 &use_p, &neguse_stmt))
3368 return false;
3370 /* Make sure the multiplication isn't also used on that stmt. */
3371 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3372 if (USE_FROM_PTR (usep) == mul_result)
3373 return false;
3375 /* Re-validate. */
3376 use_stmt = neguse_stmt;
3377 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3378 return false;
3379 if (!is_gimple_assign (use_stmt))
3380 return false;
3382 use_code = gimple_assign_rhs_code (use_stmt);
3383 negate_p = true;
3386 switch (use_code)
3388 case MINUS_EXPR:
3389 if (gimple_assign_rhs2 (use_stmt) == result)
3390 negate_p = !negate_p;
3391 break;
3392 case PLUS_EXPR:
3393 break;
3394 default:
3395 /* FMA can only be formed from PLUS and MINUS. */
3396 return false;
3399 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3400 by a MULT_EXPR that we'll visit later, we might be able to
3401 get a more profitable match with fnma.
3402 OTOH, if we don't, a negate / fma pair has likely lower latency
3403 than a mult / subtract pair. */
3404 if (use_code == MINUS_EXPR && !negate_p
3405 && gimple_assign_rhs1 (use_stmt) == result
3406 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3407 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3409 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3411 if (TREE_CODE (rhs2) == SSA_NAME)
3413 gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3414 if (has_single_use (rhs2)
3415 && is_gimple_assign (stmt2)
3416 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3417 return false;
3421 /* We can't handle a * b + a * b. */
3422 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3423 return false;
3425 /* While it is possible to validate whether or not the exact form
3426 that we've recognized is available in the backend, the assumption
3427 is that the transformation is never a loss. For instance, suppose
3428 the target only has the plain FMA pattern available. Consider
3429 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3430 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3431 still have 3 operations, but in the FMA form the two NEGs are
3432 independent and could be run in parallel. */
3435 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3437 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3438 enum tree_code use_code;
3439 tree addop, mulop1 = op1, result = mul_result;
3440 bool negate_p = false;
3442 if (is_gimple_debug (use_stmt))
3443 continue;
3445 use_code = gimple_assign_rhs_code (use_stmt);
3446 if (use_code == NEGATE_EXPR)
3448 result = gimple_assign_lhs (use_stmt);
3449 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3450 gsi_remove (&gsi, true);
3451 release_defs (use_stmt);
3453 use_stmt = neguse_stmt;
3454 gsi = gsi_for_stmt (use_stmt);
3455 use_code = gimple_assign_rhs_code (use_stmt);
3456 negate_p = true;
3459 if (gimple_assign_rhs1 (use_stmt) == result)
3461 addop = gimple_assign_rhs2 (use_stmt);
3462 /* a * b - c -> a * b + (-c) */
3463 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3464 addop = force_gimple_operand_gsi (&gsi,
3465 build1 (NEGATE_EXPR,
3466 type, addop),
3467 true, NULL_TREE, true,
3468 GSI_SAME_STMT);
3470 else
3472 addop = gimple_assign_rhs1 (use_stmt);
3473 /* a - b * c -> (-b) * c + a */
3474 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3475 negate_p = !negate_p;
3478 if (negate_p)
3479 mulop1 = force_gimple_operand_gsi (&gsi,
3480 build1 (NEGATE_EXPR,
3481 type, mulop1),
3482 true, NULL_TREE, true,
3483 GSI_SAME_STMT);
3485 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3486 FMA_EXPR, mulop1, op2, addop);
3487 gsi_replace (&gsi, fma_stmt, true);
3488 widen_mul_stats.fmas_inserted++;
3491 return true;
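/* Hypothetical illustration of the FMA formation above (names invented):
   when FP contraction is enabled (e.g. -ffp-contract=fast) and the target
   has an fma pattern, every add/subtract use of the multiplication result
   is rewritten into an FMA_EXPR:

     double
     example_fma (double a, double b, double c)
     {
       return a * b + c;	/* becomes fma (a, b, c)  */
     }

   Likewise a * b - c becomes fma (a, b, -c) and -(a * b) - c becomes
   fma (-a, b, -c), as discussed in the comment above.  */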
3494 /* Find integer multiplications where the operands are extended from
3495 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3496 where appropriate. */
3498 namespace {
3500 const pass_data pass_data_optimize_widening_mul =
3502 GIMPLE_PASS, /* type */
3503 "widening_mul", /* name */
3504 OPTGROUP_NONE, /* optinfo_flags */
3505 TV_NONE, /* tv_id */
3506 PROP_ssa, /* properties_required */
3507 0, /* properties_provided */
3508 0, /* properties_destroyed */
3509 0, /* todo_flags_start */
3510 TODO_update_ssa, /* todo_flags_finish */
3513 class pass_optimize_widening_mul : public gimple_opt_pass
3515 public:
3516 pass_optimize_widening_mul (gcc::context *ctxt)
3517 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3520 /* opt_pass methods: */
3521 virtual bool gate (function *)
3523 return flag_expensive_optimizations && optimize;
3526 virtual unsigned int execute (function *);
3528 }; // class pass_optimize_widening_mul
3530 unsigned int
3531 pass_optimize_widening_mul::execute (function *fun)
3533 basic_block bb;
3534 bool cfg_changed = false;
3536 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3538 FOR_EACH_BB_FN (bb, fun)
3540 gimple_stmt_iterator gsi;
3542 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3544 gimple *stmt = gsi_stmt (gsi);
3545 enum tree_code code;
3547 if (is_gimple_assign (stmt))
3549 code = gimple_assign_rhs_code (stmt);
3550 switch (code)
3552 case MULT_EXPR:
3553 if (!convert_mult_to_widen (stmt, &gsi)
3554 && convert_mult_to_fma (stmt,
3555 gimple_assign_rhs1 (stmt),
3556 gimple_assign_rhs2 (stmt)))
3558 gsi_remove (&gsi, true);
3559 release_defs (stmt);
3560 continue;
3562 break;
3564 case PLUS_EXPR:
3565 case MINUS_EXPR:
3566 convert_plusminus_to_widen (&gsi, stmt, code);
3567 break;
3569 default:;
3572 else if (is_gimple_call (stmt)
3573 && gimple_call_lhs (stmt))
3575 tree fndecl = gimple_call_fndecl (stmt);
3576 if (fndecl
3577 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3579 switch (DECL_FUNCTION_CODE (fndecl))
3581 case BUILT_IN_POWF:
3582 case BUILT_IN_POW:
3583 case BUILT_IN_POWL:
3584 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3585 && real_equal
3586 (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3587 &dconst2)
3588 && convert_mult_to_fma (stmt,
3589 gimple_call_arg (stmt, 0),
3590 gimple_call_arg (stmt, 0)))
3592 unlink_stmt_vdef (stmt);
3593 if (gsi_remove (&gsi, true)
3594 && gimple_purge_dead_eh_edges (bb))
3595 cfg_changed = true;
3596 release_defs (stmt);
3597 continue;
3599 break;
3601 default:;
3605 gsi_next (&gsi);
3609 statistics_counter_event (fun, "widening multiplications inserted",
3610 widen_mul_stats.widen_mults_inserted);
3611 statistics_counter_event (fun, "widening maccs inserted",
3612 widen_mul_stats.maccs_inserted);
3613 statistics_counter_event (fun, "fused multiply-adds inserted",
3614 widen_mul_stats.fmas_inserted);
3616 return cfg_changed ? TODO_cleanup_cfg : 0;
3619 } // anon namespace
3621 gimple_opt_pass *
3622 make_pass_optimize_widening_mul (gcc::context *ctxt)
3624 return new pass_optimize_widening_mul (ctxt);