/* Global, SSA-based optimizations using mathematical identities.
   Copyright (C) 2005-2014 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* Currently, the only mini-pass in this file tries to CSE reciprocal
   operations.  These are common in sequences such as this one:

	modulus = sqrt(x*x + y*y + z*z);
	x = x / modulus;
	y = y / modulus;
	z = z / modulus;

   that can be optimized to

	modulus = sqrt(x*x + y*y + z*z);
	rmodulus = 1.0 / modulus;
	x = x * rmodulus;
	y = y * rmodulus;
	z = z * rmodulus;

   We do this for loop invariant divisors, and with this pass whenever
   we notice that a division has the same divisor multiple times.

   Of course, like in PRE, we don't insert a division if a dominator
   already has one.  However, this cannot be done as an extension of
   PRE for several reasons.

   First of all, with some experiments it was found out that the
   transformation is not always useful if there are only two divisions
   by the same divisor.  This is probably because modern processors
   can pipeline the divisions; on older, in-order processors it should
   still be effective to optimize two divisions by the same number.
   We make this a param, and it shall be called N in the remainder of
   this comment.

   Second, if trapping math is active, we have less freedom on where
   to insert divisions: we can only do so in basic blocks that already
   contain one.  (If divisions don't trap, we can instead insert
   divisions elsewhere, which will be in blocks that are common dominators
   of those that have the division).

   We really don't want to compute the reciprocal unless a division will
   be found.  To do this, we won't insert the division in a basic block
   that has less than N divisions *post-dominating* it.

   The algorithm constructs a subset of the dominator tree, holding the
   blocks containing the divisions and the common dominators to them,
   and walks it twice.  The first walk is in post-order, and it annotates
   each block with the number of divisions that post-dominate it: this
   gives information on where divisions can be inserted profitably.
   The second walk is in pre-order, and it inserts divisions as explained
   above, and replaces divisions by multiplications.

   In the best case, the cost of the pass is O(n_statements).  In the
   worst case, the cost is due to creating the dominator tree subset,
   with a cost of O(n_basic_blocks ^ 2); however this can only happen
   for n_statements / n_basic_blocks statements.  So, the amortized cost
   of creating the dominator tree subset is O(n_basic_blocks) and the
   worst-case cost of the pass is O(n_statements * n_basic_blocks).

   More practically, the cost will be small because there are few
   divisions, and they tend to be in the same basic block, so insert_bb
   is called very few times.

   If we did this using domwalk.c, an efficient implementation would have
   to work on all the variables in a single pass, because we could not
   work on just a subset of the dominator tree, as we do now, and the
   cost would also be something like O(n_statements * n_basic_blocks).
   The data structures would be more complex in order to work on all the
   variables in a single pass.  */
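
/* Worked example of the insertion rule above (editorial illustration;
   the block names are hypothetical).  Suppose N = 3 and the divisor d
   is used in divisions in blocks B2, B3 and B4, all of which
   post-dominate their common dominator B1.  The post-order walk
   annotates B1 with 3 >= N divisions, so the pre-order walk inserts
   t = 1.0 / d in B1 (or, if trapping math is active, in the first
   block that already contains a division by d), and the three
   divisions become multiplications by t.  */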
#include "coretypes.h"
#include "hard-reg-set.h"
#include "function.h"
#include "dominance.h"
#include "basic-block.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-fold.h"
#include "gimple-expr.h"
#include "gimple-iterator.h"
#include "gimplify.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "gimple-ssa.h"
#include "tree-cfg.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
#include "tree-pass.h"
#include "alloc-pool.h"
#include "gimple-pretty-print.h"
#include "builtins.h"

/* FIXME: RTL headers have to be included here for optabs.  */
#include "rtl.h"		/* Because optabs.h wants enum rtx_code.  */
#include "expr.h"		/* Because optabs.h wants sepops.  */
#include "insn-codes.h"
/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  */
struct occurrence {
  /* The basic block represented by this structure.  */
  basic_block bb;

  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
     inserted in BB.  */
  tree recip_def;

  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
     was inserted in BB.  */
  gimple recip_def_stmt;

  /* Pointer to a list of "struct occurrence"s for blocks dominated
     by BB.  */
  struct occurrence *children;

  /* Pointer to the next "struct occurrence"s in the list of blocks
     sharing a common dominator.  */
  struct occurrence *next;

  /* The number of divisions that are in BB before compute_merit.  The
     number of divisions that are in BB or post-dominate it after
     compute_merit.  */
  int num_divisions;

  /* True if the basic block has a division, false if it is a common
     dominator for basic blocks that do.  If it is false and trapping
     math is active, BB is not a candidate for inserting a reciprocal.  */
  bool bb_has_division;
};
static struct
{
  /* Number of 1.0/X ops inserted.  */
  int rdivs_inserted;

  /* Number of 1.0/FUNC ops inserted.  */
  int rfuncs_inserted;
} reciprocal_stats;

static struct
{
  /* Number of cexpi calls inserted.  */
  int inserted;
} sincos_stats;

static struct
{
  /* Number of hand-written 16-bit nop / bswaps found.  */
  int found_16bit;

  /* Number of hand-written 32-bit nop / bswaps found.  */
  int found_32bit;

  /* Number of hand-written 64-bit nop / bswaps found.  */
  int found_64bit;
} nop_stats, bswap_stats;

static struct
{
  /* Number of widening multiplication ops inserted.  */
  int widen_mults_inserted;

  /* Number of integer multiply-and-accumulate ops inserted.  */
  int maccs_inserted;

  /* Number of fp fused multiply-add ops inserted.  */
  int fmas_inserted;
} widen_mul_stats;

/* The instance of "struct occurrence" representing the highest
   interesting block in the dominator tree.  */
static struct occurrence *occ_head;

/* Allocation pool for getting instances of "struct occurrence".  */
static alloc_pool occ_pool;
/* Allocate and return a new struct occurrence for basic block BB, and
   whose children list is headed by CHILDREN.  */

static struct occurrence *
occ_new (basic_block bb, struct occurrence *children)
{
  struct occurrence *occ;

  bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
  memset (occ, 0, sizeof (struct occurrence));

  occ->bb = bb;
  occ->children = children;
  return occ;
}
233 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
234 list of "struct occurrence"s, one per basic block, having IDOM as
235 their common dominator.
237 We try to insert NEW_OCC as deep as possible in the tree, and we also
238 insert any other block that is a common dominator for BB and one
239 block already in the tree. */
242 insert_bb (struct occurrence
*new_occ
, basic_block idom
,
243 struct occurrence
**p_head
)
245 struct occurrence
*occ
, **p_occ
;
247 for (p_occ
= p_head
; (occ
= *p_occ
) != NULL
; )
249 basic_block bb
= new_occ
->bb
, occ_bb
= occ
->bb
;
250 basic_block dom
= nearest_common_dominator (CDI_DOMINATORS
, occ_bb
, bb
);
/* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
   from its list.  */
256 occ
->next
= new_occ
->children
;
257 new_occ
->children
= occ
;
259 /* Try the next block (it may as well be dominated by BB). */
262 else if (dom
== occ_bb
)
264 /* OCC_BB dominates BB. Tail recurse to look deeper. */
265 insert_bb (new_occ
, dom
, &occ
->children
);
269 else if (dom
!= idom
)
271 gcc_assert (!dom
->aux
);
/* There is a dominator between IDOM and BB, add it and make
   two children out of NEW_OCC and OCC.  First, remove OCC from
   its list.  */
280 /* None of the previous blocks has DOM as a dominator: if we tail
281 recursed, we would reexamine them uselessly. Just switch BB with
282 DOM, and go on looking for blocks dominated by DOM. */
283 new_occ
= occ_new (dom
, new_occ
);
288 /* Nothing special, go on with the next element. */
293 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
294 new_occ
->next
= *p_head
;
/* Register that we found a division in BB.  */

static inline void
register_division_in (basic_block bb)
{
  struct occurrence *occ;

  occ = (struct occurrence *) bb->aux;
  if (!occ)
    {
      occ = occ_new (bb, NULL);
      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
    }

  occ->bb_has_division = true;
  occ->num_divisions++;
}
/* Compute the number of divisions that postdominate each block in OCC and
   its children.  */
321 compute_merit (struct occurrence
*occ
)
323 struct occurrence
*occ_child
;
324 basic_block dom
= occ
->bb
;
326 for (occ_child
= occ
->children
; occ_child
; occ_child
= occ_child
->next
)
329 if (occ_child
->children
)
330 compute_merit (occ_child
);
333 bb
= single_noncomplex_succ (dom
);
337 if (dominated_by_p (CDI_POST_DOMINATORS
, bb
, occ_child
->bb
))
338 occ
->num_divisions
+= occ_child
->num_divisions
;
/* Return whether USE_STMT is a floating-point division by DEF.  */

static inline bool
is_division_by (gimple use_stmt, tree def)
{
  return is_gimple_assign (use_stmt)
	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
	 && gimple_assign_rhs2 (use_stmt) == def
	 /* Do not recognize x / x as valid division, as we are getting
	    confused later by replacing all immediate uses x in such
	    a stmt.  */
	 && gimple_assign_rhs1 (use_stmt) != def;
}
/* Walk the subset of the dominator tree rooted at OCC, setting the
   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
   the given basic block.  The field may be left NULL, of course,
   if it is not possible or profitable to do the optimization.

   DEF_GSI is an iterator pointing at the statement defining DEF.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.  */
366 insert_reciprocals (gimple_stmt_iterator
*def_gsi
, struct occurrence
*occ
,
367 tree def
, tree recip_def
, int threshold
)
371 gimple_stmt_iterator gsi
;
372 struct occurrence
*occ_child
;
375 && (occ
->bb_has_division
|| !flag_trapping_math
)
376 && occ
->num_divisions
>= threshold
)
378 /* Make a variable with the replacement and substitute it. */
379 type
= TREE_TYPE (def
);
380 recip_def
= create_tmp_reg (type
, "reciptmp");
381 new_stmt
= gimple_build_assign_with_ops (RDIV_EXPR
, recip_def
,
382 build_one_cst (type
), def
);
384 if (occ
->bb_has_division
)
386 /* Case 1: insert before an existing division. */
387 gsi
= gsi_after_labels (occ
->bb
);
388 while (!gsi_end_p (gsi
) && !is_division_by (gsi_stmt (gsi
), def
))
391 gsi_insert_before (&gsi
, new_stmt
, GSI_SAME_STMT
);
393 else if (def_gsi
&& occ
->bb
== def_gsi
->bb
)
395 /* Case 2: insert right after the definition. Note that this will
396 never happen if the definition statement can throw, because in
397 that case the sole successor of the statement's basic block will
398 dominate all the uses as well. */
399 gsi_insert_after (def_gsi
, new_stmt
, GSI_NEW_STMT
);
403 /* Case 3: insert in a basic block not containing defs/uses. */
404 gsi
= gsi_after_labels (occ
->bb
);
405 gsi_insert_before (&gsi
, new_stmt
, GSI_SAME_STMT
);
408 reciprocal_stats
.rdivs_inserted
++;
410 occ
->recip_def_stmt
= new_stmt
;
413 occ
->recip_def
= recip_def
;
414 for (occ_child
= occ
->children
; occ_child
; occ_child
= occ_child
->next
)
415 insert_reciprocals (def_gsi
, occ_child
, def
, recip_def
, threshold
);
/* Replace the division at USE_P with a multiplication by the reciprocal, if
   possible.  */
423 replace_reciprocal (use_operand_p use_p
)
425 gimple use_stmt
= USE_STMT (use_p
);
426 basic_block bb
= gimple_bb (use_stmt
);
427 struct occurrence
*occ
= (struct occurrence
*) bb
->aux
;
429 if (optimize_bb_for_speed_p (bb
)
430 && occ
->recip_def
&& use_stmt
!= occ
->recip_def_stmt
)
432 gimple_stmt_iterator gsi
= gsi_for_stmt (use_stmt
);
433 gimple_assign_set_rhs_code (use_stmt
, MULT_EXPR
);
434 SET_USE (use_p
, occ
->recip_def
);
435 fold_stmt_inplace (&gsi
);
436 update_stmt (use_stmt
);
441 /* Free OCC and return one more "struct occurrence" to be freed. */
443 static struct occurrence
*
444 free_bb (struct occurrence
*occ
)
446 struct occurrence
*child
, *next
;
448 /* First get the two pointers hanging off OCC. */
450 child
= occ
->children
;
452 pool_free (occ_pool
, occ
);
454 /* Now ensure that we don't recurse unless it is necessary. */
460 next
= free_bb (next
);
467 /* Look for floating-point divisions among DEF's uses, and try to
468 replace them by multiplications with the reciprocal. Add
469 as many statements computing the reciprocal as needed.
471 DEF must be a GIMPLE register of a floating-point type. */
474 execute_cse_reciprocals_1 (gimple_stmt_iterator
*def_gsi
, tree def
)
477 imm_use_iterator use_iter
;
478 struct occurrence
*occ
;
479 int count
= 0, threshold
;
481 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def
)) && is_gimple_reg (def
));
483 FOR_EACH_IMM_USE_FAST (use_p
, use_iter
, def
)
485 gimple use_stmt
= USE_STMT (use_p
);
486 if (is_division_by (use_stmt
, def
))
488 register_division_in (gimple_bb (use_stmt
));
493 /* Do the expensive part only if we can hope to optimize something. */
494 threshold
= targetm
.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def
)));
495 if (count
>= threshold
)
498 for (occ
= occ_head
; occ
; occ
= occ
->next
)
501 insert_reciprocals (def_gsi
, occ
, def
, NULL
, threshold
);
504 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, def
)
506 if (is_division_by (use_stmt
, def
))
508 FOR_EACH_IMM_USE_ON_STMT (use_p
, use_iter
)
509 replace_reciprocal (use_p
);
514 for (occ
= occ_head
; occ
; )
520 /* Go through all the floating-point SSA_NAMEs, and call
521 execute_cse_reciprocals_1 on each of them. */
524 const pass_data pass_data_cse_reciprocals
=
526 GIMPLE_PASS
, /* type */
528 OPTGROUP_NONE
, /* optinfo_flags */
530 PROP_ssa
, /* properties_required */
531 0, /* properties_provided */
532 0, /* properties_destroyed */
533 0, /* todo_flags_start */
534 TODO_update_ssa
, /* todo_flags_finish */
537 class pass_cse_reciprocals
: public gimple_opt_pass
540 pass_cse_reciprocals (gcc::context
*ctxt
)
541 : gimple_opt_pass (pass_data_cse_reciprocals
, ctxt
)
544 /* opt_pass methods: */
545 virtual bool gate (function
*) { return optimize
&& flag_reciprocal_math
; }
546 virtual unsigned int execute (function
*);
548 }; // class pass_cse_reciprocals
551 pass_cse_reciprocals::execute (function
*fun
)
556 occ_pool
= create_alloc_pool ("dominators for recip",
557 sizeof (struct occurrence
),
558 n_basic_blocks_for_fn (fun
) / 3 + 1);
560 memset (&reciprocal_stats
, 0, sizeof (reciprocal_stats
));
561 calculate_dominance_info (CDI_DOMINATORS
);
562 calculate_dominance_info (CDI_POST_DOMINATORS
);
564 #ifdef ENABLE_CHECKING
565 FOR_EACH_BB_FN (bb
, fun
)
566 gcc_assert (!bb
->aux
);
569 for (arg
= DECL_ARGUMENTS (fun
->decl
); arg
; arg
= DECL_CHAIN (arg
))
570 if (FLOAT_TYPE_P (TREE_TYPE (arg
))
571 && is_gimple_reg (arg
))
573 tree name
= ssa_default_def (fun
, arg
);
575 execute_cse_reciprocals_1 (NULL
, name
);
578 FOR_EACH_BB_FN (bb
, fun
)
582 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
585 gphi
*phi
= gsi
.phi ();
586 def
= PHI_RESULT (phi
);
587 if (! virtual_operand_p (def
)
588 && FLOAT_TYPE_P (TREE_TYPE (def
)))
589 execute_cse_reciprocals_1 (NULL
, def
);
592 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
595 gimple stmt
= gsi_stmt (gsi
);
597 if (gimple_has_lhs (stmt
)
598 && (def
= SINGLE_SSA_TREE_OPERAND (stmt
, SSA_OP_DEF
)) != NULL
599 && FLOAT_TYPE_P (TREE_TYPE (def
))
600 && TREE_CODE (def
) == SSA_NAME
)
601 execute_cse_reciprocals_1 (&gsi
, def
);
604 if (optimize_bb_for_size_p (bb
))
607 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
608 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
611 gimple stmt
= gsi_stmt (gsi
);
614 if (is_gimple_assign (stmt
)
615 && gimple_assign_rhs_code (stmt
) == RDIV_EXPR
)
617 tree arg1
= gimple_assign_rhs2 (stmt
);
620 if (TREE_CODE (arg1
) != SSA_NAME
)
623 stmt1
= SSA_NAME_DEF_STMT (arg1
);
625 if (is_gimple_call (stmt1
)
626 && gimple_call_lhs (stmt1
)
627 && (fndecl
= gimple_call_fndecl (stmt1
))
628 && (DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_NORMAL
629 || DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_MD
))
631 enum built_in_function code
;
636 code
= DECL_FUNCTION_CODE (fndecl
);
637 md_code
= DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_MD
;
639 fndecl
= targetm
.builtin_reciprocal (code
, md_code
, false);
/* Check that all uses of the SSA name are divisions,
   otherwise replacing the defining statement will do
   the wrong thing.  */
647 FOR_EACH_IMM_USE_FAST (use_p
, ui
, arg1
)
649 gimple stmt2
= USE_STMT (use_p
);
650 if (is_gimple_debug (stmt2
))
652 if (!is_gimple_assign (stmt2
)
653 || gimple_assign_rhs_code (stmt2
) != RDIV_EXPR
654 || gimple_assign_rhs1 (stmt2
) == arg1
655 || gimple_assign_rhs2 (stmt2
) != arg1
)
664 gimple_replace_ssa_lhs (stmt1
, arg1
);
665 gimple_call_set_fndecl (stmt1
, fndecl
);
667 reciprocal_stats
.rfuncs_inserted
++;
669 FOR_EACH_IMM_USE_STMT (stmt
, ui
, arg1
)
671 gimple_stmt_iterator gsi
= gsi_for_stmt (stmt
);
672 gimple_assign_set_rhs_code (stmt
, MULT_EXPR
);
673 fold_stmt_inplace (&gsi
);
681 statistics_counter_event (fun
, "reciprocal divs inserted",
682 reciprocal_stats
.rdivs_inserted
);
683 statistics_counter_event (fun
, "reciprocal functions inserted",
684 reciprocal_stats
.rfuncs_inserted
);
686 free_dominance_info (CDI_DOMINATORS
);
687 free_dominance_info (CDI_POST_DOMINATORS
);
688 free_alloc_pool (occ_pool
);
695 make_pass_cse_reciprocals (gcc::context
*ctxt
)
697 return new pass_cse_reciprocals (ctxt
);
700 /* Records an occurrence at statement USE_STMT in the vector of trees
701 STMTS if it is dominated by *TOP_BB or dominates it or this basic block
702 is not yet initialized. Returns true if the occurrence was pushed on
703 the vector. Adjusts *TOP_BB to be the basic block dominating all
704 statements in the vector. */
707 maybe_record_sincos (vec
<gimple
> *stmts
,
708 basic_block
*top_bb
, gimple use_stmt
)
710 basic_block use_bb
= gimple_bb (use_stmt
);
712 && (*top_bb
== use_bb
713 || dominated_by_p (CDI_DOMINATORS
, use_bb
, *top_bb
)))
714 stmts
->safe_push (use_stmt
);
716 || dominated_by_p (CDI_DOMINATORS
, *top_bb
, use_bb
))
718 stmts
->safe_push (use_stmt
);
727 /* Look for sin, cos and cexpi calls with the same argument NAME and
728 create a single call to cexpi CSEing the result in this case.
729 We first walk over all immediate uses of the argument collecting
730 statements that we can CSE in a vector and in a second pass replace
731 the statement rhs with a REALPART or IMAGPART expression on the
732 result of the cexpi call we insert before the use statement that
733 dominates all other candidates. */
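
/* Editorial illustration (not part of the original sources): for uses
   like

     s = sinf (a);
     c = cosf (a);

   the pass emits a single cexpi call dominating both uses and rewrites
   the old calls into component extractions, roughly:

     sincostmp = __builtin_cexpif (a);
     s = IMAGPART_EXPR <sincostmp>;
     c = REALPART_EXPR <sincostmp>;  */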
736 execute_cse_sincos_1 (tree name
)
738 gimple_stmt_iterator gsi
;
739 imm_use_iterator use_iter
;
740 tree fndecl
, res
, type
;
741 gimple def_stmt
, use_stmt
, stmt
;
742 int seen_cos
= 0, seen_sin
= 0, seen_cexpi
= 0;
743 vec
<gimple
> stmts
= vNULL
;
744 basic_block top_bb
= NULL
;
746 bool cfg_changed
= false;
748 type
= TREE_TYPE (name
);
749 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, name
)
751 if (gimple_code (use_stmt
) != GIMPLE_CALL
752 || !gimple_call_lhs (use_stmt
)
753 || !(fndecl
= gimple_call_fndecl (use_stmt
))
754 || DECL_BUILT_IN_CLASS (fndecl
) != BUILT_IN_NORMAL
)
757 switch (DECL_FUNCTION_CODE (fndecl
))
759 CASE_FLT_FN (BUILT_IN_COS
):
760 seen_cos
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
763 CASE_FLT_FN (BUILT_IN_SIN
):
764 seen_sin
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
767 CASE_FLT_FN (BUILT_IN_CEXPI
):
768 seen_cexpi
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
775 if (seen_cos
+ seen_sin
+ seen_cexpi
<= 1)
781 /* Simply insert cexpi at the beginning of top_bb but not earlier than
782 the name def statement. */
783 fndecl
= mathfn_built_in (type
, BUILT_IN_CEXPI
);
786 stmt
= gimple_build_call (fndecl
, 1, name
);
787 res
= make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl
)), stmt
, "sincostmp");
788 gimple_call_set_lhs (stmt
, res
);
790 def_stmt
= SSA_NAME_DEF_STMT (name
);
791 if (!SSA_NAME_IS_DEFAULT_DEF (name
)
792 && gimple_code (def_stmt
) != GIMPLE_PHI
793 && gimple_bb (def_stmt
) == top_bb
)
795 gsi
= gsi_for_stmt (def_stmt
);
796 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
800 gsi
= gsi_after_labels (top_bb
);
801 gsi_insert_before (&gsi
, stmt
, GSI_SAME_STMT
);
803 sincos_stats
.inserted
++;
805 /* And adjust the recorded old call sites. */
806 for (i
= 0; stmts
.iterate (i
, &use_stmt
); ++i
)
809 fndecl
= gimple_call_fndecl (use_stmt
);
811 switch (DECL_FUNCTION_CODE (fndecl
))
813 CASE_FLT_FN (BUILT_IN_COS
):
814 rhs
= fold_build1 (REALPART_EXPR
, type
, res
);
817 CASE_FLT_FN (BUILT_IN_SIN
):
818 rhs
= fold_build1 (IMAGPART_EXPR
, type
, res
);
821 CASE_FLT_FN (BUILT_IN_CEXPI
):
829 /* Replace call with a copy. */
830 stmt
= gimple_build_assign (gimple_call_lhs (use_stmt
), rhs
);
832 gsi
= gsi_for_stmt (use_stmt
);
833 gsi_replace (&gsi
, stmt
, true);
834 if (gimple_purge_dead_eh_edges (gimple_bb (stmt
)))
843 /* To evaluate powi(x,n), the floating point value x raised to the
844 constant integer exponent n, we use a hybrid algorithm that
845 combines the "window method" with look-up tables. For an
846 introduction to exponentiation algorithms and "addition chains",
847 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
848 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
849 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
850 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
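
/* Editorial illustration (not part of GCC): the plain square-and-multiply
   scheme that the window method below improves upon.  It performs one
   squaring per bit of N plus one extra multiply per set bit, which is
   where worst-case bounds of the form 2*bits(n) - 2 (see POWI_MAX_MULTS
   below) come from.  The function name is hypothetical.  */

static inline double
powi_binary_sketch (double x, unsigned long n)
{
  double result = 1.0;
  for (; n; n >>= 1)
    {
      if (n & 1)
	result *= x;	/* Multiply in the current power of x.  */
      if (n > 1)
	x *= x;		/* Square for the next bit.  */
    }
  return result;
}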
852 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
853 multiplications to inline before calling the system library's pow
854 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
855 so this default never requires calling pow, powf or powl. */
857 #ifndef POWI_MAX_MULTS
858 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
861 /* The size of the "optimal power tree" lookup table. All
862 exponents less than this value are simply looked up in the
863 powi_table below. This threshold is also used to size the
864 cache of pseudo registers that hold intermediate results. */
865 #define POWI_TABLE_SIZE 256
867 /* The size, in bits of the window, used in the "window method"
868 exponentiation algorithm. This is equivalent to a radix of
869 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
870 #define POWI_WINDOW_SIZE 3
872 /* The following table is an efficient representation of an
873 "optimal power tree". For each value, i, the corresponding
value, j, in the table states that an optimal evaluation
875 sequence for calculating pow(x,i) can be found by evaluating
876 pow(x,j)*pow(x,i-j). An optimal power tree for the first
877 100 integers is given in Knuth's "Seminumerical algorithms". */
879 static const unsigned char powi_table
[POWI_TABLE_SIZE
] =
881 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
882 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
883 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
884 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
885 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
886 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
887 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
888 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
889 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
890 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
891 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
892 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
893 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
894 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
895 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
896 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
897 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
898 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
899 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
900 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
901 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
902 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
903 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
904 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
905 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
906 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
907 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
908 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
909 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
910 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
911 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
912 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
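
/* Worked example (editorial): to expand powi (x, 21) the table is
   consulted recursively.  powi_table[21] = 12, so x**21 = x**12 * x**9;
   powi_table[12] = 6 and powi_table[9] = 6 share the x**6 subterm;
   powi_table[6] = 3 and powi_table[3] = 2 bottom out at x**2 = x * x.
   With the intermediate results cached this needs only six
   multiplications:

     x2 = x * x;    x3 = x2 * x;    x6 = x3 * x3;
     x9 = x6 * x3;  x12 = x6 * x6;  x21 = x12 * x9;  */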
916 /* Return the number of multiplications required to calculate
917 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
918 subroutine of powi_cost. CACHE is an array indicating
919 which exponents have already been calculated. */
922 powi_lookup_cost (unsigned HOST_WIDE_INT n
, bool *cache
)
924 /* If we've already calculated this exponent, then this evaluation
925 doesn't require any additional multiplications. */
930 return powi_lookup_cost (n
- powi_table
[n
], cache
)
931 + powi_lookup_cost (powi_table
[n
], cache
) + 1;
934 /* Return the number of multiplications required to calculate
935 powi(x,n) for an arbitrary x, given the exponent N. This
936 function needs to be kept in sync with powi_as_mults below. */
939 powi_cost (HOST_WIDE_INT n
)
941 bool cache
[POWI_TABLE_SIZE
];
942 unsigned HOST_WIDE_INT digit
;
943 unsigned HOST_WIDE_INT val
;
949 /* Ignore the reciprocal when calculating the cost. */
950 val
= (n
< 0) ? -n
: n
;
952 /* Initialize the exponent cache. */
953 memset (cache
, 0, POWI_TABLE_SIZE
* sizeof (bool));
958 while (val
>= POWI_TABLE_SIZE
)
962 digit
= val
& ((1 << POWI_WINDOW_SIZE
) - 1);
963 result
+= powi_lookup_cost (digit
, cache
)
964 + POWI_WINDOW_SIZE
+ 1;
965 val
>>= POWI_WINDOW_SIZE
;
974 return result
+ powi_lookup_cost (val
, cache
);
977 /* Recursive subroutine of powi_as_mults. This function takes the
978 array, CACHE, of already calculated exponents and an exponent N and
979 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
982 powi_as_mults_1 (gimple_stmt_iterator
*gsi
, location_t loc
, tree type
,
983 HOST_WIDE_INT n
, tree
*cache
)
985 tree op0
, op1
, ssa_target
;
986 unsigned HOST_WIDE_INT digit
;
989 if (n
< POWI_TABLE_SIZE
&& cache
[n
])
992 ssa_target
= make_temp_ssa_name (type
, NULL
, "powmult");
994 if (n
< POWI_TABLE_SIZE
)
996 cache
[n
] = ssa_target
;
997 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
- powi_table
[n
], cache
);
998 op1
= powi_as_mults_1 (gsi
, loc
, type
, powi_table
[n
], cache
);
1002 digit
= n
& ((1 << POWI_WINDOW_SIZE
) - 1);
1003 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
- digit
, cache
);
1004 op1
= powi_as_mults_1 (gsi
, loc
, type
, digit
, cache
);
1008 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
>> 1, cache
);
1012 mult_stmt
= gimple_build_assign_with_ops (MULT_EXPR
, ssa_target
, op0
, op1
);
1013 gimple_set_location (mult_stmt
, loc
);
1014 gsi_insert_before (gsi
, mult_stmt
, GSI_SAME_STMT
);
1019 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1020 This function needs to be kept in sync with powi_cost above. */
1023 powi_as_mults (gimple_stmt_iterator
*gsi
, location_t loc
,
1024 tree arg0
, HOST_WIDE_INT n
)
1026 tree cache
[POWI_TABLE_SIZE
], result
, type
= TREE_TYPE (arg0
);
1031 return build_real (type
, dconst1
);
1033 memset (cache
, 0, sizeof (cache
));
1036 result
= powi_as_mults_1 (gsi
, loc
, type
, (n
< 0) ? -n
: n
, cache
);
1040 /* If the original exponent was negative, reciprocate the result. */
1041 target
= make_temp_ssa_name (type
, NULL
, "powmult");
1042 div_stmt
= gimple_build_assign_with_ops (RDIV_EXPR
, target
,
1043 build_real (type
, dconst1
),
1045 gimple_set_location (div_stmt
, loc
);
1046 gsi_insert_before (gsi
, div_stmt
, GSI_SAME_STMT
);
1051 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1052 location info LOC. If the arguments are appropriate, create an
1053 equivalent sequence of statements prior to GSI using an optimal
number of multiplications, and return an expression holding the
result.  */
1058 gimple_expand_builtin_powi (gimple_stmt_iterator
*gsi
, location_t loc
,
1059 tree arg0
, HOST_WIDE_INT n
)
1061 /* Avoid largest negative number. */
1063 && ((n
>= -1 && n
<= 2)
1064 || (optimize_function_for_speed_p (cfun
)
1065 && powi_cost (n
) <= POWI_MAX_MULTS
)))
1066 return powi_as_mults (gsi
, loc
, arg0
, n
);
1071 /* Build a gimple call statement that calls FN with argument ARG.
1072 Set the lhs of the call statement to a fresh SSA name. Insert the
statement prior to GSI's current position, and return the fresh
SSA name.  */
1077 build_and_insert_call (gimple_stmt_iterator
*gsi
, location_t loc
,
1083 call_stmt
= gimple_build_call (fn
, 1, arg
);
1084 ssa_target
= make_temp_ssa_name (TREE_TYPE (arg
), NULL
, "powroot");
1085 gimple_set_lhs (call_stmt
, ssa_target
);
1086 gimple_set_location (call_stmt
, loc
);
1087 gsi_insert_before (gsi
, call_stmt
, GSI_SAME_STMT
);
1092 /* Build a gimple binary operation with the given CODE and arguments
1093 ARG0, ARG1, assigning the result to a new SSA name for variable
1094 TARGET. Insert the statement prior to GSI's current position, and
1095 return the fresh SSA name.*/
1098 build_and_insert_binop (gimple_stmt_iterator
*gsi
, location_t loc
,
1099 const char *name
, enum tree_code code
,
1100 tree arg0
, tree arg1
)
1102 tree result
= make_temp_ssa_name (TREE_TYPE (arg0
), NULL
, name
);
1103 gassign
*stmt
= gimple_build_assign_with_ops (code
, result
, arg0
, arg1
);
1104 gimple_set_location (stmt
, loc
);
1105 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
1109 /* Build a gimple reference operation with the given CODE and argument
1110 ARG, assigning the result to a new SSA name of TYPE with NAME.
1111 Insert the statement prior to GSI's current position, and return
1112 the fresh SSA name. */
1115 build_and_insert_ref (gimple_stmt_iterator
*gsi
, location_t loc
, tree type
,
1116 const char *name
, enum tree_code code
, tree arg0
)
1118 tree result
= make_temp_ssa_name (type
, NULL
, name
);
1119 gimple stmt
= gimple_build_assign (result
, build1 (code
, type
, arg0
));
1120 gimple_set_location (stmt
, loc
);
1121 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
1125 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1126 prior to GSI's current position, and return the fresh SSA name. */
1129 build_and_insert_cast (gimple_stmt_iterator
*gsi
, location_t loc
,
1130 tree type
, tree val
)
1132 tree result
= make_ssa_name (type
, NULL
);
1134 gimple_build_assign_with_ops (NOP_EXPR
, result
, val
, NULL_TREE
);
1135 gimple_set_location (stmt
, loc
);
1136 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
1140 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1141 with location info LOC. If possible, create an equivalent and
1142 less expensive sequence of statements prior to GSI, and return an
expression holding the result.  */
1146 gimple_expand_builtin_pow (gimple_stmt_iterator
*gsi
, location_t loc
,
1147 tree arg0
, tree arg1
)
1149 REAL_VALUE_TYPE c
, cint
, dconst1_4
, dconst3_4
, dconst1_3
, dconst1_6
;
1150 REAL_VALUE_TYPE c2
, dconst3
;
1152 tree type
, sqrtfn
, cbrtfn
, sqrt_arg0
, sqrt_sqrt
, result
, cbrt_x
, powi_cbrt_x
;
1154 bool hw_sqrt_exists
, c_is_int
, c2_is_int
;
/* If the exponent isn't a constant, there's nothing of interest
   to be done.  */
1158 if (TREE_CODE (arg1
) != REAL_CST
)
1161 /* If the exponent is equivalent to an integer, expand to an optimal
1162 multiplication sequence when profitable. */
1163 c
= TREE_REAL_CST (arg1
);
1164 n
= real_to_integer (&c
);
1165 real_from_integer (&cint
, VOIDmode
, n
, SIGNED
);
1166 c_is_int
= real_identical (&c
, &cint
);
1169 && ((n
>= -1 && n
<= 2)
1170 || (flag_unsafe_math_optimizations
1171 && optimize_bb_for_speed_p (gsi_bb (*gsi
))
1172 && powi_cost (n
) <= POWI_MAX_MULTS
)))
1173 return gimple_expand_builtin_powi (gsi
, loc
, arg0
, n
);
1175 /* Attempt various optimizations using sqrt and cbrt. */
1176 type
= TREE_TYPE (arg0
);
1177 mode
= TYPE_MODE (type
);
1178 sqrtfn
= mathfn_built_in (type
, BUILT_IN_SQRT
);
1180 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
sqrt(-0) = -0.  */
1184 && REAL_VALUES_EQUAL (c
, dconsthalf
)
1185 && !HONOR_SIGNED_ZEROS (mode
))
1186 return build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1188 /* Optimize pow(x,0.25) = sqrt(sqrt(x)). Assume on most machines that
1189 a builtin sqrt instruction is smaller than a call to pow with 0.25,
1190 so do this optimization even if -Os. Don't do this optimization
1191 if we don't have a hardware sqrt insn. */
1192 dconst1_4
= dconst1
;
1193 SET_REAL_EXP (&dconst1_4
, REAL_EXP (&dconst1_4
) - 2);
1194 hw_sqrt_exists
= optab_handler (sqrt_optab
, mode
) != CODE_FOR_nothing
;
1196 if (flag_unsafe_math_optimizations
1198 && REAL_VALUES_EQUAL (c
, dconst1_4
)
1202 sqrt_arg0
= build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1205 return build_and_insert_call (gsi
, loc
, sqrtfn
, sqrt_arg0
);
1208 /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1209 optimizing for space. Don't do this optimization if we don't have
1210 a hardware sqrt insn. */
1211 real_from_integer (&dconst3_4
, VOIDmode
, 3, SIGNED
);
1212 SET_REAL_EXP (&dconst3_4
, REAL_EXP (&dconst3_4
) - 2);
1214 if (flag_unsafe_math_optimizations
1216 && optimize_function_for_speed_p (cfun
)
1217 && REAL_VALUES_EQUAL (c
, dconst3_4
)
1221 sqrt_arg0
= build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1224 sqrt_sqrt
= build_and_insert_call (gsi
, loc
, sqrtfn
, sqrt_arg0
);
1226 /* sqrt(x) * sqrt(sqrt(x)) */
1227 return build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1228 sqrt_arg0
, sqrt_sqrt
);
1231 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1232 optimizations since 1./3. is not exactly representable. If x
1233 is negative and finite, the correct value of pow(x,1./3.) is
1234 a NaN with the "invalid" exception raised, because the value
1235 of 1./3. actually has an even denominator. The correct value
1236 of cbrt(x) is a negative real value. */
1237 cbrtfn
= mathfn_built_in (type
, BUILT_IN_CBRT
);
1238 dconst1_3
= real_value_truncate (mode
, dconst_third ());
1240 if (flag_unsafe_math_optimizations
1242 && (gimple_val_nonnegative_real_p (arg0
) || !HONOR_NANS (mode
))
1243 && REAL_VALUES_EQUAL (c
, dconst1_3
))
1244 return build_and_insert_call (gsi
, loc
, cbrtfn
, arg0
);
1246 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1247 if we don't have a hardware sqrt insn. */
1248 dconst1_6
= dconst1_3
;
1249 SET_REAL_EXP (&dconst1_6
, REAL_EXP (&dconst1_6
) - 1);
1251 if (flag_unsafe_math_optimizations
1254 && (gimple_val_nonnegative_real_p (arg0
) || !HONOR_NANS (mode
))
1255 && optimize_function_for_speed_p (cfun
)
1257 && REAL_VALUES_EQUAL (c
, dconst1_6
))
1260 sqrt_arg0
= build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1263 return build_and_insert_call (gsi
, loc
, cbrtfn
, sqrt_arg0
);
1266 /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1267 and c not an integer, into
1269 sqrt(x) * powi(x, n/2), n > 0;
1270 1.0 / (sqrt(x) * powi(x, abs(n/2))), n < 0.
Do not calculate the powi factor when n/2 = 0.  */
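/* Worked example (editorial): pow (x, 2.5) has n = 5 and n/2 = 2, so it
   becomes sqrt (x) * x * x; pow (x, -2.5) becomes
   1.0 / (sqrt (x) * x * x).  */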
1273 real_arithmetic (&c2
, MULT_EXPR
, &c
, &dconst2
);
1274 n
= real_to_integer (&c2
);
1275 real_from_integer (&cint
, VOIDmode
, n
, SIGNED
);
1276 c2_is_int
= real_identical (&c2
, &cint
);
1278 if (flag_unsafe_math_optimizations
1282 && optimize_function_for_speed_p (cfun
))
1284 tree powi_x_ndiv2
= NULL_TREE
;
1286 /* Attempt to fold powi(arg0, abs(n/2)) into multiplies. If not
1287 possible or profitable, give up. Skip the degenerate case when
1288 n is 1 or -1, where the result is always 1. */
1289 if (absu_hwi (n
) != 1)
1291 powi_x_ndiv2
= gimple_expand_builtin_powi (gsi
, loc
, arg0
,
1297 /* Calculate sqrt(x). When n is not 1 or -1, multiply it by the
1298 result of the optimal multiply sequence just calculated. */
1299 sqrt_arg0
= build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1301 if (absu_hwi (n
) == 1)
1304 result
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1305 sqrt_arg0
, powi_x_ndiv2
);
1307 /* If n is negative, reciprocate the result. */
1309 result
= build_and_insert_binop (gsi
, loc
, "powroot", RDIV_EXPR
,
1310 build_real (type
, dconst1
), result
);
1314 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1316 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1317 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1319 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1320 different from pow(x, 1./3.) due to rounding and behavior with
1321 negative x, we need to constrain this transformation to unsafe
math and positive x or finite math.  */
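/* Worked example (editorial): pow (x, 5./3.) has n = 5, n/3 = 1 and
   n%3 = 2, so it becomes x * (cbrt (x) * cbrt (x)); for the negative
   exponent the result is additionally reciprocated.  */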
1323 real_from_integer (&dconst3
, VOIDmode
, 3, SIGNED
);
1324 real_arithmetic (&c2
, MULT_EXPR
, &c
, &dconst3
);
1325 real_round (&c2
, mode
, &c2
);
1326 n
= real_to_integer (&c2
);
1327 real_from_integer (&cint
, VOIDmode
, n
, SIGNED
);
1328 real_arithmetic (&c2
, RDIV_EXPR
, &cint
, &dconst3
);
1329 real_convert (&c2
, mode
, &c2
);
1331 if (flag_unsafe_math_optimizations
1333 && (gimple_val_nonnegative_real_p (arg0
) || !HONOR_NANS (mode
))
1334 && real_identical (&c2
, &c
)
1336 && optimize_function_for_speed_p (cfun
)
1337 && powi_cost (n
/ 3) <= POWI_MAX_MULTS
)
1339 tree powi_x_ndiv3
= NULL_TREE
;
1341 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1342 possible or profitable, give up. Skip the degenerate case when
1343 abs(n) < 3, where the result is always 1. */
1344 if (absu_hwi (n
) >= 3)
1346 powi_x_ndiv3
= gimple_expand_builtin_powi (gsi
, loc
, arg0
,
1352 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1353 as that creates an unnecessary variable. Instead, just produce
1354 either cbrt(x) or cbrt(x) * cbrt(x). */
1355 cbrt_x
= build_and_insert_call (gsi
, loc
, cbrtfn
, arg0
);
1357 if (absu_hwi (n
) % 3 == 1)
1358 powi_cbrt_x
= cbrt_x
;
1360 powi_cbrt_x
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1363 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1364 if (absu_hwi (n
) < 3)
1365 result
= powi_cbrt_x
;
1367 result
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1368 powi_x_ndiv3
, powi_cbrt_x
);
1370 /* If n is negative, reciprocate the result. */
1372 result
= build_and_insert_binop (gsi
, loc
, "powroot", RDIV_EXPR
,
1373 build_real (type
, dconst1
), result
);
1378 /* No optimizations succeeded. */
1382 /* ARG is the argument to a cabs builtin call in GSI with location info
1383 LOC. Create a sequence of statements prior to GSI that calculates
1384 sqrt(R*R + I*I), where R and I are the real and imaginary components
1385 of ARG, respectively. Return an expression holding the result. */
1388 gimple_expand_builtin_cabs (gimple_stmt_iterator
*gsi
, location_t loc
, tree arg
)
1390 tree real_part
, imag_part
, addend1
, addend2
, sum
, result
;
1391 tree type
= TREE_TYPE (TREE_TYPE (arg
));
1392 tree sqrtfn
= mathfn_built_in (type
, BUILT_IN_SQRT
);
1393 machine_mode mode
= TYPE_MODE (type
);
1395 if (!flag_unsafe_math_optimizations
1396 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi
)))
1398 || optab_handler (sqrt_optab
, mode
) == CODE_FOR_nothing
)
1401 real_part
= build_and_insert_ref (gsi
, loc
, type
, "cabs",
1402 REALPART_EXPR
, arg
);
1403 addend1
= build_and_insert_binop (gsi
, loc
, "cabs", MULT_EXPR
,
1404 real_part
, real_part
);
1405 imag_part
= build_and_insert_ref (gsi
, loc
, type
, "cabs",
1406 IMAGPART_EXPR
, arg
);
1407 addend2
= build_and_insert_binop (gsi
, loc
, "cabs", MULT_EXPR
,
1408 imag_part
, imag_part
);
1409 sum
= build_and_insert_binop (gsi
, loc
, "cabs", PLUS_EXPR
, addend1
, addend2
);
1410 result
= build_and_insert_call (gsi
, loc
, sqrtfn
, sum
);
1415 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1416 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1417 an optimal number of multiplies, when n is a constant. */
1421 const pass_data pass_data_cse_sincos
=
1423 GIMPLE_PASS
, /* type */
1424 "sincos", /* name */
1425 OPTGROUP_NONE
, /* optinfo_flags */
1426 TV_NONE
, /* tv_id */
1427 PROP_ssa
, /* properties_required */
1428 0, /* properties_provided */
1429 0, /* properties_destroyed */
1430 0, /* todo_flags_start */
1431 TODO_update_ssa
, /* todo_flags_finish */
1434 class pass_cse_sincos
: public gimple_opt_pass
1437 pass_cse_sincos (gcc::context
*ctxt
)
1438 : gimple_opt_pass (pass_data_cse_sincos
, ctxt
)
1441 /* opt_pass methods: */
1442 virtual bool gate (function
*)
1444 /* We no longer require either sincos or cexp, since powi expansion
1445 piggybacks on this pass. */
1449 virtual unsigned int execute (function
*);
1451 }; // class pass_cse_sincos
1454 pass_cse_sincos::execute (function
*fun
)
1457 bool cfg_changed
= false;
1459 calculate_dominance_info (CDI_DOMINATORS
);
1460 memset (&sincos_stats
, 0, sizeof (sincos_stats
));
1462 FOR_EACH_BB_FN (bb
, fun
)
1464 gimple_stmt_iterator gsi
;
1465 bool cleanup_eh
= false;
1467 for (gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
1469 gimple stmt
= gsi_stmt (gsi
);
1472 /* Only the last stmt in a bb could throw, no need to call
1473 gimple_purge_dead_eh_edges if we change something in the middle
1474 of a basic block. */
1477 if (is_gimple_call (stmt
)
1478 && gimple_call_lhs (stmt
)
1479 && (fndecl
= gimple_call_fndecl (stmt
))
1480 && DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_NORMAL
)
1482 tree arg
, arg0
, arg1
, result
;
1486 switch (DECL_FUNCTION_CODE (fndecl
))
1488 CASE_FLT_FN (BUILT_IN_COS
):
1489 CASE_FLT_FN (BUILT_IN_SIN
):
1490 CASE_FLT_FN (BUILT_IN_CEXPI
):
1491 /* Make sure we have either sincos or cexp. */
1492 if (!targetm
.libc_has_function (function_c99_math_complex
)
1493 && !targetm
.libc_has_function (function_sincos
))
1496 arg
= gimple_call_arg (stmt
, 0);
1497 if (TREE_CODE (arg
) == SSA_NAME
)
1498 cfg_changed
|= execute_cse_sincos_1 (arg
);
1501 CASE_FLT_FN (BUILT_IN_POW
):
1502 arg0
= gimple_call_arg (stmt
, 0);
1503 arg1
= gimple_call_arg (stmt
, 1);
1505 loc
= gimple_location (stmt
);
1506 result
= gimple_expand_builtin_pow (&gsi
, loc
, arg0
, arg1
);
1510 tree lhs
= gimple_get_lhs (stmt
);
1512 gimple_build_assign (lhs
, result
);
1513 gimple_set_location (new_stmt
, loc
);
1514 unlink_stmt_vdef (stmt
);
1515 gsi_replace (&gsi
, new_stmt
, true);
1517 if (gimple_vdef (stmt
))
1518 release_ssa_name (gimple_vdef (stmt
));
1522 CASE_FLT_FN (BUILT_IN_POWI
):
1523 arg0
= gimple_call_arg (stmt
, 0);
1524 arg1
= gimple_call_arg (stmt
, 1);
1525 loc
= gimple_location (stmt
);
1527 if (real_minus_onep (arg0
))
1529 tree t0
, t1
, cond
, one
, minus_one
;
1532 t0
= TREE_TYPE (arg0
);
1533 t1
= TREE_TYPE (arg1
);
1534 one
= build_real (t0
, dconst1
);
1535 minus_one
= build_real (t0
, dconstm1
);
1537 cond
= make_temp_ssa_name (t1
, NULL
, "powi_cond");
1538 stmt
= gimple_build_assign_with_ops (BIT_AND_EXPR
, cond
,
1542 gimple_set_location (stmt
, loc
);
1543 gsi_insert_before (&gsi
, stmt
, GSI_SAME_STMT
);
1545 result
= make_temp_ssa_name (t0
, NULL
, "powi");
1546 stmt
= gimple_build_assign_with_ops (COND_EXPR
, result
,
1549 gimple_set_location (stmt
, loc
);
1550 gsi_insert_before (&gsi
, stmt
, GSI_SAME_STMT
);
1554 if (!tree_fits_shwi_p (arg1
))
1557 n
= tree_to_shwi (arg1
);
1558 result
= gimple_expand_builtin_powi (&gsi
, loc
, arg0
, n
);
1563 tree lhs
= gimple_get_lhs (stmt
);
1564 gassign
*new_stmt
= gimple_build_assign (lhs
, result
);
1565 gimple_set_location (new_stmt
, loc
);
1566 unlink_stmt_vdef (stmt
);
1567 gsi_replace (&gsi
, new_stmt
, true);
1569 if (gimple_vdef (stmt
))
1570 release_ssa_name (gimple_vdef (stmt
));
1574 CASE_FLT_FN (BUILT_IN_CABS
):
1575 arg0
= gimple_call_arg (stmt
, 0);
1576 loc
= gimple_location (stmt
);
1577 result
= gimple_expand_builtin_cabs (&gsi
, loc
, arg0
);
1581 tree lhs
= gimple_get_lhs (stmt
);
1582 gassign
*new_stmt
= gimple_build_assign (lhs
, result
);
1583 gimple_set_location (new_stmt
, loc
);
1584 unlink_stmt_vdef (stmt
);
1585 gsi_replace (&gsi
, new_stmt
, true);
1587 if (gimple_vdef (stmt
))
1588 release_ssa_name (gimple_vdef (stmt
));
1597 cfg_changed
|= gimple_purge_dead_eh_edges (bb
);
1600 statistics_counter_event (fun
, "sincos statements inserted",
1601 sincos_stats
.inserted
);
1603 free_dominance_info (CDI_DOMINATORS
);
1604 return cfg_changed
? TODO_cleanup_cfg
: 0;
1610 make_pass_cse_sincos (gcc::context
*ctxt
)
1612 return new pass_cse_sincos (ctxt
);
/* A symbolic number is used to detect byte permutation and selection
   patterns.  Therefore the field N contains an artificial number
   consisting of octet sized markers:

     0       - target byte has the value 0
     FF      - target byte has an unknown value (e.g. due to sign extension)
     1..size - the marker value minus one is the index (LSB = 0) of the
               source byte that provides the value of this target byte.

   To detect permutations on memory sources (arrays and structures), a
   symbolic number is also associated with a base address (the array or
   structure the load is made from), an offset from the base address and a
   range which gives the difference between the highest and lowest accessed
   memory location to make such a symbolic number.  The range is thus
   different from size, which reflects the size of the type of the current
   expression.  Note that for a non-memory source, range holds the same
   value as size.

   For instance, for an array char a[], (short) a[0] | (short) a[3] would
   have a size of 2 but a range of 4, while (short) a[0] | ((short) a[0] << 1)
   would still have a size of 2 but this time a range of 1.  */
1635 struct symbolic_number
{
1640 HOST_WIDE_INT bytepos
;
1643 unsigned HOST_WIDE_INT range
;
1646 #define BITS_PER_MARKER 8
1647 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1648 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1649 #define HEAD_MARKER(n, size) \
1650 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1652 /* The number which the find_bswap_or_nop_1 result should match in
1653 order to have a nop. The number is masked according to the size of
1654 the symbolic number before using it. */
1655 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1656 (uint64_t)0x08070605 << 32 | 0x04030201)
1658 /* The number which the find_bswap_or_nop_1 result should match in
1659 order to have a byte swap. The number is masked according to the
1660 size of the symbolic number before using it. */
1661 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1662 (uint64_t)0x01020304 << 32 | 0x05060708)
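
/* Worked example (editorial): for a 32-bit value the masked constants
   are CMPNOP = 0x04030201 and CMPXCHG = 0x01020304.  The hand-written
   byte swap

     (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24)

   yields the symbolic number 0x01020304 (each target byte sourced from
   the opposite end of the value), matching CMPXCHG, whereas an
   expression that merely reassembles the bytes in their original order
   yields 0x04030201, matching CMPNOP.  */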
1664 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1665 number N. Return false if the requested operation is not permitted
1666 on a symbolic number. */
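/* Worked example (editorial): for a 4-byte symbolic number initialized
   to 0x04030201, an LROTATE_EXPR by 8 bits becomes a rotation by one
   marker position and yields 0x03020104, while a plain left shift by
   8 bits yields 0x03020100, the vacated low byte becoming the
   "value 0" marker.  */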
1669 do_shift_rotate (enum tree_code code
,
1670 struct symbolic_number
*n
,
1673 int i
, size
= TYPE_PRECISION (n
->type
) / BITS_PER_UNIT
;
1674 unsigned head_marker
;
1676 if (count
% BITS_PER_UNIT
!= 0)
1678 count
= (count
/ BITS_PER_UNIT
) * BITS_PER_MARKER
;
1680 /* Zero out the extra bits of N in order to avoid them being shifted
1681 into the significant bits. */
1682 if (size
< 64 / BITS_PER_MARKER
)
1683 n
->n
&= ((uint64_t) 1 << (size
* BITS_PER_MARKER
)) - 1;
1691 head_marker
= HEAD_MARKER (n
->n
, size
);
1693 /* Arithmetic shift of signed type: result is dependent on the value. */
1694 if (!TYPE_UNSIGNED (n
->type
) && head_marker
)
1695 for (i
= 0; i
< count
/ BITS_PER_MARKER
; i
++)
1696 n
->n
|= (uint64_t) MARKER_BYTE_UNKNOWN
1697 << ((size
- 1 - i
) * BITS_PER_MARKER
);
1700 n
->n
= (n
->n
<< count
) | (n
->n
>> ((size
* BITS_PER_MARKER
) - count
));
1703 n
->n
= (n
->n
>> count
) | (n
->n
<< ((size
* BITS_PER_MARKER
) - count
));
1708 /* Zero unused bits for size. */
1709 if (size
< 64 / BITS_PER_MARKER
)
1710 n
->n
&= ((uint64_t) 1 << (size
* BITS_PER_MARKER
)) - 1;
/* Perform sanity checking for the symbolic number N and the gimple
   statement STMT.  */
1718 verify_symbolic_number_p (struct symbolic_number
*n
, gimple stmt
)
1722 lhs_type
= gimple_expr_type (stmt
);
1724 if (TREE_CODE (lhs_type
) != INTEGER_TYPE
)
1727 if (TYPE_PRECISION (lhs_type
) != TYPE_PRECISION (n
->type
))
1733 /* Initialize the symbolic number N for the bswap pass from the base element
1734 SRC manipulated by the bitwise OR expression. */
1737 init_symbolic_number (struct symbolic_number
*n
, tree src
)
1741 n
->base_addr
= n
->offset
= n
->alias_set
= n
->vuse
= NULL_TREE
;
1743 /* Set up the symbolic number N by setting each byte to a value between 1 and
1744 the byte size of rhs1. The highest order byte is set to n->size and the
1745 lowest order byte to 1. */
1746 n
->type
= TREE_TYPE (src
);
1747 size
= TYPE_PRECISION (n
->type
);
1748 if (size
% BITS_PER_UNIT
!= 0)
1750 size
/= BITS_PER_UNIT
;
1751 if (size
> 64 / BITS_PER_MARKER
)
1756 if (size
< 64 / BITS_PER_MARKER
)
1757 n
->n
&= ((uint64_t) 1 << (size
* BITS_PER_MARKER
)) - 1;
/* Check if STMT might be a byte swap or a nop from a memory source and
   return the answer.  If so, REF is that memory source and the base of
   the memory area accessed and the offset of the access from that base
   are recorded in N.  */
1767 find_bswap_or_nop_load (gimple stmt
, tree ref
, struct symbolic_number
*n
)
1769 /* Leaf node is an array or component ref. Memorize its base and
1770 offset from base to compare to other such leaf node. */
1771 HOST_WIDE_INT bitsize
, bitpos
;
1773 int unsignedp
, volatilep
;
1774 tree offset
, base_addr
;
1776 if (!gimple_assign_load_p (stmt
) || gimple_has_volatile_ops (stmt
))
1779 base_addr
= get_inner_reference (ref
, &bitsize
, &bitpos
, &offset
, &mode
,
1780 &unsignedp
, &volatilep
, false);
1782 if (TREE_CODE (base_addr
) == MEM_REF
)
1784 offset_int bit_offset
= 0;
1785 tree off
= TREE_OPERAND (base_addr
, 1);
1787 if (!integer_zerop (off
))
1789 offset_int boff
, coff
= mem_ref_offset (base_addr
);
1790 boff
= wi::lshift (coff
, LOG2_BITS_PER_UNIT
);
1794 base_addr
= TREE_OPERAND (base_addr
, 0);
1796 /* Avoid returning a negative bitpos as this may wreak havoc later. */
1797 if (wi::neg_p (bit_offset
))
1799 offset_int mask
= wi::mask
<offset_int
> (LOG2_BITS_PER_UNIT
, false);
1800 offset_int tem
= bit_offset
.and_not (mask
);
1801 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
Subtract it from BIT_OFFSET and add it (scaled) to OFFSET.  */
1804 tem
= wi::arshift (tem
, LOG2_BITS_PER_UNIT
);
1806 offset
= size_binop (PLUS_EXPR
, offset
,
1807 wide_int_to_tree (sizetype
, tem
));
1809 offset
= wide_int_to_tree (sizetype
, tem
);
1812 bitpos
+= bit_offset
.to_shwi ();
1815 if (bitpos
% BITS_PER_UNIT
)
1817 if (bitsize
% BITS_PER_UNIT
)
1820 if (!init_symbolic_number (n
, ref
))
1822 n
->base_addr
= base_addr
;
1824 n
->bytepos
= bitpos
/ BITS_PER_UNIT
;
1825 n
->alias_set
= reference_alias_ptr_type (ref
);
1826 n
->vuse
= gimple_vuse (stmt
);
1830 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1831 the operation given by the rhs of STMT on the result. If the operation
1832 could successfully be executed the function returns a gimple stmt whose
rhs's first tree is the expression of the source operand, and NULL
otherwise.  */
1837 find_bswap_or_nop_1 (gimple stmt
, struct symbolic_number
*n
, int limit
)
1839 enum tree_code code
;
1840 tree rhs1
, rhs2
= NULL
;
1841 gimple rhs1_stmt
, rhs2_stmt
, source_stmt1
;
1842 enum gimple_rhs_class rhs_class
;
1844 if (!limit
|| !is_gimple_assign (stmt
))
1847 rhs1
= gimple_assign_rhs1 (stmt
);
1849 if (find_bswap_or_nop_load (stmt
, rhs1
, n
))
1852 if (TREE_CODE (rhs1
) != SSA_NAME
)
1855 code
= gimple_assign_rhs_code (stmt
);
1856 rhs_class
= gimple_assign_rhs_class (stmt
);
1857 rhs1_stmt
= SSA_NAME_DEF_STMT (rhs1
);
1859 if (rhs_class
== GIMPLE_BINARY_RHS
)
1860 rhs2
= gimple_assign_rhs2 (stmt
);
/* Handle unary rhs and binary rhs with integer constants as second
   operand.  */
1865 if (rhs_class
== GIMPLE_UNARY_RHS
1866 || (rhs_class
== GIMPLE_BINARY_RHS
1867 && TREE_CODE (rhs2
) == INTEGER_CST
))
1869 if (code
!= BIT_AND_EXPR
1870 && code
!= LSHIFT_EXPR
1871 && code
!= RSHIFT_EXPR
1872 && code
!= LROTATE_EXPR
1873 && code
!= RROTATE_EXPR
1874 && !CONVERT_EXPR_CODE_P (code
))
1877 source_stmt1
= find_bswap_or_nop_1 (rhs1_stmt
, n
, limit
- 1);
1879 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
1880 we have to initialize the symbolic number. */
1883 if (gimple_assign_load_p (stmt
)
1884 || !init_symbolic_number (n
, rhs1
))
1886 source_stmt1
= stmt
;
1893 int i
, size
= TYPE_PRECISION (n
->type
) / BITS_PER_UNIT
;
1894 uint64_t val
= int_cst_value (rhs2
), mask
= 0;
1895 uint64_t tmp
= (1 << BITS_PER_UNIT
) - 1;
1897 /* Only constants masking full bytes are allowed. */
1898 for (i
= 0; i
< size
; i
++, tmp
<<= BITS_PER_UNIT
)
1899 if ((val
& tmp
) != 0 && (val
& tmp
) != tmp
)
1902 mask
|= (uint64_t) MARKER_MASK
<< (i
* BITS_PER_MARKER
);
1911 if (!do_shift_rotate (code
, n
, (int)TREE_INT_CST_LOW (rhs2
)))
1916 int i
, type_size
, old_type_size
;
1919 type
= gimple_expr_type (stmt
);
1920 type_size
= TYPE_PRECISION (type
);
1921 if (type_size
% BITS_PER_UNIT
!= 0)
1923 type_size
/= BITS_PER_UNIT
;
1924 if (type_size
> 64 / BITS_PER_MARKER
)
1927 /* Sign extension: result is dependent on the value. */
1928 old_type_size
= TYPE_PRECISION (n
->type
) / BITS_PER_UNIT
;
1929 if (!TYPE_UNSIGNED (n
->type
) && type_size
> old_type_size
1930 && HEAD_MARKER (n
->n
, old_type_size
))
1931 for (i
= 0; i
< type_size
- old_type_size
; i
++)
1932 n
->n
|= (uint64_t) MARKER_BYTE_UNKNOWN
1933 << ((type_size
- 1 - i
) * BITS_PER_MARKER
);
1935 if (type_size
< 64 / BITS_PER_MARKER
)
1937 /* If STMT casts to a smaller type mask out the bits not
1938 belonging to the target type. */
1939 n
->n
&= ((uint64_t) 1 << (type_size
* BITS_PER_MARKER
)) - 1;
1943 n
->range
= type_size
;
1949 return verify_symbolic_number_p (n
, stmt
) ? source_stmt1
: NULL
;
1952 /* Handle binary rhs. */
1954 if (rhs_class
== GIMPLE_BINARY_RHS
)
1957 struct symbolic_number n1
, n2
;
1959 gimple source_stmt2
;
1961 if (code
!= BIT_IOR_EXPR
)
1964 if (TREE_CODE (rhs2
) != SSA_NAME
)
1967 rhs2_stmt
= SSA_NAME_DEF_STMT (rhs2
);
1972 source_stmt1
= find_bswap_or_nop_1 (rhs1_stmt
, &n1
, limit
- 1);
1977 source_stmt2
= find_bswap_or_nop_1 (rhs2_stmt
, &n2
, limit
- 1);
1982 if (TYPE_PRECISION (n1
.type
) != TYPE_PRECISION (n2
.type
))
1985 if (!n1
.vuse
!= !n2
.vuse
||
1986 (n1
.vuse
&& !operand_equal_p (n1
.vuse
, n2
.vuse
, 0)))
1989 if (gimple_assign_rhs1 (source_stmt1
)
1990 != gimple_assign_rhs1 (source_stmt2
))
1993 HOST_WIDE_INT off_sub
;
1994 struct symbolic_number
*n_ptr
;
1996 if (!n1
.base_addr
|| !n2
.base_addr
1997 || !operand_equal_p (n1
.base_addr
, n2
.base_addr
, 0))
1999 if (!n1
.offset
!= !n2
.offset
||
2000 (n1
.offset
&& !operand_equal_p (n1
.offset
, n2
.offset
, 0)))
2003 /* We swap n1 with n2 to have n1 < n2. */
2004 if (n2
.bytepos
< n1
.bytepos
)
2006 struct symbolic_number tmpn
;
2011 source_stmt1
= source_stmt2
;
2014 off_sub
= n2
.bytepos
- n1
.bytepos
;
2016 /* Check that the range of memory covered can be represented by
2017 a symbolic number. */
2018 if (off_sub
+ n2
.range
> 64 / BITS_PER_MARKER
)
2020 n
->range
= n2
.range
+ off_sub
;
2022 /* Reinterpret byte marks in symbolic number holding the value of
2023 bigger weight according to target endianness. */
2024 inc
= BYTES_BIG_ENDIAN
? off_sub
+ n2
.range
- n1
.range
: off_sub
;
2025 size
= TYPE_PRECISION (n1
.type
) / BITS_PER_UNIT
;
2026 if (BYTES_BIG_ENDIAN
)
2030 for (i
= 0; i
< size
; i
++, inc
<<= BITS_PER_MARKER
)
2033 (n_ptr
->n
>> (i
* BITS_PER_MARKER
)) & MARKER_MASK
;
2034 if (marker
&& marker
!= MARKER_BYTE_UNKNOWN
)
2039 n
->range
= n1
.range
;
2042 || alias_ptr_types_compatible_p (n1
.alias_set
, n2
.alias_set
))
2043 n
->alias_set
= n1
.alias_set
;
2045 n
->alias_set
= ptr_type_node
;
2047 n
->base_addr
= n1
.base_addr
;
2048 n
->offset
= n1
.offset
;
2049 n
->bytepos
= n1
.bytepos
;
2051 size
= TYPE_PRECISION (n
->type
) / BITS_PER_UNIT
;
2052 for (i
= 0, mask
= MARKER_MASK
; i
< size
;
2053 i
++, mask
<<= BITS_PER_MARKER
)
2055 uint64_t masked1
, masked2
;
2057 masked1
= n1
.n
& mask
;
2058 masked2
= n2
.n
& mask
;
2059 if (masked1
&& masked2
&& masked1
!= masked2
)
2064 if (!verify_symbolic_number_p (n
, stmt
))
2071 return source_stmt1
;
/* Check if STMT completes a bswap implementation or a read in a given
   endianness consisting of ORs, SHIFTs and ANDs, and sets *BSWAP
   accordingly.  It also sets N to represent the kind of operations
   performed: size of the resulting expression and whether it works on
   a memory source, and if so alias-set and vuse.  Finally, the
   function returns a stmt whose rhs's first tree is the source
   expression.  */

find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)

  /* The number which the find_bswap_or_nop_1 result should match in order
     to have a full byte swap.  The number is shifted to the right
     according to the size of the symbolic number before using it.  */
  uint64_t cmpxchg = CMPXCHG;
  uint64_t cmpnop = CMPNOP;

  /* The last parameter determines the depth search limit.  It usually
     correlates directly to the number n of bytes to be touched.  We
     increase that number by log2(n) + 1 here in order to also
     cover signed -> unsigned conversions of the src operand as can be seen
     in libgcc, and for initial shift/and operation of the src operand.  */
  limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
  limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
  source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
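/* Editor's note (worked example of the limit computation above, added for
   illustration): for an 8-byte result, TYPE_SIZE_UNIT is 8, so the search
   depth becomes limit = 8 + 1 + ceil_log2 (8) = 8 + 1 + 3 = 12 recursion
   steps, which leaves room for an extra conversion and an initial shift/and
   on top of the eight byte-sized pieces.  */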
  /* Find real size of result (highest non-zero byte).  */

      for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);

  /* Zero out the extra bits of N and CMP*.  */
  if (n->range < (int) sizeof (int64_t))

      mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
      cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;

  /* A complete byte swap should make the symbolic number start with
     the largest digit in the highest order byte.  An unchanged symbolic
     number indicates a read with the same endianness as the target
     architecture.  */
  if (n->n == cmpnop)
    *bswap = false;
  else if (n->n == cmpxchg)
    *bswap = true;
  else
    return NULL;

  /* Useless bit manipulation performed by code.  */
  if (!n->base_addr && n->n == cmpnop)
    return NULL;

  n->range *= BITS_PER_UNIT;
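/* Editor's illustrative sketch (not from the original sources): a typical
   source pattern that find_bswap_or_nop recognizes as a full byte swap,
   which bswap_replace below then turns into a __builtin_bswap32 call (or
   into a rotate for the 16-bit case):

     #include <stdint.h>

     uint32_t
     swap32 (uint32_t x)
     {
       return (x >> 24)
              | ((x >> 8) & 0x0000ff00)
              | ((x << 8) & 0x00ff0000)
              | (x << 24);
     }
*/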
const pass_data pass_data_optimize_bswap =
{
  GIMPLE_PASS, /* type */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
class pass_optimize_bswap : public gimple_opt_pass
{
public:
  pass_optimize_bswap (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
  {
    return flag_expensive_optimizations && optimize;
  }

  virtual unsigned int execute (function *);

}; // class pass_optimize_bswap
/* Perform the bswap optimization: replace the expression computed in the rhs
   of CUR_STMT by an equivalent bswap, load or load + bswap expression.
   Which of these alternatives replaces the rhs is given by N->base_addr
   (non null if a load is needed) and BSWAP.  The type, VUSE and alias-set of
   the load to perform are also given in N, while the builtin bswap to invoke
   is given in FNDECL.  Finally, if a load is involved, SRC_STMT refers to one
   of the load statements involved to construct the rhs in CUR_STMT and
   N->range gives the size of the rhs expression for maintaining some
   statistics.

   Note that if the replacement involves a load, CUR_STMT is moved just after
   SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
   being moved to a different basic block.  */
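/* Editor's note (illustrative, simplified gimple-like pseudo-dump; the
   temporary name load_dst matches the one created below, everything else
   is schematic): depending on N->base_addr and BSWAP the replacement takes
   roughly one of these shapes

     lhs = __builtin_bswap32 (src);                <- no load, bswap
     lhs = MEM <uint32_t> [addr];                  <- load only
     load_dst = MEM <uint32_t> [addr];             <- load + bswap
     lhs = __builtin_bswap32 (load_dst);

   with the 32-bit case standing in for the 16- and 64-bit variants.  */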
bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
               tree load_type, struct symbolic_number *n, bool bswap)

  gimple_stmt_iterator gsi;

  gsi = gsi_for_stmt (cur_stmt);
  src = gimple_assign_rhs1 (src_stmt);
  tgt = gimple_assign_lhs (cur_stmt);

  /* Need to load the value from memory first.  */

      gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
      tree addr_expr, addr_tmp, val_expr, val_tmp;
      tree load_offset_ptr, aligned_load_type;
      gimple addr_stmt, load_stmt;

      align = get_object_alignment (src);
          && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
          && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))

      /* Move cur_stmt just before one of the loads of the original
         expression to ensure it has the same VUSE.  See PR61517 for what
         could go wrong.  */
      gsi_move_before (&gsi, &gsi_ins);
      gsi = gsi_for_stmt (cur_stmt);

      /* Compute address to load from and cast according to the size
         of the load.  */
      addr_expr = build_fold_addr_expr (unshare_expr (src));
      if (is_gimple_min_invariant (addr_expr))
        addr_tmp = addr_expr;
      else
          addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
          addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
          gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);

      /* Perform the load.  */
      aligned_load_type = load_type;
      if (align < TYPE_ALIGN (load_type))
        aligned_load_type = build_aligned_type (load_type, align);
      load_offset_ptr = build_int_cst (n->alias_set, 0);
      val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,

          if (n->range == 16)
            nop_stats.found_16bit++;
          else if (n->range == 32)
            nop_stats.found_32bit++;
          else
            {
              gcc_assert (n->range == 64);
              nop_stats.found_64bit++;
            }

          /* Convert the result of load if necessary.  */
          if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))

              val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
              load_stmt = gimple_build_assign (val_tmp, val_expr);
              gimple_set_vuse (load_stmt, n->vuse);
              gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
              gimple_assign_set_rhs_with_ops_1 (&gsi, NOP_EXPR, val_tmp,
                                                NULL_TREE, NULL_TREE);

              gimple_assign_set_rhs_with_ops_1 (&gsi, MEM_REF, val_expr,
                                                NULL_TREE, NULL_TREE);
              gimple_set_vuse (cur_stmt, n->vuse);

          update_stmt (cur_stmt);

                       "%d bit load in target endianness found at: ",
              print_gimple_stmt (dump_file, cur_stmt, 0, 0);

      val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
      load_stmt = gimple_build_assign (val_tmp, val_expr);
      gimple_set_vuse (load_stmt, n->vuse);
      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);

  if (n->range == 16)
    bswap_stats.found_16bit++;
  else if (n->range == 32)
    bswap_stats.found_32bit++;
  else
    {
      gcc_assert (n->range == 64);
      bswap_stats.found_64bit++;
    }

  /* Canonical form for a 16-bit bswap is a rotate expression.  Only 16-bit
     values are considered, as a rotation of a 2N-bit value by N bits is
     generally not equivalent to a bswap.  Consider for instance
     0x01020304 >> 16 which gives 0x03040102 while a bswap for that value
     is 0x04030201.  */
  if (bswap && n->range == 16)
    {
      tree count = build_int_cst (NULL, BITS_PER_UNIT);
      bswap_type = TREE_TYPE (src);
      src = fold_build2 (LROTATE_EXPR, bswap_type, src, count);
      bswap_stmt = gimple_build_assign (NULL, src);
    }
  else

      /* Convert the src expression if necessary.  */
      if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))

          gimple convert_stmt;
          tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
          convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tmp, src,
          gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);

      bswap_stmt = gimple_build_call (fndecl, 1, tmp);

  /* Convert the result if necessary.  */
  if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))

      gimple convert_stmt;
      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
      convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tgt, tmp, NULL);
      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);

  gimple_set_lhs (bswap_stmt, tmp);

      fprintf (dump_file, "%d bit bswap implementation found at: ",
      print_gimple_stmt (dump_file, cur_stmt, 0, 0);

  gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
  gsi_remove (&gsi, true);
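/* Editor's illustrative sketch (added commentary): for the 16-bit case the
   canonical replacement above is a rotate rather than a builtin call, e.g.

     #include <stdint.h>

     uint16_t
     swap16 (uint16_t x)
     {
       return (uint16_t) ((x >> 8) | (x << 8));
     }

   is rewritten to the equivalent of a left rotate of x by BITS_PER_UNIT
   (8) bits, which expresses the same permutation of the two bytes.  */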
/* Find manual byte swap implementations as well as loads in a given
   endianness.  Byte swaps are turned into a bswap builtin invocation,
   while endian loads are converted to a bswap builtin invocation or a
   simple load according to the target endianness.  */
pass_optimize_bswap::execute (function *fun)

  bool bswap16_p, bswap32_p, bswap64_p;
  bool changed = false;
  tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;

  if (BITS_PER_UNIT != 8)
    return 0;

  bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
               && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
  bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
               && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
  bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
               && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
                   || (bswap32_p && word_mode == SImode)));

  /* Determine the argument type of the builtins.  The code later on
     assumes that the return and argument type are the same.  */

      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
      bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));

      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
      bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));

      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
      bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));

  memset (&nop_stats, 0, sizeof (nop_stats));
  memset (&bswap_stats, 0, sizeof (bswap_stats));

  FOR_EACH_BB_FN (bb, fun)

      gimple_stmt_iterator gsi;

      /* We do a reverse scan for bswap patterns to make sure we get the
         widest match.  As bswap pattern matching doesn't handle previously
         inserted smaller bswap replacements as sub-patterns, the wider
         variant wouldn't be detected.  */
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)

          gimple src_stmt, cur_stmt = gsi_stmt (gsi);
          tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
          enum tree_code code;
          struct symbolic_number n;

          /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
             might be moved to a different basic block by bswap_replace and gsi
             must not point to it if that's the case.  Doing the gsi_prev
             there makes sure that gsi points to the statement previous to
             cur_stmt while still making sure that all statements are
             considered in this basic block.  */

          if (!is_gimple_assign (cur_stmt))
            continue;

          code = gimple_assign_rhs_code (cur_stmt);

              if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
                  || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))

          src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);

              /* Already in canonical form, nothing to do.  */
              if (code == LROTATE_EXPR || code == RROTATE_EXPR)
                continue;
              load_type = uint16_type_node;

                  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
                  bswap_type = bswap16_type;

              load_type = uint32_type_node;

                  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
                  bswap_type = bswap32_type;

              load_type = uint64_type_node;

                  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
                  bswap_type = bswap64_type;

          if (bswap && !fndecl)
            continue;

          if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,

  statistics_counter_event (fun, "16-bit nop implementations found",
                            nop_stats.found_16bit);
  statistics_counter_event (fun, "32-bit nop implementations found",
                            nop_stats.found_32bit);
  statistics_counter_event (fun, "64-bit nop implementations found",
                            nop_stats.found_64bit);
  statistics_counter_event (fun, "16-bit bswap implementations found",
                            bswap_stats.found_16bit);
  statistics_counter_event (fun, "32-bit bswap implementations found",
                            bswap_stats.found_32bit);
  statistics_counter_event (fun, "64-bit bswap implementations found",
                            bswap_stats.found_64bit);

  return (changed ? TODO_update_ssa : 0);
make_pass_optimize_bswap (gcc::context *ctxt)
{
  return new pass_optimize_bswap (ctxt);
}
/* Return true if stmt is a type conversion operation that can be stripped
   when used in a widening multiply operation.  */

widening_mult_conversion_strippable_p (tree result_type, gimple stmt)

  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);

  if (TREE_CODE (result_type) == INTEGER_TYPE)

      if (!CONVERT_EXPR_CODE_P (rhs_code))
        return false;

      op_type = TREE_TYPE (gimple_assign_lhs (stmt));

      /* If the type of OP has the same precision as the result, then
         we can strip this conversion.  The multiply operation will be
         selected to create the correct extension as a by-product.  */
      if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
        return true;

      /* We can also strip a conversion if it preserves the signed-ness of
         the operation and doesn't narrow the range.  */
      inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      /* If the inner-most type is unsigned, then we can strip any
         intermediate widening operation.  If it's signed, then the
         intermediate widening operation must also be signed.  */
      if ((TYPE_UNSIGNED (inner_op_type)
           || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
          && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
        return true;

  return rhs_code == FIXED_CONVERT_EXPR;
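/* Editor's illustrative sketch (not from the original sources; assumes a
   typical target where short is 16 bits and long is at least 32 bits): in

     long
     widening_product (short a, short b)
     {
       return (long) a * (long) b;
     }

   the two (long) conversions have the same precision as the result, so by
   the first test above they can be stripped and the multiplication can be
   treated as a widening multiply of the two short operands.  */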
/* Return true if RHS is a suitable operand for a widening multiplication,
   assuming a target type of TYPE.
   There are two cases:

     - RHS makes some value at least twice as wide.  Store that value
       in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.

     - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
       but leave *TYPE_OUT untouched.  */

is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,

  if (TREE_CODE (rhs) == SSA_NAME)

      stmt = SSA_NAME_DEF_STMT (rhs);
      if (is_gimple_assign (stmt))

          if (! widening_mult_conversion_strippable_p (type, stmt))

          rhs1 = gimple_assign_rhs1 (stmt);

          if (TREE_CODE (rhs1) == INTEGER_CST)

              *new_rhs_out = rhs1;

      type1 = TREE_TYPE (rhs1);

      if (TREE_CODE (type1) != TREE_CODE (type)
          || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
        return false;

      *new_rhs_out = rhs1;

  if (TREE_CODE (rhs) == INTEGER_CST)
/* Return true if STMT performs a widening multiplication, assuming the
   output type is TYPE.  If so, store the unwidened types of the operands
   in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
   *RHS2_OUT such that converting those operands to types *TYPE1_OUT
   and *TYPE2_OUT would give the operands of the multiplication.  */

is_widening_mult_p (gimple stmt,
                    tree *type1_out, tree *rhs1_out,
                    tree *type2_out, tree *rhs2_out)

  tree type = TREE_TYPE (gimple_assign_lhs (stmt));

  if (TREE_CODE (type) != INTEGER_TYPE
      && TREE_CODE (type) != FIXED_POINT_TYPE)
    return false;

  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,

  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,

  if (*type1_out == NULL)
    {
      if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
        return false;
      *type1_out = *type2_out;
    }

  if (*type2_out == NULL)
    {
      if (!int_fits_type_p (*rhs2_out, *type1_out))
        return false;
      *type2_out = *type1_out;
    }

  /* Ensure that the larger of the two operands comes first.  */
  if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))

      *type1_out = *type2_out;

      *rhs1_out = *rhs2_out;
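/* Editor's illustrative sketch (added commentary): for

     #include <stdint.h>

     uint64_t
     scale (uint32_t x)
     {
       return (uint64_t) x * 1000;
     }

   the first operand reports unwidened type uint32_t while the constant
   1000 leaves its type slot NULL, so *type2_out is copied from *type1_out
   after the int_fits_type_p check; the final swap then guarantees the
   wider of the two unwidened types always comes first.  */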
/* Process a single gimple statement STMT, which has a MULT_EXPR as
   its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
   value is true iff we converted the statement.  */

convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)

  tree lhs, rhs1, rhs2, type, type1, type2;
  enum insn_code handler;
  machine_mode to_mode, from_mode, actual_mode;
  int actual_precision;
  location_t loc = gimple_location (stmt);
  bool from_unsigned1, from_unsigned2;

  lhs = gimple_assign_lhs (stmt);
  type = TREE_TYPE (lhs);
  if (TREE_CODE (type) != INTEGER_TYPE)
    return false;

  if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
    return false;

  to_mode = TYPE_MODE (type);
  from_mode = TYPE_MODE (type1);
  from_unsigned1 = TYPE_UNSIGNED (type1);
  from_unsigned2 = TYPE_UNSIGNED (type2);

  if (from_unsigned1 && from_unsigned2)
    op = umul_widen_optab;
  else if (!from_unsigned1 && !from_unsigned2)
    op = smul_widen_optab;
  else
    op = usmul_widen_optab;

  handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,

  if (handler == CODE_FOR_nothing)
    {
      if (op != smul_widen_optab)
        {
          /* We can use a signed multiply with unsigned types as long as
             there is a wider mode to use, or it is the smaller of the two
             types that is unsigned.  Note that type1 >= type2, always.  */
          if ((TYPE_UNSIGNED (type1)
               && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
              || (TYPE_UNSIGNED (type2)
                  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
            {
              from_mode = GET_MODE_WIDER_MODE (from_mode);
              if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
                return false;
            }

          op = smul_widen_optab;
          handler = find_widening_optab_handler_and_mode (op, to_mode,

          if (handler == CODE_FOR_nothing)
            return false;

          from_unsigned1 = from_unsigned2 = false;
        }
      else
        return false;
    }

  /* Ensure that the inputs to the handler are in the correct precision
     for the opcode.  This will be the full mode size.  */
  actual_precision = GET_MODE_PRECISION (actual_mode);
  if (2 * actual_precision > TYPE_PRECISION (type))
    return false;
  if (actual_precision != TYPE_PRECISION (type1)
      || from_unsigned1 != TYPE_UNSIGNED (type1))
    rhs1 = build_and_insert_cast (gsi, loc,
                                  build_nonstandard_integer_type
                                  (actual_precision, from_unsigned1), rhs1);
  if (actual_precision != TYPE_PRECISION (type2)
      || from_unsigned2 != TYPE_UNSIGNED (type2))
    rhs2 = build_and_insert_cast (gsi, loc,
                                  build_nonstandard_integer_type
                                  (actual_precision, from_unsigned2), rhs2);

  /* Handle constants.  */
  if (TREE_CODE (rhs1) == INTEGER_CST)
    rhs1 = fold_convert (type1, rhs1);
  if (TREE_CODE (rhs2) == INTEGER_CST)
    rhs2 = fold_convert (type2, rhs2);

  gimple_assign_set_rhs1 (stmt, rhs1);
  gimple_assign_set_rhs2 (stmt, rhs2);
  gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);

  widen_mul_stats.widen_mults_inserted++;
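/* Editor's note (illustrative, simplified gimple-like pseudo-dump): on a
   target with a 32x32->64 widening multiply pattern, the transformation
   above rewrites

     _1 = (long long int) a_2(D);
     _3 = (long long int) b_4(D);
     c_5 = _1 * _3;

   into

     c_5 = a_2(D) w* b_4(D);

   i.e. the MULT_EXPR becomes a WIDEN_MULT_EXPR on the narrow operands and
   the feeding conversions are left for dead-code elimination.  */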
/* Process a single gimple statement STMT, which is found at the
   iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
   rhs (given by CODE), and try to convert it into a
   WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
   is true iff we converted the statement.  */

convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
                            enum tree_code code)

  gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
  gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
  tree type, type1, type2, optype;
  tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
  enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
  enum tree_code wmult_code;
  enum insn_code handler;
  machine_mode to_mode, from_mode, actual_mode;
  location_t loc = gimple_location (stmt);
  int actual_precision;
  bool from_unsigned1, from_unsigned2;

  lhs = gimple_assign_lhs (stmt);
  type = TREE_TYPE (lhs);
  if (TREE_CODE (type) != INTEGER_TYPE
      && TREE_CODE (type) != FIXED_POINT_TYPE)
    return false;

  if (code == MINUS_EXPR)
    wmult_code = WIDEN_MULT_MINUS_EXPR;
  else
    wmult_code = WIDEN_MULT_PLUS_EXPR;

  rhs1 = gimple_assign_rhs1 (stmt);
  rhs2 = gimple_assign_rhs2 (stmt);

  if (TREE_CODE (rhs1) == SSA_NAME)
    {
      rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
      if (is_gimple_assign (rhs1_stmt))
        rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
    }

  if (TREE_CODE (rhs2) == SSA_NAME)
    {
      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
      if (is_gimple_assign (rhs2_stmt))
        rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
    }

  /* Allow for one conversion statement between the multiply
     and addition/subtraction statement.  If there is more than
     one conversion then we assume they would invalidate this
     transformation.  If that's not the case then they should have
     been folded before now.  */
  if (CONVERT_EXPR_CODE_P (rhs1_code))

      conv1_stmt = rhs1_stmt;
      rhs1 = gimple_assign_rhs1 (rhs1_stmt);
      if (TREE_CODE (rhs1) == SSA_NAME)
        {
          rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
          if (is_gimple_assign (rhs1_stmt))
            rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
        }

  if (CONVERT_EXPR_CODE_P (rhs2_code))

      conv2_stmt = rhs2_stmt;
      rhs2 = gimple_assign_rhs1 (rhs2_stmt);
      if (TREE_CODE (rhs2) == SSA_NAME)
        {
          rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
          if (is_gimple_assign (rhs2_stmt))
            rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
        }

  /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
     is_widening_mult_p, but we still need the rhs returns.

     It might also appear that it would be sufficient to use the existing
     operands of the widening multiply, but that would limit the choice of
     multiply-and-accumulate instructions.

     If the widened-multiplication result has more than one use, it is
     probably wiser not to do the conversion.  */
  if (code == PLUS_EXPR
      && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))

      if (!has_single_use (rhs1)
          || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;

      conv_stmt = conv1_stmt;

  else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)

      if (!has_single_use (rhs2)
          || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;

      conv_stmt = conv2_stmt;
  to_mode = TYPE_MODE (type);
  from_mode = TYPE_MODE (type1);
  from_unsigned1 = TYPE_UNSIGNED (type1);
  from_unsigned2 = TYPE_UNSIGNED (type2);

  /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
  if (from_unsigned1 != from_unsigned2)

      if (!INTEGRAL_TYPE_P (type))
        return false;

      /* We can use a signed multiply with unsigned types as long as
         there is a wider mode to use, or it is the smaller of the two
         types that is unsigned.  Note that type1 >= type2, always.  */
           && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
              && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))

          from_mode = GET_MODE_WIDER_MODE (from_mode);
          if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
            return false;

      from_unsigned1 = from_unsigned2 = false;
      optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),

  /* If there was a conversion between the multiply and addition
     then we need to make sure it fits a multiply-and-accumulate.
     There should be a single mode change which does not change the
     sign.  */

      /* We use the original, unmodified data types for this.  */
      tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
      tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
      int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
      bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);

      if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
        {
          /* Conversion is a truncate.  */
          if (TYPE_PRECISION (to_type) < data_size)
            return false;
        }
      else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
        {
          /* Conversion is an extend.  Check it's the right sort.  */
          if (TYPE_UNSIGNED (from_type) != is_unsigned
              && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
            return false;
        }
      /* else convert is a no-op for our purposes.  */

  /* Verify that the machine can perform a widening multiply
     accumulate in this mode/signedness combination, otherwise
     this transformation is likely to pessimize code.  */
  this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
  handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
                                                  from_mode, 0, &actual_mode);

  if (handler == CODE_FOR_nothing)
    return false;

  /* Ensure that the inputs to the handler are in the correct precision
     for the opcode.  This will be the full mode size.  */
  actual_precision = GET_MODE_PRECISION (actual_mode);
  if (actual_precision != TYPE_PRECISION (type1)
      || from_unsigned1 != TYPE_UNSIGNED (type1))
    mult_rhs1 = build_and_insert_cast (gsi, loc,
                                       build_nonstandard_integer_type
                                       (actual_precision, from_unsigned1),
  if (actual_precision != TYPE_PRECISION (type2)
      || from_unsigned2 != TYPE_UNSIGNED (type2))
    mult_rhs2 = build_and_insert_cast (gsi, loc,
                                       build_nonstandard_integer_type
                                       (actual_precision, from_unsigned2),

  if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
    add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);

  /* Handle constants.  */
  if (TREE_CODE (mult_rhs1) == INTEGER_CST)
    mult_rhs1 = fold_convert (type1, mult_rhs1);
  if (TREE_CODE (mult_rhs2) == INTEGER_CST)
    mult_rhs2 = fold_convert (type2, mult_rhs2);

  gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
  update_stmt (gsi_stmt (*gsi));
  widen_mul_stats.maccs_inserted++;
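/* Editor's illustrative sketch (not from the original sources): the source
   shape that typically reaches the WIDEN_MULT_PLUS_EXPR/WIDEN_MULT_MINUS_EXPR
   rewrite above is a multiply-accumulate such as

     long long
     dot (const int *a, const int *b, int n)
     {
       long long acc = 0;
       for (int i = 0; i < n; i++)
         acc += (long long) a[i] * b[i];
       return acc;
     }

   provided the optab query above finds a matching widening
   multiply-accumulate instruction for the target.  */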
/* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
   with uses in additions and subtractions to form fused multiply-add
   operations.  Returns true if successful and MUL_STMT should be removed.  */

convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)

  tree mul_result = gimple_get_lhs (mul_stmt);
  tree type = TREE_TYPE (mul_result);
  gimple use_stmt, neguse_stmt;
  use_operand_p use_p;
  imm_use_iterator imm_iter;

  if (FLOAT_TYPE_P (type)
      && flag_fp_contract_mode == FP_CONTRACT_OFF)
    return false;

  /* We don't want to do bitfield reduction ops.  */
  if (INTEGRAL_TYPE_P (type)
      && (TYPE_PRECISION (type)
          != GET_MODE_PRECISION (TYPE_MODE (type))))
    return false;

  /* If the target doesn't support it, don't generate it.  We assume that
     if fma isn't available then fms, fnma or fnms are not either.  */
  if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
    return false;

  /* If the multiplication has zero uses, it is kept around probably because
     of -fnon-call-exceptions.  Don't optimize it away in that case,
     that is DCE's job.  */
  if (has_zero_uses (mul_result))
    return false;

  /* Make sure that the multiplication statement becomes dead after
     the transformation, thus that all uses are transformed to FMAs.
     This means we assume that an FMA operation has the same cost
     as an addition.  */
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)

      enum tree_code use_code;
      tree result = mul_result;
      bool negate_p = false;

      use_stmt = USE_STMT (use_p);

      if (is_gimple_debug (use_stmt))
        continue;

      /* For now restrict this operation to single basic blocks.  In theory
         we would want to support sinking the multiplication into a
         conditional use, to form a fma in the then block and sink the
         multiplication to the else block.  */
      if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
        return false;

      if (!is_gimple_assign (use_stmt))
        return false;

      use_code = gimple_assign_rhs_code (use_stmt);

      /* A negate on the multiplication leads to FNMA.  */
      if (use_code == NEGATE_EXPR)

          result = gimple_assign_lhs (use_stmt);

          /* Make sure the negate statement becomes dead with this
             single transformation.  */
          if (!single_imm_use (gimple_assign_lhs (use_stmt),
                               &use_p, &neguse_stmt))
            return false;

          /* Make sure the multiplication isn't also used on that stmt.  */
          FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
            if (USE_FROM_PTR (usep) == mul_result)
              return false;

          use_stmt = neguse_stmt;
          if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
            return false;
          if (!is_gimple_assign (use_stmt))
            return false;

          use_code = gimple_assign_rhs_code (use_stmt);

          if (gimple_assign_rhs2 (use_stmt) == result)
            negate_p = !negate_p;

          /* FMA can only be formed from PLUS and MINUS.  */
      /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
         by a MULT_EXPR that we'll visit later, we might be able to
         get a more profitable match with fnma.
         OTOH, if we don't, a negate / fma pair has likely lower latency
         than a mult / subtract pair.  */
      if (use_code == MINUS_EXPR && !negate_p
          && gimple_assign_rhs1 (use_stmt) == result
          && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
          && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
        {
          tree rhs2 = gimple_assign_rhs2 (use_stmt);

          if (TREE_CODE (rhs2) == SSA_NAME)
            {
              gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
              if (has_single_use (rhs2)
                  && is_gimple_assign (stmt2)
                  && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
                return false;
            }
        }

      /* We can't handle a * b + a * b.  */
      if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
        return false;

      /* While it is possible to validate whether or not the exact form
         that we've recognized is available in the backend, the assumption
         is that the transformation is never a loss.  For instance, suppose
         the target only has the plain FMA pattern available.  Consider
         a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
         is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
         still have 3 operations, but in the FMA form the two NEGs are
         independent and could be run in parallel.  */

  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)

      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      enum tree_code use_code;
      tree addop, mulop1 = op1, result = mul_result;
      bool negate_p = false;

      if (is_gimple_debug (use_stmt))
        continue;

      use_code = gimple_assign_rhs_code (use_stmt);
      if (use_code == NEGATE_EXPR)

          result = gimple_assign_lhs (use_stmt);
          single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
          gsi_remove (&gsi, true);
          release_defs (use_stmt);

          use_stmt = neguse_stmt;
          gsi = gsi_for_stmt (use_stmt);
          use_code = gimple_assign_rhs_code (use_stmt);

      if (gimple_assign_rhs1 (use_stmt) == result)

          addop = gimple_assign_rhs2 (use_stmt);
          /* a * b - c -> a * b + (-c)  */
          if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
            addop = force_gimple_operand_gsi (&gsi,
                                              build1 (NEGATE_EXPR,
                                              true, NULL_TREE, true,
      else

          addop = gimple_assign_rhs1 (use_stmt);
          /* a - b * c -> (-b) * c + a  */
          if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
            negate_p = !negate_p;

        mulop1 = force_gimple_operand_gsi (&gsi,
                                           build1 (NEGATE_EXPR,
                                           true, NULL_TREE, true,

      fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
                                               gimple_assign_lhs (use_stmt),
      gsi_replace (&gsi, fma_stmt, true);
      widen_mul_stats.fmas_inserted++;
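/* Editor's illustrative sketch (added commentary; assumes FP contraction is
   enabled and the target has an fma pattern): each use of the product is
   rewritten in place, e.g. with double a, b, c

     a * b + c      ->  FMA_EXPR <a, b, c>
     a * b - c      ->  FMA_EXPR <a, b, -c>
     c - a * b      ->  FMA_EXPR <-a, b, c>

   and the multiplication itself, now dead, is removed by the caller.  */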
/* Find integer multiplications where the operands are extended from
   smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
   where appropriate.  */

const pass_data pass_data_optimize_widening_mul =
{
  GIMPLE_PASS, /* type */
  "widening_mul", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_optimize_widening_mul : public gimple_opt_pass
{
public:
  pass_optimize_widening_mul (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
  {
    return flag_expensive_optimizations && optimize;
  }

  virtual unsigned int execute (function *);

}; // class pass_optimize_widening_mul
pass_optimize_widening_mul::execute (function *fun)

  bool cfg_changed = false;

  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));

  FOR_EACH_BB_FN (bb, fun)

      gimple_stmt_iterator gsi;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)

          gimple stmt = gsi_stmt (gsi);
          enum tree_code code;

          if (is_gimple_assign (stmt))

              code = gimple_assign_rhs_code (stmt);

                  if (!convert_mult_to_widen (stmt, &gsi)
                      && convert_mult_to_fma (stmt,
                                              gimple_assign_rhs1 (stmt),
                                              gimple_assign_rhs2 (stmt)))

                      gsi_remove (&gsi, true);
                      release_defs (stmt);

                  convert_plusminus_to_widen (&gsi, stmt, code);

          else if (is_gimple_call (stmt)
                   && gimple_call_lhs (stmt))

              tree fndecl = gimple_call_fndecl (stmt);
                  && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)

                  switch (DECL_FUNCTION_CODE (fndecl))

                      if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
                          && REAL_VALUES_EQUAL
                             (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
                          && convert_mult_to_fma (stmt,
                                                  gimple_call_arg (stmt, 0),
                                                  gimple_call_arg (stmt, 0)))

                          unlink_stmt_vdef (stmt);
                          if (gsi_remove (&gsi, true)
                              && gimple_purge_dead_eh_edges (bb))
                            cfg_changed = true;
                          release_defs (stmt);

  statistics_counter_event (fun, "widening multiplications inserted",
                            widen_mul_stats.widen_mults_inserted);
  statistics_counter_event (fun, "widening maccs inserted",
                            widen_mul_stats.maccs_inserted);
  statistics_counter_event (fun, "fused multiply-adds inserted",
                            widen_mul_stats.fmas_inserted);

  return cfg_changed ? TODO_cleanup_cfg : 0;

make_pass_optimize_widening_mul (gcc::context *ctxt)
{
  return new pass_optimize_widening_mul (ctxt);
}