1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
43 First of all, with some experiments it was found out that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor.  This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has less than N divisions *post-dominating* it.
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
63 and walk it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
89 #include "coretypes.h"
96 #include "fold-const.h"
98 #include "hard-reg-set.h"
100 #include "dominance.h"
102 #include "basic-block.h"
103 #include "tree-ssa-alias.h"
104 #include "internal-fn.h"
105 #include "gimple-fold.h"
106 #include "gimple-expr.h"
109 #include "gimple-iterator.h"
110 #include "gimplify.h"
111 #include "gimplify-me.h"
112 #include "stor-layout.h"
113 #include "gimple-ssa.h"
114 #include "tree-cfg.h"
115 #include "tree-phinodes.h"
116 #include "ssa-iterators.h"
117 #include "stringpool.h"
118 #include "tree-ssanames.h"
120 #include "insn-config.h"
125 #include "emit-rtl.h"
129 #include "tree-dfa.h"
130 #include "tree-ssa.h"
131 #include "tree-pass.h"
132 #include "alloc-pool.h"
134 #include "gimple-pretty-print.h"
135 #include "builtins.h"
138 /* FIXME: RTL headers have to be included here for optabs. */
139 #include "rtl.h" /* Because optabs.h wants enum rtx_code. */
140 #include "expr.h" /* Because optabs.h wants sepops. */
141 #include "insn-codes.h"
/* NOTE(review): truncated extract -- the embedded original line numbers
   jump, so several member declarations (e.g. the `struct occurrence {`
   opener and the BB field itself) are missing from this view.  Comments
   describe only what is visible.  */
144 /* This structure represents one basic block that either computes a
145 division, or is a common dominator for basic block that compute a
148 /* The basic block represented by this structure. */
151 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
155 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
156 was inserted in BB. */
157 gimple recip_def_stmt
;
159 /* Pointer to a list of "struct occurrence"s for blocks dominated
161 struct occurrence
*children
;
163 /* Pointer to the next "struct occurrence"s in the list of blocks
164 sharing a common dominator. */
165 struct occurrence
*next
;
167 /* The number of divisions that are in BB before compute_merit. The
168 number of divisions that are in BB or post-dominate it after
172 /* True if the basic block has a division, false if it is a common
173 dominator for basic blocks that do. If it is false and trapping
174 math is active, BB is not a candidate for inserting a reciprocal. */
175 bool bb_has_division
;
/* NOTE(review): truncated extract -- the `static struct { ... }` openers
   for the statistics aggregates (reciprocal_stats, sincos_stats, the
   nop/bswap stats, and the widening-mult stats) are missing here; only
   scattered fields and the trailing declarators are visible.  */
180 /* Number of 1.0/X ops inserted. */
183 /* Number of 1.0/FUNC ops inserted. */
189 /* Number of cexpi calls inserted. */
195 /* Number of hand-written 16-bit nop / bswaps found. */
198 /* Number of hand-written 32-bit nop / bswaps found. */
201 /* Number of hand-written 64-bit nop / bswaps found. */
203 } nop_stats
, bswap_stats
;
207 /* Number of widening multiplication ops inserted. */
208 int widen_mults_inserted
;
210 /* Number of integer multiply-and-accumulate ops inserted. */
213 /* Number of fp fused multiply-add ops inserted. */
217 /* The instance of "struct occurrence" representing the highest
218 interesting block in the dominator tree. */
219 static struct occurrence
*occ_head
;
221 /* Allocation pool for getting instances of "struct occurrence". */
222 static pool_allocator
<occurrence
> *occ_pool
;
/* NOTE(review): truncated extract -- braces and the `return occ;` tail
   are missing from this view (line-number gaps).  Visible behavior:
   allocate from occ_pool, zero the node, stash it in bb->aux, and link
   CHILDREN in.  */
226 /* Allocate and return a new struct occurrence for basic block BB, and
227 whose children list is headed by CHILDREN. */
228 static struct occurrence
*
229 occ_new (basic_block bb
, struct occurrence
*children
)
231 struct occurrence
*occ
;
/* The new node is also cached in BB->aux so register_division_in can
   find it again without a search.  */
233 bb
->aux
= occ
= occ_pool
->allocate ();
234 memset (occ
, 0, sizeof (struct occurrence
));
237 occ
->children
= children
;
/* NOTE(review): truncated extract -- braces, the `dom == bb` branch body
   and several statements are missing (line-number gaps); do not infer
   full control flow from this view.  */
242 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
243 list of "struct occurrence"s, one per basic block, having IDOM as
244 their common dominator.
246 We try to insert NEW_OCC as deep as possible in the tree, and we also
247 insert any other block that is a common dominator for BB and one
248 block already in the tree. */
251 insert_bb (struct occurrence
*new_occ
, basic_block idom
,
252 struct occurrence
**p_head
)
254 struct occurrence
*occ
, **p_occ
;
/* Walk the sibling list, classifying each existing node against the
   new block by their nearest common dominator.  */
256 for (p_occ
= p_head
; (occ
= *p_occ
) != NULL
; )
258 basic_block bb
= new_occ
->bb
, occ_bb
= occ
->bb
;
259 basic_block dom
= nearest_common_dominator (CDI_DOMINATORS
, occ_bb
, bb
);
262 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
265 occ
->next
= new_occ
->children
;
266 new_occ
->children
= occ
;
268 /* Try the next block (it may as well be dominated by BB). */
271 else if (dom
== occ_bb
)
273 /* OCC_BB dominates BB. Tail recurse to look deeper. */
274 insert_bb (new_occ
, dom
, &occ
->children
);
278 else if (dom
!= idom
)
280 gcc_assert (!dom
->aux
);
282 /* There is a dominator between IDOM and BB, add it and make
283 two children out of NEW_OCC and OCC. First, remove OCC from
289 /* None of the previous blocks has DOM as a dominator: if we tail
290 recursed, we would reexamine them uselessly. Just switch BB with
291 DOM, and go on looking for blocks dominated by DOM. */
292 new_occ
= occ_new (dom
, new_occ
)
;
297 /* Nothing special, go on with the next element. */
302 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
303 new_occ
->next
= *p_head
;
/* NOTE(review): truncated extract -- the `if (occ == NULL)` guard around
   the occ_new/insert_bb pair is missing from this view (line-number
   gaps); presumably a node is created only when BB has none yet --
   confirm against the full source.  */
307 /* Register that we found a division in BB. */
310 register_division_in (basic_block bb
)
312 struct occurrence
*occ
;
/* Reuse the occurrence cached in bb->aux by occ_new, if any.  */
314 occ
= (struct occurrence
*) bb
->aux
;
317 occ
= occ_new (bb
, NULL
);
318 insert_bb (occ
, ENTRY_BLOCK_PTR_FOR_FN (cfun
), &occ_head
);
/* Mark BB as containing a real division and bump its count; these feed
   compute_merit / insert_reciprocals.  */
321 occ
->bb_has_division
= true;
322 occ
->num_divisions
++;
/* NOTE(review): truncated extract -- the declaration of `bb` and the
   surrounding braces are missing from this view (line-number gaps).  */
326 /* Compute the number of divisions that postdominate each block in OCC and
330 compute_merit (struct occurrence
*occ
)
332 struct occurrence
*occ_child
;
333 basic_block dom
= occ
->bb
;
/* Post-order: first finish the children, then accumulate their counts
   into OCC when they post-dominate the (single) successor of DOM.  */
335 for (occ_child
= occ
->children
; occ_child
; occ_child
= occ_child
->next
)
338 if (occ_child
->children
)
339 compute_merit (occ_child
);
342 bb
= single_noncomplex_succ (dom
);
346 if (dominated_by_p (CDI_POST_DOMINATORS
, bb
, occ_child
->bb
))
347 occ
->num_divisions
+= occ_child
->num_divisions
;
/* NOTE(review): truncated extract -- the return-type line and braces are
   missing from this view.  Predicate: USE_STMT is an RDIV_EXPR whose
   divisor (rhs2) is DEF and whose dividend (rhs1) is not DEF.  */
352 /* Return whether USE_STMT is a floating-point division by DEF. */
354 is_division_by (gimple use_stmt
, tree def
)
356 return is_gimple_assign (use_stmt
)
357 && gimple_assign_rhs_code (use_stmt
) == RDIV_EXPR
358 && gimple_assign_rhs2 (use_stmt
) == def
359 /* Do not recognize x / x as valid division, as we are getting
360 confused later by replacing all immediate uses x in such
362 && gimple_assign_rhs1 (use_stmt
) != def
;
/* NOTE(review): truncated extract -- declarations of `type`/`new_stmt`,
   the leading `if (recip_def == NULL_TREE ...` condition, and braces are
   missing from this view (line-number gaps).  */
365 /* Walk the subset of the dominator tree rooted at OCC, setting the
366 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
367 the given basic block. The field may be left NULL, of course,
368 if it is not possible or profitable to do the optimization.
370 DEF_BSI is an iterator pointing at the statement defining DEF.
371 If RECIP_DEF is set, a dominator already has a computation that can
375 insert_reciprocals (gimple_stmt_iterator
*def_gsi
, struct occurrence
*occ
,
376 tree def
, tree recip_def
, int threshold
)
380 gimple_stmt_iterator gsi
;
381 struct occurrence
*occ_child
;
/* Profitability/legality gate (visible part): with trapping math the
   block must itself contain a division, and it must be reached by at
   least THRESHOLD divisions.  */
384 && (occ
->bb_has_division
|| !flag_trapping_math
)
385 && occ
->num_divisions
>= threshold
)
387 /* Make a variable with the replacement and substitute it. */
388 type
= TREE_TYPE (def
);
389 recip_def
= create_tmp_reg (type
, "reciptmp");
390 new_stmt
= gimple_build_assign (recip_def
, RDIV_EXPR
,
391 build_one_cst (type
), def
);
393 if (occ
->bb_has_division
)
395 /* Case 1: insert before an existing division. */
396 gsi
= gsi_after_labels (occ
->bb
);
397 while (!gsi_end_p (gsi
) && !is_division_by (gsi_stmt (gsi
), def
))
400 gsi_insert_before (&gsi
, new_stmt
, GSI_SAME_STMT
);
402 else if (def_gsi
&& occ
->bb
== def_gsi
->bb
)
404 /* Case 2: insert right after the definition. Note that this will
405 never happen if the definition statement can throw, because in
406 that case the sole successor of the statement's basic block will
407 dominate all the uses as well. */
408 gsi_insert_after (def_gsi
, new_stmt
, GSI_NEW_STMT
);
412 /* Case 3: insert in a basic block not containing defs/uses. */
413 gsi
= gsi_after_labels (occ
->bb
);
414 gsi_insert_before (&gsi
, new_stmt
, GSI_SAME_STMT
);
417 reciprocal_stats
.rdivs_inserted
++;
419 occ
->recip_def_stmt
= new_stmt
;
/* Record the reciprocal available here and propagate it down the
   dominator-tree subset.  */
422 occ
->recip_def
= recip_def
;
423 for (occ_child
= occ
->children
; occ_child
; occ_child
= occ_child
->next
)
424 insert_reciprocals (def_gsi
, occ_child
, def
, recip_def
, threshold
);
/* NOTE(review): truncated extract -- braces and the return type are
   missing from this view (line-number gaps).  */
428 /* Replace the division at USE_P with a multiplication by the reciprocal, if
432 replace_reciprocal (use_operand_p use_p
)
434 gimple use_stmt
= USE_STMT (use_p
);
435 basic_block bb
= gimple_bb (use_stmt
);
436 struct occurrence
*occ
= (struct occurrence
*) bb
->aux
;
/* Only rewrite when optimizing BB for speed, a reciprocal is available
   here, and we are not rewriting the reciprocal statement itself.  */
438 if (optimize_bb_for_speed_p (bb
)
439 && occ
->recip_def
&& use_stmt
!= occ
->recip_def_stmt
)
/* x / d  -->  x * recip_def, then refold and update the statement.  */
441 gimple_stmt_iterator gsi
= gsi_for_stmt (use_stmt
);
442 gimple_assign_set_rhs_code (use_stmt
, MULT_EXPR
);
443 SET_USE (use_p
, occ
->recip_def
);
444 fold_stmt_inplace (&gsi
);
445 update_stmt (use_stmt
);
/* NOTE(review): truncated extract -- the assignment of `next`, the
   clearing of bb->aux, and the branch deciding between returning
   `child` vs recursing are missing from this view (line-number gaps).  */
450 /* Free OCC and return one more "struct occurrence" to be freed. */
452 static struct occurrence
*
453 free_bb (struct occurrence
*occ
)
455 struct occurrence
*child
, *next
;
457 /* First get the two pointers hanging off OCC. */
459 child
= occ
->children
;
461 occ_pool
->remove (occ
);
463 /* Now ensure that we don't recurse unless it is necessary. */
469 next
= free_bb (next
);
/* NOTE(review): truncated extract -- declarations of `use_p`/`use_stmt`,
   the `count++` accumulation, the compute_merit call before insertion,
   and the free_bb cleanup loop body are missing from this view
   (line-number gaps).  */
476 /* Look for floating-point divisions among DEF's uses, and try to
477 replace them by multiplications with the reciprocal. Add
478 as many statements computing the reciprocal as needed.
480 DEF must be a GIMPLE register of a floating-point type. */
483 execute_cse_reciprocals_1 (gimple_stmt_iterator
*def_gsi
, tree def
)
486 imm_use_iterator use_iter
;
487 struct occurrence
*occ
;
488 int count
= 0, threshold
;
490 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def
)) && is_gimple_reg (def
));
/* Pass 1: register every division by DEF into the dominator-tree
   subset rooted at occ_head.  */
492 FOR_EACH_IMM_USE_FAST (use_p
, use_iter
, def
)
494 gimple use_stmt
= USE_STMT (use_p
);
495 if (is_division_by (use_stmt
, def
))
497 register_division_in (gimple_bb (use_stmt
));
502 /* Do the expensive part only if we can hope to optimize something. */
503 threshold
= targetm
.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def
)));
504 if (count
>= threshold
)
/* Pass 2: insert 1.0/DEF computations where profitable ...  */
507 for (occ
= occ_head
; occ
; occ
= occ
->next
)
510 insert_reciprocals (def_gsi
, occ
, def
, NULL
, threshold
);
/* ... and rewrite the divisions into multiplications.  */
513 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, def
)
515 if (is_division_by (use_stmt
, def
))
517 FOR_EACH_IMM_USE_ON_STMT (use_p
, use_iter
)
518 replace_reciprocal (use_p
);
/* Finally tear down the per-DEF occurrence list.  */
523 for (occ
= occ_head
; occ
; )
/* NOTE(review): truncated extract -- the braces and the name/tv_id
   entries of the pass_data aggregate are missing from this view.  */
529 /* Go through all the floating-point SSA_NAMEs, and call
530 execute_cse_reciprocals_1 on each of them. */
533 const pass_data pass_data_cse_reciprocals
=
535 GIMPLE_PASS
, /* type */
537 OPTGROUP_NONE
, /* optinfo_flags */
539 PROP_ssa
, /* properties_required */
540 0, /* properties_provided */
541 0, /* properties_destroyed */
542 0, /* todo_flags_start */
543 TODO_update_ssa
, /* todo_flags_finish */
/* NOTE(review): truncated extract -- the class body braces and access
   specifiers are missing from this view.  Gate: run only when
   optimizing with -freciprocal-math.  */
546 class pass_cse_reciprocals
: public gimple_opt_pass
549 pass_cse_reciprocals (gcc::context
*ctxt
)
550 : gimple_opt_pass (pass_data_cse_reciprocals
, ctxt
)
553 /* opt_pass methods: */
554 virtual bool gate (function
*) { return optimize
&& flag_reciprocal_math
; }
555 virtual unsigned int execute (function
*);
557 }; // class pass_cse_reciprocals
/* Main driver of the reciprocal-CSE pass: seeds occurrences from
   function arguments, PHI results and single-def statements, then (when
   not optimizing for size) rewrites a/func(b) into a*rfunc(b) for
   target-provided reciprocal builtins.
   NOTE(review): truncated extract -- local declarations (bb, arg, def,
   stmt1, fndecl, md_code, ui, ...), loop-increment lines, braces,
   bb->aux clearing and the final `return 0;` are missing from this view
   (line-number gaps).  */
560 pass_cse_reciprocals::execute (function
*fun
)
/* Size the occurrence pool roughly to the CFG size.  */
565 occ_pool
= new pool_allocator
<occurrence
>
566 ("dominators for recip", n_basic_blocks_for_fn (fun
) / 3 + 1);
568 memset (&reciprocal_stats
, 0, sizeof (reciprocal_stats
));
569 calculate_dominance_info (CDI_DOMINATORS
);
570 calculate_dominance_info (CDI_POST_DOMINATORS
);
572 #ifdef ENABLE_CHECKING
573 FOR_EACH_BB_FN (bb
, fun
)
574 gcc_assert (!bb
->aux
);
/* Seed from floating-point parameter default defs.  */
577 for (arg
= DECL_ARGUMENTS (fun
->decl
); arg
; arg
= DECL_CHAIN (arg
))
578 if (FLOAT_TYPE_P (TREE_TYPE (arg
))
579 && is_gimple_reg (arg
))
581 tree name
= ssa_default_def (fun
, arg
);
583 execute_cse_reciprocals_1 (NULL
, name
);
586 FOR_EACH_BB_FN (bb
, fun
)
/* Seed from floating-point PHI results.  */
590 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
593 gphi
*phi
= gsi
.phi ();
594 def
= PHI_RESULT (phi
);
595 if (! virtual_operand_p (def
)
596 && FLOAT_TYPE_P (TREE_TYPE (def
)))
597 execute_cse_reciprocals_1 (NULL
, def
);
/* Seed from statements with a single floating-point SSA def.  */
600 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
603 gimple stmt
= gsi_stmt (gsi
);
605 if (gimple_has_lhs (stmt
)
606 && (def
= SINGLE_SSA_TREE_OPERAND (stmt
, SSA_OP_DEF
)) != NULL
607 && FLOAT_TYPE_P (TREE_TYPE (def
))
608 && TREE_CODE (def
) == SSA_NAME
)
609 execute_cse_reciprocals_1 (&gsi
, def
);
/* The a/func(b) -> a*rfunc(b) rewrite is a speed optimization only.  */
612 if (optimize_bb_for_size_p (bb
))
615 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
616 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
619 gimple stmt
= gsi_stmt (gsi
);
622 if (is_gimple_assign (stmt
)
623 && gimple_assign_rhs_code (stmt
) == RDIV_EXPR
)
625 tree arg1
= gimple_assign_rhs2 (stmt
);
628 if (TREE_CODE (arg1
) != SSA_NAME
)
631 stmt1
= SSA_NAME_DEF_STMT (arg1
);
/* The divisor must be the lhs of a call to a normal or
   machine-dependent builtin.  */
633 if (is_gimple_call (stmt1
)
634 && gimple_call_lhs (stmt1
)
635 && (fndecl
= gimple_call_fndecl (stmt1
))
636 && (DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_NORMAL
637 || DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_MD
))
639 enum built_in_function code
;
644 code
= DECL_FUNCTION_CODE (fndecl
);
645 md_code
= DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_MD
;
/* Ask the target for a reciprocal variant of the builtin.  */
647 fndecl
= targetm
.builtin_reciprocal (code
, md_code
, false);
651 /* Check that all uses of the SSA name are divisions,
652 otherwise replacing the defining statement will do
655 FOR_EACH_IMM_USE_FAST (use_p
, ui
, arg1
)
657 gimple stmt2
= USE_STMT (use_p
);
658 if (is_gimple_debug (stmt2
))
660 if (!is_gimple_assign (stmt2
)
661 || gimple_assign_rhs_code (stmt2
) != RDIV_EXPR
662 || gimple_assign_rhs1 (stmt2
) == arg1
663 || gimple_assign_rhs2 (stmt2
) != arg1
)
/* Retarget the call to the reciprocal builtin ...  */
672 gimple_replace_ssa_lhs (stmt1
, arg1
);
673 gimple_call_set_fndecl (stmt1
, fndecl
);
675 reciprocal_stats
.rfuncs_inserted
++;
/* ... and turn every division use into a multiplication.  */
677 FOR_EACH_IMM_USE_STMT (stmt
, ui
, arg1
)
679 gimple_stmt_iterator gsi
= gsi_for_stmt (stmt
);
680 gimple_assign_set_rhs_code (stmt
, MULT_EXPR
);
681 fold_stmt_inplace (&gsi
);
689 statistics_counter_event (fun
, "reciprocal divs inserted",
690 reciprocal_stats
.rdivs_inserted
);
691 statistics_counter_event (fun
, "reciprocal functions inserted",
692 reciprocal_stats
.rfuncs_inserted
);
694 free_dominance_info (CDI_DOMINATORS
);
695 free_dominance_info (CDI_POST_DOMINATORS
);
/* Factory used by the pass manager to instantiate the pass.
   NOTE(review): truncated extract -- the `gimple_opt_pass *` return-type
   line and braces are missing from this view.  */
703 make_pass_cse_reciprocals (gcc::context
*ctxt
)
705 return new pass_cse_reciprocals (ctxt
);
/* NOTE(review): truncated extract -- the initial `if (!*top_bb` test, the
   `else if`/`else` heads, the *top_bb update and the return statements
   are missing from this view (line-number gaps).  */
708 /* Records an occurrence at statement USE_STMT in the vector of trees
709 STMTS if it is dominated by *TOP_BB or dominates it or this basic block
710 is not yet initialized. Returns true if the occurrence was pushed on
711 the vector. Adjusts *TOP_BB to be the basic block dominating all
712 statements in the vector. */
715 maybe_record_sincos (vec
<gimple
> *stmts
,
716 basic_block
*top_bb
, gimple use_stmt
)
718 basic_block use_bb
= gimple_bb (use_stmt
);
/* USE_BB at or below *TOP_BB: record the statement.  */
720 && (*top_bb
== use_bb
721 || dominated_by_p (CDI_DOMINATORS
, use_bb
, *top_bb
)))
722 stmts
->safe_push (use_stmt
);
/* USE_BB above *TOP_BB: record and (presumably) hoist *TOP_BB.  */
724 || dominated_by_p (CDI_DOMINATORS
, *top_bb
, use_bb
))
726 stmts
->safe_push (use_stmt
);
/* NOTE(review): truncated extract -- `break`s between switch cases,
   declarations of `i`/`rhs`, the early `return false`, braces and the
   trailing `return cfg_changed;` are missing from this view
   (line-number gaps).  */
735 /* Look for sin, cos and cexpi calls with the same argument NAME and
736 create a single call to cexpi CSEing the result in this case.
737 We first walk over all immediate uses of the argument collecting
738 statements that we can CSE in a vector and in a second pass replace
739 the statement rhs with a REALPART or IMAGPART expression on the
740 result of the cexpi call we insert before the use statement that
741 dominates all other candidates. */
744 execute_cse_sincos_1 (tree name
)
746 gimple_stmt_iterator gsi
;
747 imm_use_iterator use_iter
;
748 tree fndecl
, res
, type
;
749 gimple def_stmt
, use_stmt
, stmt
;
750 int seen_cos
= 0, seen_sin
= 0, seen_cexpi
= 0;
751 auto_vec
<gimple
> stmts
;
752 basic_block top_bb
= NULL
;
754 bool cfg_changed
= false;
756 type
= TREE_TYPE (name
);
/* Pass 1: collect sin/cos/cexpi calls on NAME.  */
757 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, name
)
759 if (gimple_code (use_stmt
) != GIMPLE_CALL
760 || !gimple_call_lhs (use_stmt
)
761 || !(fndecl
= gimple_call_fndecl (use_stmt
))
762 || DECL_BUILT_IN_CLASS (fndecl
) != BUILT_IN_NORMAL
)
765 switch (DECL_FUNCTION_CODE (fndecl
))
767 CASE_FLT_FN (BUILT_IN_COS
):
768 seen_cos
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
771 CASE_FLT_FN (BUILT_IN_SIN
):
772 seen_sin
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
775 CASE_FLT_FN (BUILT_IN_CEXPI
):
776 seen_cexpi
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
/* Nothing to CSE with fewer than two distinct kinds of uses.  */
783 if (seen_cos
+ seen_sin
+ seen_cexpi
<= 1)
786 /* Simply insert cexpi at the beginning of top_bb but not earlier than
787 the name def statement. */
788 fndecl
= mathfn_built_in (type
, BUILT_IN_CEXPI
);
791 stmt
= gimple_build_call (fndecl
, 1, name
);
792 res
= make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl
)), stmt
, "sincostmp");
793 gimple_call_set_lhs (stmt
, res
);
795 def_stmt
= SSA_NAME_DEF_STMT (name
);
796 if (!SSA_NAME_IS_DEFAULT_DEF (name
)
797 && gimple_code (def_stmt
) != GIMPLE_PHI
798 && gimple_bb (def_stmt
) == top_bb
)
/* NAME is defined in TOP_BB itself: place cexpi right after it.  */
800 gsi
= gsi_for_stmt (def_stmt
);
801 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
805 gsi
= gsi_after_labels (top_bb
);
806 gsi_insert_before (&gsi
, stmt
, GSI_SAME_STMT
);
808 sincos_stats
.inserted
++;
810 /* And adjust the recorded old call sites. */
811 for (i
= 0; stmts
.iterate (i
, &use_stmt
); ++i
)
814 fndecl
= gimple_call_fndecl (use_stmt
);
816 switch (DECL_FUNCTION_CODE (fndecl
))
818 CASE_FLT_FN (BUILT_IN_COS
):
819 rhs
= fold_build1 (REALPART_EXPR
, type
, res
);
822 CASE_FLT_FN (BUILT_IN_SIN
):
823 rhs
= fold_build1 (IMAGPART_EXPR
, type
, res
);
826 CASE_FLT_FN (BUILT_IN_CEXPI
):
834 /* Replace call with a copy. */
835 stmt
= gimple_build_assign (gimple_call_lhs (use_stmt
), rhs
);
837 gsi
= gsi_for_stmt (use_stmt
);
838 gsi_replace (&gsi
, stmt
, true);
/* Removing the (possibly throwing) call may leave dead EH edges.  */
839 if (gimple_purge_dead_eh_edges (gimple_bb (stmt
)))
846 /* To evaluate powi(x,n), the floating point value x raised to the
847 constant integer exponent n, we use a hybrid algorithm that
848 combines the "window method" with look-up tables. For an
849 introduction to exponentiation algorithms and "addition chains",
850 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
851 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
852 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
853 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
855 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
856 multiplications to inline before calling the system library's pow
857 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
858 so this default never requires calling pow, powf or powl. */
/* NOTE(review): truncated extract -- the #endif for POWI_MAX_MULTS and
   the table's closing brace are missing from this view.  */
860 #ifndef POWI_MAX_MULTS
861 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
864 /* The size of the "optimal power tree" lookup table. All
865 exponents less than this value are simply looked up in the
866 powi_table below. This threshold is also used to size the
867 cache of pseudo registers that hold intermediate results. */
868 #define POWI_TABLE_SIZE 256
870 /* The size, in bits of the window, used in the "window method"
871 exponentiation algorithm. This is equivalent to a radix of
872 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
873 #define POWI_WINDOW_SIZE 3
875 /* The following table is an efficient representation of an
876 "optimal power tree". For each value, i, the corresponding
877 value, j, in the table states that an optimal evaluation
878 sequence for calculating pow(x,i) can be found by evaluating
879 pow(x,j)*pow(x,i-j). An optimal power tree for the first
880 100 integers is given in Knuth's "Seminumerical algorithms". */
882 static const unsigned char powi_table
[POWI_TABLE_SIZE
] =
884 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
885 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
886 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
887 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
888 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
889 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
890 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
891 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
892 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
893 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
894 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
895 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
896 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
897 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
898 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
899 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
900 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
901 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
902 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
903 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
904 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
905 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
906 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
907 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
908 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
909 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
910 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
911 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
912 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
913 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
914 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
915 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
/* NOTE(review): truncated extract -- the return-type line, braces and
   the cache-hit early return / cache[n] marking are missing from this
   view.  Recurrence: cost(n) = cost(n - t[n]) + cost(t[n]) + 1.  */
919 /* Return the number of multiplications required to calculate
920 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
921 subroutine of powi_cost. CACHE is an array indicating
922 which exponents have already been calculated. */
925 powi_lookup_cost (unsigned HOST_WIDE_INT n
, bool *cache
)
927 /* If we've already calculated this exponent, then this evaluation
928 doesn't require any additional multiplications. */
933 return powi_lookup_cost (n
- powi_table
[n
], cache
)
934 + powi_lookup_cost (powi_table
[n
], cache
) + 1;
/* NOTE(review): truncated extract -- the n == 0 early return, the
   `result` declaration/initialization and braces are missing from this
   view (line-number gaps).  */
937 /* Return the number of multiplications required to calculate
938 powi(x,n) for an arbitrary x, given the exponent N. This
939 function needs to be kept in sync with powi_as_mults below. */
942 powi_cost (HOST_WIDE_INT n
)
944 bool cache
[POWI_TABLE_SIZE
];
945 unsigned HOST_WIDE_INT digit
;
946 unsigned HOST_WIDE_INT val
;
952 /* Ignore the reciprocal when calculating the cost. */
953 val
= (n
< 0) ? -n
: n
;
955 /* Initialize the exponent cache. */
956 memset (cache
, 0, POWI_TABLE_SIZE
* sizeof (bool));
/* Window method: peel POWI_WINDOW_SIZE bits at a time until VAL fits
   in the lookup table; each window costs its table cost plus the
   squarings (POWI_WINDOW_SIZE) plus one multiply.  */
961 while (val
>= POWI_TABLE_SIZE
)
965 digit
= val
& ((1 << POWI_WINDOW_SIZE
) - 1);
966 result
+= powi_lookup_cost (digit
, cache
)
967 + POWI_WINDOW_SIZE
+ 1;
968 val
>>= POWI_WINDOW_SIZE
;
977 return result
+ powi_lookup_cost (val
, cache
);
/* NOTE(review): truncated extract -- the return-type line, the
   `mult_stmt` declaration, the branch heads selecting between the three
   decompositions (table / odd digit / even halving), the second `op1`
   for the even case and the final `return ssa_target;` are missing from
   this view (line-number gaps).  */
980 /* Recursive subroutine of powi_as_mults. This function takes the
981 array, CACHE, of already calculated exponents and an exponent N and
982 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
985 powi_as_mults_1 (gimple_stmt_iterator
*gsi
, location_t loc
, tree type
,
986 HOST_WIDE_INT n
, tree
*cache
)
988 tree op0
, op1
, ssa_target
;
989 unsigned HOST_WIDE_INT digit
;
/* Cache hit: reuse the previously built power.  */
992 if (n
< POWI_TABLE_SIZE
&& cache
[n
])
995 ssa_target
= make_temp_ssa_name (type
, NULL
, "powmult");
/* Small exponent: split per the optimal power tree.  */
997 if (n
< POWI_TABLE_SIZE
)
999 cache
[n
] = ssa_target
;
1000 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
- powi_table
[n
], cache
);
1001 op1
= powi_as_mults_1 (gsi
, loc
, type
, powi_table
[n
], cache
);
/* Large odd exponent: peel the low window bits.  */
1005 digit
= n
& ((1 << POWI_WINDOW_SIZE
) - 1);
1006 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
- digit
, cache
);
1007 op1
= powi_as_mults_1 (gsi
, loc
, type
, digit
, cache
);
/* Large even exponent: square the half power.  */
1011 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
>> 1, cache
);
1015 mult_stmt
= gimple_build_assign (ssa_target
, MULT_EXPR
, op0
, op1
);
1016 gimple_set_location (mult_stmt
, loc
);
1017 gsi_insert_before (gsi
, mult_stmt
, GSI_SAME_STMT
);
/* NOTE(review): truncated extract -- declarations of `target`/`div_stmt`,
   the n == 0 guard before `return build_real`, the cache[1] = arg0
   seeding, the `n >= 0` early return and the final `return target;` are
   missing from this view (line-number gaps).  */
1022 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1023 This function needs to be kept in sync with powi_cost above. */
1026 powi_as_mults (gimple_stmt_iterator
*gsi
, location_t loc
,
1027 tree arg0
, HOST_WIDE_INT n
)
1029 tree cache
[POWI_TABLE_SIZE
], result
, type
= TREE_TYPE (arg0
);
/* x**0 == 1.0 (visible early return).  */
1034 return build_real (type
, dconst1
);
1036 memset (cache
, 0, sizeof (cache
));
/* Build |n| as a multiplication chain.  */
1039 result
= powi_as_mults_1 (gsi
, loc
, type
, (n
< 0) ? -n
: n
, cache
);
1043 /* If the original exponent was negative, reciprocate the result. */
1044 target
= make_temp_ssa_name (type
, NULL
, "powmult");
1045 div_stmt
= gimple_build_assign (target
, RDIV_EXPR
,
1046 build_real (type
, dconst1
), result
);
1047 gimple_set_location (div_stmt
, loc
);
1048 gsi_insert_before (gsi
, div_stmt
, GSI_SAME_STMT
);
/* NOTE(review): truncated extract -- the return-type line, the first
   conjunct guarding against the most negative N, and the fallback
   `return NULL_TREE;` are missing from this view (line-number gaps).  */
1053 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1054 location info LOC. If the arguments are appropriate, create an
1055 equivalent sequence of statements prior to GSI using an optimal
1056 number of multiplications, and return an expression holding the
1060 gimple_expand_builtin_powi (gimple_stmt_iterator
*gsi
, location_t loc
,
1061 tree arg0
, HOST_WIDE_INT n
)
1063 /* Avoid largest negative number. */
/* Expand when cheap (|n| tiny) or when optimizing for speed and the
   multiplication count stays within POWI_MAX_MULTS.  */
1065 && ((n
>= -1 && n
<= 2)
1066 || (optimize_function_for_speed_p (cfun
)
1067 && powi_cost (n
) <= POWI_MAX_MULTS
)))
1068 return powi_as_mults (gsi
, loc
, arg0
, n
);
/* NOTE(review): truncated extract -- the parameter tail (fn, arg), local
   declarations of `call_stmt`/`ssa_target`, braces and the final
   `return ssa_target;` are missing from this view (line-number gaps).  */
1073 /* Build a gimple call statement that calls FN with argument ARG.
1074 Set the lhs of the call statement to a fresh SSA name. Insert the
1075 statement prior to GSI's current position, and return the fresh
1079 build_and_insert_call (gimple_stmt_iterator
*gsi
, location_t loc
,
1085 call_stmt
= gimple_build_call (fn
, 1, arg
);
1086 ssa_target
= make_temp_ssa_name (TREE_TYPE (arg
), NULL
, "powroot");
1087 gimple_set_lhs (call_stmt
, ssa_target
);
1088 gimple_set_location (call_stmt
, loc
);
1089 gsi_insert_before (gsi
, call_stmt
, GSI_SAME_STMT
);
/* NOTE(review): truncated extract -- the return-type line, braces and
   the final `return result;` are missing from this view.  */
1094 /* Build a gimple binary operation with the given CODE and arguments
1095 ARG0, ARG1, assigning the result to a new SSA name for variable
1096 TARGET. Insert the statement prior to GSI's current position, and
1097 return the fresh SSA name.*/
1100 build_and_insert_binop (gimple_stmt_iterator
*gsi
, location_t loc
,
1101 const char *name
, enum tree_code code
,
1102 tree arg0
, tree arg1
)
1104 tree result
= make_temp_ssa_name (TREE_TYPE (arg0
), NULL
, name
);
1105 gassign
*stmt
= gimple_build_assign (result
, code
, arg0
, arg1
);
1106 gimple_set_location (stmt
, loc
);
1107 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
/* NOTE(review): truncated extract -- the return-type line, braces and
   the final `return result;` are missing from this view.  */
1111 /* Build a gimple reference operation with the given CODE and argument
1112 ARG, assigning the result to a new SSA name of TYPE with NAME.
1113 Insert the statement prior to GSI's current position, and return
1114 the fresh SSA name. */
1117 build_and_insert_ref (gimple_stmt_iterator
*gsi
, location_t loc
, tree type
,
1118 const char *name
, enum tree_code code
, tree arg0
)
1120 tree result
= make_temp_ssa_name (type
, NULL
, name
);
1121 gimple stmt
= gimple_build_assign (result
, build1 (code
, type
, arg0
));
1122 gimple_set_location (stmt
, loc
);
1123 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
/* NOTE(review): truncated extract -- the return-type line, braces and
   the final `return result;` are missing from this view.  */
1127 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1128 prior to GSI's current position, and return the fresh SSA name. */
1131 build_and_insert_cast (gimple_stmt_iterator
*gsi
, location_t loc
,
1132 tree type
, tree val
)
1134 tree result
= make_ssa_name (type
);
1135 gassign
*stmt
= gimple_build_assign (result
, NOP_EXPR
, val
);
1136 gimple_set_location (stmt
, loc
);
1137 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
/* Bookkeeping filled in by representable_as_half_series_p and consumed
   by the pow-synthesis/dump helpers below.
   NOTE(review): truncated extract -- the braces and the `factors` member
   (written via info->factors elsewhere in this file) are missing from
   this view.  */
1141 struct pow_synth_sqrt_info
1144 unsigned int deepest
;
1145 unsigned int num_mults
;
/* NOTE(review): truncated extract -- the return-type line, the branch
   bodies (early `return true/false`, the remainder update and num_mults
   accounting) and the final return are missing from this view
   (line-number gaps).  */
1148 /* Return true iff the real value C can be represented as a
1149 sum of powers of 0.5 up to N. That is:
1150 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1151 Record in INFO the various parameters of the synthesis algorithm such
1152 as the factors a[i], the maximum 0.5 power and the number of
1153 multiplications that will be required. */
1156 representable_as_half_series_p (REAL_VALUE_TYPE c
, unsigned n
,
1157 struct pow_synth_sqrt_info
*info
)
1159 REAL_VALUE_TYPE factor
= dconsthalf
;
1160 REAL_VALUE_TYPE remainder
= c
;
1163 info
->num_mults
= 0;
1164 memset (info
->factors
, 0, n
* sizeof (bool));
/* Greedily subtract decreasing powers of 0.5 from the remainder.  */
1166 for (unsigned i
= 0; i
< n
; i
++)
1168 REAL_VALUE_TYPE res
;
1170 /* If something inexact happened bail out now. */
1171 if (REAL_ARITHMETIC (res
, MINUS_EXPR
, remainder
, factor
))
1174 /* We have hit zero. The number is representable as a sum
1175 of powers of 0.5. */
1176 if (REAL_VALUES_EQUAL (res
, dconst0
))
1178 info
->factors
[i
] = true;
1179 info
->deepest
= i
+ 1;
1182 else if (!REAL_VALUE_NEGATIVE (res
))
1185 info
->factors
[i
] = true;
1189 info
->factors
[i
] = false;
/* Next iteration considers the next power: factor *= 0.5.  */
1191 REAL_ARITHMETIC (factor
, MULT_EXPR
, factor
, dconsthalf
);
1196 /* Return the tree corresponding to FN being applied
1197 to ARG N times at GSI and LOC.
1198 Look up previous results from CACHE if need be.
1199 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1202 get_fn_chain (tree arg
, unsigned int n
, gimple_stmt_iterator
*gsi
,
1203 tree fn
, location_t loc
, tree
*cache
)
1205 tree res
= cache
[n
];
1208 tree prev
= get_fn_chain (arg
, n
- 1, gsi
, fn
, loc
, cache
);
1209 res
= build_and_insert_call (gsi
, loc
, fn
, prev
);
/* Print to STREAM the repeated application of function FNAME to ARG
   N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
   foo (foo (x)).  */

static void
print_nested_fn (FILE* stream, const char *fname, const char* arg,
		 unsigned int n)
{
  /* Base case: zero applications is just the argument itself.  */
  if (n == 0)
    {
      fprintf (stream, "%s", arg);
      return;
    }

  /* Open one call, recurse for the inner N-1 applications, close it.  */
  fprintf (stream, "%s (", fname);
  print_nested_fn (stream, fname, arg, n - 1);
  fprintf (stream, ")");
}
1234 /* Print to STREAM the fractional sequence of sqrt chains
1235 applied to ARG, described by INFO. Used for the dump file. */
1238 dump_fractional_sqrt_sequence (FILE *stream
, const char *arg
,
1239 struct pow_synth_sqrt_info
*info
)
1241 for (unsigned int i
= 0; i
< info
->deepest
; i
++)
1243 bool is_set
= info
->factors
[i
];
1246 print_nested_fn (stream
, "sqrt", arg
, i
+ 1);
1247 if (i
!= info
->deepest
- 1)
1248 fprintf (stream
, " * ");
1253 /* Print to STREAM a representation of raising ARG to an integer
1254 power N. Used for the dump file. */
1257 dump_integer_part (FILE *stream
, const char* arg
, HOST_WIDE_INT n
)
1260 fprintf (stream
, "powi (%s, " HOST_WIDE_INT_PRINT_DEC
")", arg
, n
);
1262 fprintf (stream
, "%s", arg
);
/* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
   square roots.  Place at GSI and LOC.  Limit the maximum depth
   of the sqrt chains to MAX_DEPTH.  Return the tree holding the
   result of the expanded sequence or NULL_TREE if the expansion failed.

   This routine assumes that ARG1 is a real number with a fractional part
   (the integer exponent case will have been handled earlier in
   gimple_expand_builtin_pow).

   For ARG1 > 0.0:
   * For ARG1 composed of a whole part WHOLE_PART and a fractional part
     FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
     FRAC_PART == ARG1 - WHOLE_PART:
     Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
     POW (ARG0, FRAC_PART) is expanded as a product of square root chains
     if it can be expressed as such, that is if FRAC_PART satisfies:
     FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
     where integer a[i] is either 0 or 1.

     Example:
     POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
     --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))

   For ARG1 < 0.0 there are two approaches:
   * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
         is calculated as above.

     Example:
     POW (x, -5.625) == 1.0 / POW (x, 5.625)
     --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))

   * (B) : WHOLE_PART := - ceil (abs (ARG1))
           FRAC_PART  := ARG1 - WHOLE_PART
           and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
     Example:
     POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
     --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))

   For ARG1 < 0.0 we choose between (A) and (B) depending on
   how many multiplications we'd have to do.
   So, for the example in (B): POW (x, -5.875), if we were to
   follow algorithm (A) we would produce:
   1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
   which contains more multiplications than approach (B).

   Hopefully, this approach will eliminate potentially expensive POW library
   calls when unsafe floating point math is enabled and allow the compiler to
   further optimise the multiplies, square roots and divides produced by this
   function.  */

static tree
expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
		     tree arg0, tree arg1, HOST_WIDE_INT max_depth)
{
  tree type = TREE_TYPE (arg0);
  machine_mode mode = TYPE_MODE (type);
  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
  /* True when a negative exponent is expanded as 1.0 / pow (x, -arg1)
     (approach (A) above); cleared if approach (B) is cheaper.  */
  bool one_over = true;

  if (!sqrtfn)
    return NULL_TREE;

  /* Only a compile-time constant exponent can be decomposed here.  */
  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);

  gcc_assert (max_depth > 0);
  /* cache[i] memoizes sqrt applied i times to ARG0.  */
  tree *cache = XALLOCAVEC (tree, max_depth + 1);

  struct pow_synth_sqrt_info synth_info;
  synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
  synth_info.deepest = 0;
  synth_info.num_mults = 0;

  bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
  REAL_VALUE_TYPE exp = real_value_abs (&exp_init);

  /* The whole and fractional parts of exp.  */
  REAL_VALUE_TYPE whole_part;
  REAL_VALUE_TYPE frac_part;

  real_floor (&whole_part, mode, &exp);
  REAL_ARITHMETIC (frac_part, MINUS_EXPR, exp, whole_part);

  /* Decomposition used by approach (B) for negative exponents.  */
  REAL_VALUE_TYPE ceil_whole = dconst0;
  REAL_VALUE_TYPE ceil_fract = dconst0;

  if (neg_exp)
    {
      real_ceil (&ceil_whole, mode, &exp);
      REAL_ARITHMETIC (ceil_fract, MINUS_EXPR, ceil_whole, exp);
    }

  if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
    return NULL_TREE;

  /* Check whether it's more profitable to not use 1.0 / ...  */
  if (neg_exp)
    {
      struct pow_synth_sqrt_info alt_synth_info;
      alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
      alt_synth_info.deepest = 0;
      alt_synth_info.num_mults = 0;

      /* Prefer (B) only when it is representable, no deeper, and needs
	 strictly fewer multiplications than (A).  */
      if (representable_as_half_series_p (ceil_fract, max_depth,
					  &alt_synth_info)
	  && alt_synth_info.deepest <= synth_info.deepest
	  && alt_synth_info.num_mults < synth_info.num_mults)
	{
	  whole_part = ceil_whole;
	  frac_part = ceil_fract;
	  synth_info.deepest = alt_synth_info.deepest;
	  synth_info.num_mults = alt_synth_info.num_mults;
	  memcpy (synth_info.factors, alt_synth_info.factors,
		  (max_depth + 1) * sizeof (bool));
	  one_over = false;
	}
    }

  HOST_WIDE_INT n = real_to_integer (&whole_part);
  REAL_VALUE_TYPE cint;
  real_from_integer (&cint, VOIDmode, n, SIGNED);

  /* The whole part must fit exactly in a HOST_WIDE_INT.  */
  if (!real_identical (&whole_part, &cint))
    return NULL_TREE;

  if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
    return NULL_TREE;

  memset (cache, 0, (max_depth + 1) * sizeof (tree));

  tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;

  /* Calculate the integer part of the exponent.  */
  if (n > 1)
    {
      integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
      if (!integer_res)
	return NULL_TREE;
    }

  if (dump_file)
    {
      char string[64];

      real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
      fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);

      if (neg_exp)
	{
	  if (one_over)
	    {
	      fprintf (dump_file, "1.0 / (");
	      dump_integer_part (dump_file, "x", n);
	      if (n > 0)
		fprintf (dump_file, " * ");
	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	      fprintf (dump_file, ")");
	    }
	  else
	    {
	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	      fprintf (dump_file, " / (");
	      dump_integer_part (dump_file, "x", n);
	      fprintf (dump_file, ")");
	    }
	}
      else
	{
	  dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	  if (n > 0)
	    fprintf (dump_file, " * ");
	  dump_integer_part (dump_file, "x", n);
	}

      fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
    }

  tree fract_res = NULL_TREE;
  cache[0] = arg0;

  /* Calculate the fractional part of the exponent.  */
  for (unsigned i = 0; i < synth_info.deepest; i++)
    {
      if (synth_info.factors[i])
	{
	  tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);

	  if (!fract_res)
	    fract_res = sqrt_chain;
	  else
	    fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
						fract_res, sqrt_chain);
	}
    }

  tree res = NULL_TREE;

  /* Combine the integer and fractional parts, reciprocating for
     negative exponents as decided above.  */
  if (neg_exp)
    {
      if (one_over)
	{
	  if (n > 0)
	    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
					  fract_res, integer_res);
	  else
	    res = fract_res;

	  res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
					build_real (type, dconst1), res);
	}
      else
	{
	  res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
					fract_res, integer_res);
	}
    }
  else
    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
				  fract_res, integer_res);
  return res;
}
/* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
   with location info LOC.  If possible, create an equivalent and
   less expensive sequence of statements prior to GSI, and return an
   expression holding the result.  */

static tree
gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
			   tree arg0, tree arg1)
{
  REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
  REAL_VALUE_TYPE c2, dconst3;
  HOST_WIDE_INT n;
  tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
  machine_mode mode;
  bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
  bool hw_sqrt_exists, c_is_int, c2_is_int;

  /* dconst1_4 = 0.25 (1.0 with the binary exponent lowered by 2).  */
  dconst1_4 = dconst1;
  SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);

  /* If the exponent isn't a constant, there's nothing of interest
     to be done.  */
  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  /* If the exponent is equivalent to an integer, expand to an optimal
     multiplication sequence when profitable.  */
  c = TREE_REAL_CST (arg1);
  n = real_to_integer (&c);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  c_is_int = real_identical (&c, &cint);

  if (c_is_int
      && ((n >= -1 && n <= 2)
	  || (flag_unsafe_math_optimizations
	      && speed_p
	      && powi_cost (n) <= POWI_MAX_MULTS)))
    return gimple_expand_builtin_powi (gsi, loc, arg0, n);

  /* Attempt various optimizations using sqrt and cbrt.  */
  type = TREE_TYPE (arg0);
  mode = TYPE_MODE (type);
  sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);

  /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
     unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
     sqrt(-0) = -0.  */
  if (sqrtfn
      && REAL_VALUES_EQUAL (c, dconsthalf)
      && !HONOR_SIGNED_ZEROS (mode))
    return build_and_insert_call (gsi, loc, sqrtfn, arg0);

  hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;

  /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
     optimizations since 1./3. is not exactly representable.  If x
     is negative and finite, the correct value of pow(x,1./3.) is
     a NaN with the "invalid" exception raised, because the value
     of 1./3. actually has an even denominator.  The correct value
     of cbrt(x) is a negative real value.  */
  cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
  dconst1_3 = real_value_truncate (mode, dconst_third ());

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && REAL_VALUES_EQUAL (c, dconst1_3))
    return build_and_insert_call (gsi, loc, cbrtfn, arg0);

  /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
     if we don't have a hardware sqrt insn.  */
  dconst1_6 = dconst1_3;
  SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && speed_p
      && hw_sqrt_exists
      && REAL_VALUES_EQUAL (c, dconst1_6))
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* cbrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
    }

  /* Attempt to expand the POW as a product of square root chains.
     Expand the 0.25 case even when optimising for size.  */
  if (flag_unsafe_math_optimizations
      && sqrtfn
      && hw_sqrt_exists
      && (speed_p || REAL_VALUES_EQUAL (c, dconst1_4))
      && !HONOR_SIGNED_ZEROS (mode))
    {
      unsigned int max_depth = speed_p
				? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
				: 2;

      tree expand_with_sqrts
	= expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);

      if (expand_with_sqrts)
	return expand_with_sqrts;
    }

  real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  c2_is_int = real_identical (&c2, &cint);

  /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into

     powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
     1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.

     Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
     different from pow(x, 1./3.) due to rounding and behavior with
     negative x, we need to constrain this transformation to unsafe
     math and positive x or finite math.  */
  real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
  real_round (&c2, mode, &c2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
  real_convert (&c2, mode, &c2);

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && real_identical (&c2, &c)
      && !c2_is_int
      && optimize_function_for_speed_p (cfun)
      && powi_cost (n / 3) <= POWI_MAX_MULTS)
    {
      tree powi_x_ndiv3 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
	 possible or profitable, give up.  Skip the degenerate case when
	 abs(n) < 3, where the result is always 1.  */
      if (absu_hwi (n) >= 3)
	{
	  powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
						     n / 3);
	  if (!powi_x_ndiv3)
	    return NULL_TREE;
	}

      /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
	 as that creates an unnecessary variable.  Instead, just produce
	 either cbrt(x) or cbrt(x) * cbrt(x).  */
      cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);

      if (absu_hwi (n) % 3 == 1)
	powi_cbrt_x = cbrt_x;
      else
	powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
					      cbrt_x, cbrt_x);

      /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
      if (absu_hwi (n) < 3)
	result = powi_cbrt_x;
      else
	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
					 powi_x_ndiv3, powi_cbrt_x);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
					 build_real (type, dconst1), result);

      return result;
    }

  /* No optimizations succeeded.  */
  return NULL_TREE;
}
/* ARG is the argument to a cabs builtin call in GSI with location info
   LOC.  Create a sequence of statements prior to GSI that calculates
   sqrt(R*R + I*I), where R and I are the real and imaginary components
   of ARG, respectively.  Return an expression holding the result.  */

static tree
gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
{
  tree real_part, imag_part, addend1, addend2, sum, result;
  /* ARG is a complex value; TYPE is its component (element) type.  */
  tree type = TREE_TYPE (TREE_TYPE (arg));
  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
  machine_mode mode = TYPE_MODE (type);

  /* sqrt(r*r + i*i) overflows where cabs would not, so only do this
     under unsafe math, when optimizing for speed, and only when a
     hardware sqrt instruction is available.  */
  if (!flag_unsafe_math_optimizations
      || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
      || !sqrtfn
      || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
    return NULL_TREE;

  real_part = build_and_insert_ref (gsi, loc, type, "cabs",
				    REALPART_EXPR, arg);
  addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
				    real_part, real_part);
  imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
				    IMAGPART_EXPR, arg);
  addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
				    imag_part, imag_part);
  sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
  result = build_and_insert_call (gsi, loc, sqrtfn, sum);

  return result;
}
/* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
   an optimal number of multiplies, when n is a constant.  */

const pass_data pass_data_cse_sincos =
{
  GIMPLE_PASS, /* type */
  "sincos", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
class pass_cse_sincos : public gimple_opt_pass
{
public:
  pass_cse_sincos (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_sincos, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      /* We no longer require either sincos or cexp, since powi expansion
	 piggybacks on this pass.  */
      return true;
    }

  virtual unsigned int execute (function *);

}; // class pass_cse_sincos
/* Walk every basic block of FUN looking for calls to the math builtins
   this pass knows how to improve: sin/cos/cexpi (CSE'd through
   execute_cse_sincos_1), pow, powi and cabs (expanded into cheaper
   inline sequences).  Returns TODO_cleanup_cfg if the CFG changed.  */
unsigned int
pass_cse_sincos::execute (function *fun)
{
  basic_block bb;
  bool cfg_changed = false;

  calculate_dominance_info (CDI_DOMINATORS);
  memset (&sincos_stats, 0, sizeof (sincos_stats));

  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      bool cleanup_eh = false;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple stmt = gsi_stmt (gsi);
	  tree fndecl;

	  /* Only the last stmt in a bb could throw, no need to call
	     gimple_purge_dead_eh_edges if we change something in the middle
	     of a basic block.  */
	  cleanup_eh = false;

	  if (is_gimple_call (stmt)
	      && gimple_call_lhs (stmt)
	      && (fndecl = gimple_call_fndecl (stmt))
	      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
	    {
	      tree arg, arg0, arg1, result;
	      HOST_WIDE_INT n;
	      location_t loc;

	      switch (DECL_FUNCTION_CODE (fndecl))
		{
		CASE_FLT_FN (BUILT_IN_COS):
		CASE_FLT_FN (BUILT_IN_SIN):
		CASE_FLT_FN (BUILT_IN_CEXPI):
		  /* Make sure we have either sincos or cexp.  */
		  if (!targetm.libc_has_function (function_c99_math_complex)
		      && !targetm.libc_has_function (function_sincos))
		    break;

		  arg = gimple_call_arg (stmt, 0);
		  if (TREE_CODE (arg) == SSA_NAME)
		    cfg_changed |= execute_cse_sincos_1 (arg);
		  break;

		CASE_FLT_FN (BUILT_IN_POW):
		  arg0 = gimple_call_arg (stmt, 0);
		  arg1 = gimple_call_arg (stmt, 1);

		  loc = gimple_location (stmt);
		  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);

		  if (result)
		    {
		      tree lhs = gimple_get_lhs (stmt);
		      gassign *new_stmt = gimple_build_assign (lhs, result);
		      gimple_set_location (new_stmt, loc);
		      unlink_stmt_vdef (stmt);
		      gsi_replace (&gsi, new_stmt, true);
		      cleanup_eh = true;
		      if (gimple_vdef (stmt))
			release_ssa_name (gimple_vdef (stmt));
		    }
		  break;

		CASE_FLT_FN (BUILT_IN_POWI):
		  arg0 = gimple_call_arg (stmt, 0);
		  arg1 = gimple_call_arg (stmt, 1);
		  loc = gimple_location (stmt);

		  /* powi(-1, n) is 1 for even n and -1 for odd n; select
		     between the two based on the low bit of N.  */
		  if (real_minus_onep (arg0))
		    {
		      tree t0, t1, cond, one, minus_one;
		      gassign *stmt;

		      t0 = TREE_TYPE (arg0);
		      t1 = TREE_TYPE (arg1);
		      one = build_real (t0, dconst1);
		      minus_one = build_real (t0, dconstm1);

		      cond = make_temp_ssa_name (t1, NULL, "powi_cond");
		      stmt = gimple_build_assign (cond, BIT_AND_EXPR,
						  arg1, build_int_cst (t1, 1));
		      gimple_set_location (stmt, loc);
		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);

		      result = make_temp_ssa_name (t0, NULL, "powi");
		      stmt = gimple_build_assign (result, COND_EXPR, cond,
						  minus_one, one);
		      gimple_set_location (stmt, loc);
		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
		    }
		  else
		    {
		      if (!tree_fits_shwi_p (arg1))
			break;

		      n = tree_to_shwi (arg1);
		      result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
		    }

		  if (result)
		    {
		      tree lhs = gimple_get_lhs (stmt);
		      gassign *new_stmt = gimple_build_assign (lhs, result);
		      gimple_set_location (new_stmt, loc);
		      unlink_stmt_vdef (stmt);
		      gsi_replace (&gsi, new_stmt, true);
		      cleanup_eh = true;
		      if (gimple_vdef (stmt))
			release_ssa_name (gimple_vdef (stmt));
		    }
		  break;

		CASE_FLT_FN (BUILT_IN_CABS):
		  arg0 = gimple_call_arg (stmt, 0);
		  loc = gimple_location (stmt);
		  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);

		  if (result)
		    {
		      tree lhs = gimple_get_lhs (stmt);
		      gassign *new_stmt = gimple_build_assign (lhs, result);
		      gimple_set_location (new_stmt, loc);
		      unlink_stmt_vdef (stmt);
		      gsi_replace (&gsi, new_stmt, true);
		      cleanup_eh = true;
		      if (gimple_vdef (stmt))
			release_ssa_name (gimple_vdef (stmt));
		    }
		  break;

		default:;
		}
	    }
	}
      if (cleanup_eh)
	cfg_changed |= gimple_purge_dead_eh_edges (bb);
    }

  statistics_counter_event (fun, "sincos statements inserted",
			    sincos_stats.inserted);

  free_dominance_info (CDI_DOMINATORS);
  return cfg_changed ? TODO_cleanup_cfg : 0;
}
/* Factory used by the pass manager to create the sincos pass.  */
gimple_opt_pass *
make_pass_cse_sincos (gcc::context *ctxt)
{
  return new pass_cse_sincos (ctxt);
}
/* A symbolic number is used to detect byte permutation and selection
   patterns.  Therefore the field N contains an artificial number
   consisting of octet sized markers:

   0    - target byte has the value 0
   FF   - target byte has an unknown value (eg. due to sign extension)
   1..size - marker value is the target byte index minus one.

   To detect permutations on memory sources (arrays and structures), a symbolic
   number is also associated a base address (the array or structure the load is
   made from), an offset from the base address and a range which gives the
   difference between the highest and lowest accessed memory location to make
   such a symbolic number.  The range is thus different from size which reflects
   the size of the type of current expression.  Note that for non memory source,
   range holds the same value as size.

   For instance, for an array char a[], (short) a[0] | (short) a[3] would have
   a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
   still have a size of 2 but this time a range of 1.  */

struct symbolic_number {
  /* The octet-marker encoding described above.  */
  uint64_t n;
  /* Type of the expression the number currently describes.  */
  tree type;
  /* For memory sources: the object loaded from, or NULL_TREE.  */
  tree base_addr;
  /* Variable offset from BASE_ADDR, or NULL_TREE.  */
  tree offset;
  /* Constant byte offset from BASE_ADDR.  */
  HOST_WIDE_INT bytepos;
  /* Alias pointer type for the access.  */
  tree alias_set;
  /* Virtual use of the load statement.  */
  tree vuse;
  /* Span in bytes between lowest and highest accessed locations.  */
  unsigned HOST_WIDE_INT range;
};
1934 #define BITS_PER_MARKER 8
1935 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1936 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1937 #define HEAD_MARKER(n, size) \
1938 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1940 /* The number which the find_bswap_or_nop_1 result should match in
1941 order to have a nop. The number is masked according to the size of
1942 the symbolic number before using it. */
1943 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1944 (uint64_t)0x08070605 << 32 | 0x04030201)
1946 /* The number which the find_bswap_or_nop_1 result should match in
1947 order to have a byte swap. The number is masked according to the
1948 size of the symbolic number before using it. */
1949 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1950 (uint64_t)0x01020304 << 32 | 0x05060708)
/* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
   number N.  Return false if the requested operation is not permitted
   on a symbolic number.  */

static bool
do_shift_rotate (enum tree_code code,
		 struct symbolic_number *n,
		 int count)
{
  int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
  unsigned head_marker;

  /* Only whole-byte shifts keep the per-byte markers meaningful.  */
  if (count % BITS_PER_UNIT != 0)
    return false;
  /* Rescale the bit count into units of marker width.  */
  count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;

  /* Zero out the extra bits of N in order to avoid them being shifted
     into the significant bits.  */
  if (size < 64 / BITS_PER_MARKER)
    n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;

  switch (code)
    {
    case LSHIFT_EXPR:
      n->n <<= count;
      break;
    case RSHIFT_EXPR:
      head_marker = HEAD_MARKER (n->n, size);
      n->n >>= count;
      /* Arithmetic shift of signed type: result is dependent on the value.  */
      if (!TYPE_UNSIGNED (n->type) && head_marker)
	for (i = 0; i < count / BITS_PER_MARKER; i++)
	  n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
		  << ((size - 1 - i) * BITS_PER_MARKER);
      break;
    case LROTATE_EXPR:
      n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
      break;
    case RROTATE_EXPR:
      n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
      break;
    default:
      return false;
    }
  /* Zero unused bits for size.  */
  if (size < 64 / BITS_PER_MARKER)
    n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
  return true;
}
/* Perform sanity checking for the symbolic number N and the gimple
   statement STMT: the type of STMT's result must be an integer type
   whose precision matches the precision tracked in N.  */

static bool
verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
{
  tree lhs_type;

  lhs_type = gimple_expr_type (stmt);

  if (TREE_CODE (lhs_type) != INTEGER_TYPE)
    return false;

  /* A precision mismatch would mean the marker encoding no longer
     lines up with the bytes of the value being computed.  */
  if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
    return false;

  return true;
}
/* Initialize the symbolic number N for the bswap pass from the base element
   SRC manipulated by the bitwise OR expression.  Returns false when the
   type of SRC cannot be tracked (not a whole number of bytes, or wider
   than the 64-bit marker encoding allows).  */

static bool
init_symbolic_number (struct symbolic_number *n, tree src)
{
  int size;

  /* No memory source known yet.  */
  n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;

  /* Set up the symbolic number N by setting each byte to a value between 1 and
     the byte size of rhs1.  The highest order byte is set to n->size and the
     lowest order byte to 1.  */
  n->type = TREE_TYPE (src);
  size = TYPE_PRECISION (n->type);
  if (size % BITS_PER_UNIT != 0)
    return false;
  size /= BITS_PER_UNIT;
  if (size > 64 / BITS_PER_MARKER)
    return false;
  n->range = size;
  /* CMPNOP is the identity marker pattern 0x...04030201.  */
  n->n = CMPNOP;

  /* Mask the encoding down to the bytes SIZE actually covers.  */
  if (size < 64 / BITS_PER_MARKER)
    n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;

  return true;
}
/* Check if STMT might be a byte swap or a nop from a memory source and returns
   the answer.  If so, REF is that memory source and the base of the memory area
   accessed and the offset of the access from that base are recorded in N.  */

static bool
find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
{
  /* Leaf node is an array or component ref.  Memorize its base and
     offset from base to compare to other such leaf node.  */
  HOST_WIDE_INT bitsize, bitpos;
  machine_mode mode;
  int unsignedp, reversep, volatilep;
  tree offset, base_addr;

  /* Not prepared to handle PDP endian.  */
  if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
    return false;

  if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
    return false;

  base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
				   &unsignedp, &reversep, &volatilep, false);

  if (TREE_CODE (base_addr) == MEM_REF)
    {
      offset_int bit_offset = 0;
      tree off = TREE_OPERAND (base_addr, 1);

      /* Fold the MEM_REF's constant byte offset into BIT_OFFSET.  */
      if (!integer_zerop (off))
	{
	  offset_int boff, coff = mem_ref_offset (base_addr);
	  boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
	  bit_offset += boff;
	}

      base_addr = TREE_OPERAND (base_addr, 0);

      /* Avoid returning a negative bitpos as this may wreak havoc later.  */
      if (wi::neg_p (bit_offset))
	{
	  offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
	  offset_int tem = bit_offset.and_not (mask);
	  /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
	     Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
	  bit_offset -= tem;
	  tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
	  if (offset)
	    offset = size_binop (PLUS_EXPR, offset,
				 wide_int_to_tree (sizetype, tem));
	  else
	    offset = wide_int_to_tree (sizetype, tem);
	}

      bitpos += bit_offset.to_shwi ();
    }

  /* Only byte-aligned, byte-sized accesses can be tracked.  */
  if (bitpos % BITS_PER_UNIT)
    return false;
  if (bitsize % BITS_PER_UNIT)
    return false;
  if (reversep)
    return false;

  if (!init_symbolic_number (n, ref))
    return false;
  n->base_addr = base_addr;
  n->offset = offset;
  n->bytepos = bitpos / BITS_PER_UNIT;
  n->alias_set = reference_alias_ptr_type (ref);
  n->vuse = gimple_vuse (stmt);
  return true;
}
/* Compute the symbolic number N representing the result of a bitwise OR on 2
   symbolic number N1 and N2 whose source statements are respectively
   SOURCE_STMT1 and SOURCE_STMT2.  Returns the source statement describing
   the merged value, or NULL when the two numbers cannot be combined.  */

static gimple
perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
			gimple source_stmt2, struct symbolic_number *n2,
			struct symbolic_number *n)
{
  int i, size;
  uint64_t mask;
  gimple source_stmt;
  struct symbolic_number *n_start;

  /* Sources are different, cancel bswap if they are not memory location with
     the same base (array, structure, ...).  */
  if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
    {
      uint64_t inc;
      HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
      struct symbolic_number *toinc_n_ptr, *n_end;

      if (!n1->base_addr || !n2->base_addr
	  || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
	return NULL;

      if (!n1->offset != !n2->offset
	  || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
	return NULL;

      /* N_START is the number whose load is at the lower address.  */
      if (n1->bytepos < n2->bytepos)
	{
	  n_start = n1;
	  start_sub = n2->bytepos - n1->bytepos;
	  source_stmt = source_stmt1;
	}
      else
	{
	  n_start = n2;
	  start_sub = n1->bytepos - n2->bytepos;
	  source_stmt = source_stmt2;
	}

      /* Find the highest address at which a load is performed and
	 compute related info.  */
      end1 = n1->bytepos + (n1->range - 1);
      end2 = n2->bytepos + (n2->range - 1);
      if (end1 < end2)
	{
	  end = end2;
	  end_sub = end2 - end1;
	}
      else
	{
	  end = end1;
	  end_sub = end1 - end2;
	}
      n_end = (end2 > end1) ? n2 : n1;

      /* Find symbolic number whose lsb is the most significant.  */
      if (BYTES_BIG_ENDIAN)
	toinc_n_ptr = (n_end == n1) ? n2 : n1;
      else
	toinc_n_ptr = (n_start == n1) ? n2 : n1;

      n->range = end - n_start->bytepos + 1;

      /* Check that the range of memory covered can be represented by
	 a symbolic number.  */
      if (n->range > 64 / BITS_PER_MARKER)
	return NULL;

      /* Reinterpret byte marks in symbolic number holding the value of
	 bigger weight according to target endianness.  */
      inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
      size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
      for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
	{
	  uint64_t marker
	    = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
	  /* Only renumber real byte markers; 0 and UNKNOWN stay put.  */
	  if (marker && marker != MARKER_BYTE_UNKNOWN)
	    toinc_n_ptr->n += inc;
	}
    }
  else
    {
      /* Same source value: the merged number covers the same range.  */
      n->range = n1->range;
      n_start = n1;
      source_stmt = source_stmt1;
    }

  /* Pick a common alias set, degrading to ptr_type_node when the two
     accesses disagree.  */
  if (!n1->alias_set
      || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
    n->alias_set = n1->alias_set;
  else
    n->alias_set = ptr_type_node;
  n->vuse = n_start->vuse;
  n->base_addr = n_start->base_addr;
  n->offset = n_start->offset;
  n->bytepos = n_start->bytepos;
  n->type = n_start->type;
  size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;

  /* The merge fails if the two numbers assert different, non-zero
     markers for the same target byte.  */
  for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
    {
      uint64_t masked1, masked2;

      masked1 = n1->n & mask;
      masked2 = n2->n & mask;
      if (masked1 && masked2 && masked1 != masked2)
	return NULL;
    }
  n->n = n1->n | n2->n;

  return source_stmt;
}
/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
   the operation given by the rhs of STMT on the result.  If the operation
   could successfully be executed the function returns a gimple stmt whose
   rhs's first tree is the expression of the source operand and NULL
   otherwise.  LIMIT bounds the recursion depth.  */

static gimple
find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
{
  enum tree_code code;
  tree rhs1, rhs2 = NULL;
  gimple rhs1_stmt, rhs2_stmt, source_stmt1;
  enum gimple_rhs_class rhs_class;

  if (!limit || !is_gimple_assign (stmt))
    return NULL;

  rhs1 = gimple_assign_rhs1 (stmt);

  /* A direct load from memory is a valid leaf.  */
  if (find_bswap_or_nop_load (stmt, rhs1, n))
    return stmt;

  if (TREE_CODE (rhs1) != SSA_NAME)
    return NULL;

  code = gimple_assign_rhs_code (stmt);
  rhs_class = gimple_assign_rhs_class (stmt);
  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);

  if (rhs_class == GIMPLE_BINARY_RHS)
    rhs2 = gimple_assign_rhs2 (stmt);

  /* Handle unary rhs and binary rhs with integer constants as second
     operand.  */

  if (rhs_class == GIMPLE_UNARY_RHS
      || (rhs_class == GIMPLE_BINARY_RHS
	  && TREE_CODE (rhs2) == INTEGER_CST))
    {
      if (code != BIT_AND_EXPR
	  && code != LSHIFT_EXPR
	  && code != RSHIFT_EXPR
	  && code != LROTATE_EXPR
	  && code != RROTATE_EXPR
	  && !CONVERT_EXPR_CODE_P (code))
	return NULL;

      source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);

      /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
	 we have to initialize the symbolic number.  */
      if (!source_stmt1)
	{
	  if (gimple_assign_load_p (stmt)
	      || !init_symbolic_number (n, rhs1))
	    return NULL;
	  source_stmt1 = stmt;
	}

      switch (code)
	{
	case BIT_AND_EXPR:
	  {
	    int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
	    uint64_t val = int_cst_value (rhs2), mask = 0;
	    uint64_t tmp = (1 << BITS_PER_UNIT) - 1;

	    /* Only constants masking full bytes are allowed.  */
	    for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
	      if ((val & tmp) != 0 && (val & tmp) != tmp)
		return NULL;
	      else if (val & tmp)
		mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);

	    n->n &= mask;
	  }
	  break;
	case LSHIFT_EXPR:
	case RSHIFT_EXPR:
	case LROTATE_EXPR:
	case RROTATE_EXPR:
	  if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
	    return NULL;
	  break;
	CASE_CONVERT:
	  {
	    int i, type_size, old_type_size;
	    tree type;

	    type = gimple_expr_type (stmt);
	    type_size = TYPE_PRECISION (type);
	    if (type_size % BITS_PER_UNIT != 0)
	      return NULL;
	    type_size /= BITS_PER_UNIT;
	    if (type_size > 64 / BITS_PER_MARKER)
	      return NULL;

	    /* Sign extension: result is dependent on the value.  */
	    old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
	    if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
		&& HEAD_MARKER (n->n, old_type_size))
	      for (i = 0; i < type_size - old_type_size; i++)
		n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
			<< ((type_size - 1 - i) * BITS_PER_MARKER);

	    if (type_size < 64 / BITS_PER_MARKER)
	      {
		/* If STMT casts to a smaller type mask out the bits not
		   belonging to the target type.  */
		n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
	      }
	    n->type = type;
	    if (!n->base_addr)
	      n->range = type_size;
	  }
	  break;
	default:
	  return NULL;
	};
      return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
    }

  /* Handle binary rhs.  */

  if (rhs_class == GIMPLE_BINARY_RHS)
    {
      struct symbolic_number n1, n2;
      gimple source_stmt, source_stmt2;

      /* Only bitwise OR combines two partial symbolic numbers.  */
      if (code != BIT_IOR_EXPR)
	return NULL;

      if (TREE_CODE (rhs2) != SSA_NAME)
	return NULL;

      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);

      switch (code)
	{
	case BIT_IOR_EXPR:
	  source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);

	  if (!source_stmt1)
	    return NULL;

	  source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);

	  if (!source_stmt2)
	    return NULL;

	  if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
	    return NULL;

	  /* Both operands must observe the same memory state.  */
	  if (!n1.vuse != !n2.vuse
	      || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
	    return NULL;

	  source_stmt
	    = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);

	  if (!source_stmt)
	    return NULL;

	  if (!verify_symbolic_number_p (n, stmt))
	    return NULL;

	  break;
	default:
	  return NULL;
	}
      return source_stmt;
    }
  return NULL;
}
2416 /* Check if STMT completes a bswap implementation or a read in a given
2417 endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2418 accordingly. It also sets N to represent the kind of operations
2419 performed: size of the resulting expression and whether it works on
2420 a memory source, and if so alias-set and vuse. At last, the
2421 function returns a stmt whose rhs's first tree is the source
2425 find_bswap_or_nop (gimple stmt
, struct symbolic_number
*n
, bool *bswap
)
2427 /* The number which the find_bswap_or_nop_1 result should match in order
2428 to have a full byte swap. The number is shifted to the right
2429 according to the size of the symbolic number before using it. */
/* CMPXCHG/CMPNOP are the reference marker patterns: a fully byte-swapped
   symbolic number and an identity (unchanged) one, respectively.  */
2430 uint64_t cmpxchg
= CMPXCHG
;
2431 uint64_t cmpnop
= CMPNOP
;
2436 /* The last parameter determines the depth search limit. It usually
2437 correlates directly to the number n of bytes to be touched. We
2438 increase that number by log2(n) + 1 here in order to also
2439 cover signed -> unsigned conversions of the src operand as can be seen
2440 in libgcc, and for initial shift/and operation of the src operand. */
2441 limit
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt
)));
2442 limit
+= 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT
) limit
);
/* Recursively walk the defining statements to build the symbolic number.  */
2443 source_stmt
= find_bswap_or_nop_1 (stmt
, n
, limit
);
2448 /* Find real size of result (highest non-zero byte). */
/* Each byte of the value occupies BITS_PER_MARKER bits of N->n; count the
   populated markers to get the result size in bytes.  */
2454 for (tmpn
= n
->n
, rsize
= 0; tmpn
; tmpn
>>= BITS_PER_MARKER
, rsize
++);
2458 /* Zero out the extra bits of N and CMP*. */
2459 if (n
->range
< (int) sizeof (int64_t))
2463 mask
= ((uint64_t) 1 << (n
->range
* BITS_PER_MARKER
)) - 1;
/* Shift the reference bswap pattern down so only N->range bytes remain.  */
2464 cmpxchg
>>= (64 / BITS_PER_MARKER
- n
->range
) * BITS_PER_MARKER
;
2468 /* A complete byte swap should make the symbolic number to start with
2469 the largest digit in the highest order byte. Unchanged symbolic
2470 number indicates a read with same endianness as target architecture. */
2473 else if (n
->n
== cmpxchg
)
2478 /* Useless bit manipulation performed by code. */
2479 if (!n
->base_addr
&& n
->n
== cmpnop
)
/* From here on N->range is expressed in bits rather than bytes.  */
2482 n
->range
*= BITS_PER_UNIT
;
/* Pass metadata for the bswap pass: a GIMPLE pass that requires SSA form
   and reports no extra TODO flags.  */
2488 const pass_data pass_data_optimize_bswap
=
2490 GIMPLE_PASS
, /* type */
2492 OPTGROUP_NONE
, /* optinfo_flags */
2493 TV_NONE
, /* tv_id */
2494 PROP_ssa
, /* properties_required */
2495 0, /* properties_provided */
2496 0, /* properties_destroyed */
2497 0, /* todo_flags_start */
2498 0, /* todo_flags_finish */
/* Pass wrapper class for the bswap optimization.  */
2501 class pass_optimize_bswap
: public gimple_opt_pass
2504 pass_optimize_bswap (gcc::context
*ctxt
)
2505 : gimple_opt_pass (pass_data_optimize_bswap
, ctxt
)
2508 /* opt_pass methods: */
/* Run only when optimizing and -fexpensive-optimizations is enabled.  */
2509 virtual bool gate (function
*)
2511 return flag_expensive_optimizations
&& optimize
;
2514 virtual unsigned int execute (function
*);
2516 }; // class pass_optimize_bswap
2518 /* Perform the bswap optimization: replace the expression computed in the rhs
2519 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2520 Which of these alternatives replace the rhs is given by N->base_addr (non
2521 null if a load is needed) and BSWAP. The type, VUSE and set-alias of the
2522 load to perform are also given in N while the builtin bswap invoke is given
2523 in FNDEL. Finally, if a load is involved, SRC_STMT refers to one of the
2524 load statements involved to construct the rhs in CUR_STMT and N->range gives
2525 the size of the rhs expression for maintaining some statistics.
2527 Note that if the replacement involve a load, CUR_STMT is moved just after
2528 SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
2529 changing of basic block. */
2532 bswap_replace (gimple cur_stmt
, gimple src_stmt
, tree fndecl
, tree bswap_type
,
2533 tree load_type
, struct symbolic_number
*n
, bool bswap
)
2535 gimple_stmt_iterator gsi
;
2539 gsi
= gsi_for_stmt (cur_stmt
);
2540 src
= gimple_assign_rhs1 (src_stmt
);
2541 tgt
= gimple_assign_lhs (cur_stmt
);
2543 /* Need to load the value from memory first. */
2546 gimple_stmt_iterator gsi_ins
= gsi_for_stmt (src_stmt
);
2547 tree addr_expr
, addr_tmp
, val_expr
, val_tmp
;
2548 tree load_offset_ptr
, aligned_load_type
;
2549 gimple addr_stmt
, load_stmt
;
2551 HOST_WIDE_INT load_offset
= 0;
2553 align
= get_object_alignment (src
);
2554 /* If the new access is smaller than the original one, we need
2555 to perform big endian adjustment. */
2556 if (BYTES_BIG_ENDIAN
)
2558 HOST_WIDE_INT bitsize
, bitpos
;
2560 int unsignedp
, reversep
, volatilep
;
2563 get_inner_reference (src
, &bitsize
, &bitpos
, &offset
, &mode
,
2564 &unsignedp
, &reversep
, &volatilep
, false);
/* Big-endian: a narrower load must be offset towards the least
   significant (highest-addressed) bytes of the original access.  */
2565 if (n
->range
< (unsigned HOST_WIDE_INT
) bitsize
)
2567 load_offset
= (bitsize
- n
->range
) / BITS_PER_UNIT
;
2568 unsigned HOST_WIDE_INT l
2569 = (load_offset
* BITS_PER_UNIT
) & (align
- 1);
/* Bail out if the adjusted access would be misaligned and unaligned
   accesses in this mode are slow on the target.  */
2576 && align
< GET_MODE_ALIGNMENT (TYPE_MODE (load_type
))
2577 && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type
), align
))
2580 /* Move cur_stmt just before one of the load of the original
2581 to ensure it has the same VUSE. See PR61517 for what could
2583 gsi_move_before (&gsi
, &gsi_ins
);
2584 gsi
= gsi_for_stmt (cur_stmt
);
2586 /* Compute address to load from and cast according to the size
2588 addr_expr
= build_fold_addr_expr (unshare_expr (src
));
2589 if (is_gimple_mem_ref_addr (addr_expr
))
2590 addr_tmp
= addr_expr
;
/* The address is not a valid MEM_REF operand: materialize it in a fresh
   SSA temporary inserted before CUR_STMT.  */
2593 addr_tmp
= make_temp_ssa_name (TREE_TYPE (addr_expr
), NULL
,
2595 addr_stmt
= gimple_build_assign (addr_tmp
, addr_expr
);
2596 gsi_insert_before (&gsi
, addr_stmt
, GSI_SAME_STMT
);
2599 /* Perform the load. */
2600 aligned_load_type
= load_type
;
2601 if (align
< TYPE_ALIGN (load_type
))
2602 aligned_load_type
= build_aligned_type (load_type
, align
);
2603 load_offset_ptr
= build_int_cst (n
->alias_set
, load_offset
);
2604 val_expr
= fold_build2 (MEM_REF
, aligned_load_type
, addr_tmp
,
/* Statistics: record the width of the endian-native (nop) load found.  */
2610 nop_stats
.found_16bit
++;
2611 else if (n
->range
== 32)
2612 nop_stats
.found_32bit
++;
2615 gcc_assert (n
->range
== 64);
2616 nop_stats
.found_64bit
++;
2619 /* Convert the result of load if necessary. */
2620 if (!useless_type_conversion_p (TREE_TYPE (tgt
), load_type
))
2622 val_tmp
= make_temp_ssa_name (aligned_load_type
, NULL
,
2624 load_stmt
= gimple_build_assign (val_tmp
, val_expr
);
2625 gimple_set_vuse (load_stmt
, n
->vuse
);
2626 gsi_insert_before (&gsi
, load_stmt
, GSI_SAME_STMT
);
2627 gimple_assign_set_rhs_with_ops (&gsi
, NOP_EXPR
, val_tmp
);
/* Types match: rewrite CUR_STMT's rhs to the load directly.  */
2631 gimple_assign_set_rhs_with_ops (&gsi
, MEM_REF
, val_expr
);
2632 gimple_set_vuse (cur_stmt
, n
->vuse
);
2634 update_stmt (cur_stmt
);
2639 "%d bit load in target endianness found at: ",
2641 print_gimple_stmt (dump_file
, cur_stmt
, 0, 0);
/* Load + bswap case: emit the load into a temporary first.  */
2647 val_tmp
= make_temp_ssa_name (aligned_load_type
, NULL
, "load_dst");
2648 load_stmt
= gimple_build_assign (val_tmp
, val_expr
);
2649 gimple_set_vuse (load_stmt
, n
->vuse
);
2650 gsi_insert_before (&gsi
, load_stmt
, GSI_SAME_STMT
);
/* Statistics: record the width of the byte swap found.  */
2656 bswap_stats
.found_16bit
++;
2657 else if (n
->range
== 32)
2658 bswap_stats
.found_32bit
++;
2661 gcc_assert (n
->range
== 64);
2662 bswap_stats
.found_64bit
++;
2667 /* Convert the src expression if necessary. */
2668 if (!useless_type_conversion_p (TREE_TYPE (tmp
), bswap_type
))
2670 gimple convert_stmt
;
2672 tmp
= make_temp_ssa_name (bswap_type
, NULL
, "bswapsrc");
2673 convert_stmt
= gimple_build_assign (tmp
, NOP_EXPR
, src
);
2674 gsi_insert_before (&gsi
, convert_stmt
, GSI_SAME_STMT
);
2677 /* Canonical form for 16 bit bswap is a rotate expression. Only 16bit values
2678 are considered as rotation of 2N bit values by N bits is generally not
2679 equivalent to a bswap. Consider for instance 0x01020304 r>> 16 which
2680 gives 0x03040102 while a bswap for that value is 0x04030201. */
2681 if (bswap
&& n
->range
== 16)
2683 tree count
= build_int_cst (NULL
, BITS_PER_UNIT
);
2684 src
= fold_build2 (LROTATE_EXPR
, bswap_type
, tmp
, count
);
2685 bswap_stmt
= gimple_build_assign (NULL
, src
);
/* Wider than 16 bits: call the target's __builtin_bswapNN.  */
2688 bswap_stmt
= gimple_build_call (fndecl
, 1, tmp
);
2692 /* Convert the result if necessary. */
2693 if (!useless_type_conversion_p (TREE_TYPE (tgt
), bswap_type
))
2695 gimple convert_stmt
;
2697 tmp
= make_temp_ssa_name (bswap_type
, NULL
, "bswapdst");
2698 convert_stmt
= gimple_build_assign (tgt
, NOP_EXPR
, tmp
);
2699 gsi_insert_after (&gsi
, convert_stmt
, GSI_SAME_STMT
);
2702 gimple_set_lhs (bswap_stmt
, tmp
);
2706 fprintf (dump_file
, "%d bit bswap implementation found at: ",
2708 print_gimple_stmt (dump_file
, cur_stmt
, 0, 0);
/* Install the new statement and delete the original one.  */
2711 gsi_insert_after (&gsi
, bswap_stmt
, GSI_SAME_STMT
);
2712 gsi_remove (&gsi
, true);
2716 /* Find manual byte swap implementations as well as load in a given
2717 endianness. Byte swaps are turned into a bswap builtin invokation
2718 while endian loads are converted to bswap builtin invokation or
2719 simple load according to the target endianness. */
2722 pass_optimize_bswap::execute (function
*fun
)
2725 bool bswap32_p
, bswap64_p
;
2726 bool changed
= false;
2727 tree bswap32_type
= NULL_TREE
, bswap64_type
= NULL_TREE
;
/* The byte-marker bookkeeping in this pass assumes 8-bit bytes.  */
2729 if (BITS_PER_UNIT
!= 8)
/* Check which bswap builtins the target can expand to real instructions.
   64-bit bswap is also usable as two 32-bit swaps when the word mode is
   SImode.  */
2732 bswap32_p
= (builtin_decl_explicit_p (BUILT_IN_BSWAP32
)
2733 && optab_handler (bswap_optab
, SImode
) != CODE_FOR_nothing
);
2734 bswap64_p
= (builtin_decl_explicit_p (BUILT_IN_BSWAP64
)
2735 && (optab_handler (bswap_optab
, DImode
) != CODE_FOR_nothing
2736 || (bswap32_p
&& word_mode
== SImode
)));
2738 /* Determine the argument type of the builtins. The code later on
2739 assumes that the return and argument type are the same. */
2742 tree fndecl
= builtin_decl_explicit (BUILT_IN_BSWAP32
)
;
2743 bswap32_type
= TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl
)));
2748 tree fndecl
= builtin_decl_explicit (BUILT_IN_BSWAP64
);
2749 bswap64_type
= TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl
)));
/* Reset the per-invocation statistics counters.  */
2752 memset (&nop_stats
, 0, sizeof (nop_stats
));
2753 memset (&bswap_stats
, 0, sizeof (bswap_stats
));
2755 FOR_EACH_BB_FN (bb
, fun
)
2757 gimple_stmt_iterator gsi
;
2759 /* We do a reverse scan for bswap patterns to make sure we get the
2760 widest match. As bswap pattern matching doesn't handle previously
2761 inserted smaller bswap replacements as sub-patterns, the wider
2762 variant wouldn't be detected. */
2763 for (gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
);)
2765 gimple src_stmt
, cur_stmt
= gsi_stmt (gsi
);
2766 tree fndecl
= NULL_TREE
, bswap_type
= NULL_TREE
, load_type
;
2767 enum tree_code code
;
2768 struct symbolic_number n
;
2771 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2772 might be moved to a different basic block by bswap_replace and gsi
2773 must not points to it if that's the case. Moving the gsi_prev
2774 there make sure that gsi points to the statement previous to
2775 cur_stmt while still making sure that all statements are
2776 considered in this basic block. */
2779 if (!is_gimple_assign (cur_stmt
))
2782 code
= gimple_assign_rhs_code (cur_stmt
);
/* For rotates, the count must be a known constant.  */
2787 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt
))
2788 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt
))
/* Try to recognize a bswap or an endian-native load rooted here.  */
2798 src_stmt
= find_bswap_or_nop (cur_stmt
, &n
, &bswap
);
/* Dispatch on the recognized width to pick load type, builtin decl and
   builtin argument type.  */
2806 /* Already in canonical form, nothing to do. */
2807 if (code
== LROTATE_EXPR
|| code
== RROTATE_EXPR
)
2809 load_type
= bswap_type
= uint16_type_node
;
2812 load_type
= uint32_type_node
;
2815 fndecl
= builtin_decl_explicit (BUILT_IN_BSWAP32
);
2816 bswap_type
= bswap32_type
;
2820 load_type
= uint64_type_node
;
2823 fndecl
= builtin_decl_explicit (BUILT_IN_BSWAP64
);
2824 bswap_type
= bswap64_type
;
/* A bswap wider than 16 bits needs a builtin; 16-bit bswap is emitted
   as a rotate and needs none.  */
2831 if (bswap
&& !fndecl
&& n
.range
!= 16)
2834 if (bswap_replace (cur_stmt
, src_stmt
, fndecl
, bswap_type
, load_type
,
/* Report the collected statistics for this function.  */
2840 statistics_counter_event (fun
, "16-bit nop implementations found",
2841 nop_stats
.found_16bit
);
2842 statistics_counter_event (fun
, "32-bit nop implementations found",
2843 nop_stats
.found_32bit
);
2844 statistics_counter_event (fun
, "64-bit nop implementations found",
2845 nop_stats
.found_64bit
);
2846 statistics_counter_event (fun
, "16-bit bswap implementations found",
2847 bswap_stats
.found_16bit
);
2848 statistics_counter_event (fun
, "32-bit bswap implementations found",
2849 bswap_stats
.found_32bit
);
2850 statistics_counter_event (fun
, "64-bit bswap implementations found",
2851 bswap_stats
.found_64bit
);
2853 return (changed
? TODO_update_ssa
: 0);
/* Factory function: create a fresh instance of the bswap pass for the
   compiler context CTXT (used by the pass manager).  */
2859 make_pass_optimize_bswap (gcc::context
*ctxt
)
2861 return new pass_optimize_bswap (ctxt
);
2864 /* Return true if stmt is a type conversion operation that can be stripped
2865 when used in a widening multiply operation. */
2867 widening_mult_conversion_strippable_p (tree result_type
, gimple stmt
)
2869 enum tree_code rhs_code
= gimple_assign_rhs_code (stmt
);
2871 if (TREE_CODE (result_type
) == INTEGER_TYPE
)
/* Only plain conversions are candidates for stripping here.  */
2876 if (!CONVERT_EXPR_CODE_P (rhs_code
))
2879 op_type
= TREE_TYPE (gimple_assign_lhs (stmt
));
2881 /* If the type of OP has the same precision as the result, then
2882 we can strip this conversion. The multiply operation will be
2883 selected to create the correct extension as a by-product. */
2884 if (TYPE_PRECISION (result_type
) == TYPE_PRECISION (op_type
))
2887 /* We can also strip a conversion if it preserves the signed-ness of
2888 the operation and doesn't narrow the range. */
2889 inner_op_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
2891 /* If the inner-most type is unsigned, then we can strip any
2892 intermediate widening operation. If it's signed, then the
2893 intermediate widening operation must also be signed. */
2894 if ((TYPE_UNSIGNED (inner_op_type
)
2895 || TYPE_UNSIGNED (op_type
) == TYPE_UNSIGNED (inner_op_type
))
2896 && TYPE_PRECISION (op_type
) > TYPE_PRECISION (inner_op_type
))
/* Non-integer result type: only a fixed-point conversion can be
   stripped.  */
2902 return rhs_code
== FIXED_CONVERT_EXPR
;
2905 /* Return true if RHS is a suitable operand for a widening multiplication,
2906 assuming a target type of TYPE.
2907 There are two cases:
2909 - RHS makes some value at least twice as wide. Store that value
2910 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2912 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2913 but leave *TYPE_OUT untouched. */
2916 is_widening_mult_rhs_p (tree type
, tree rhs
, tree
*type_out
,
2922 if (TREE_CODE (rhs
) == SSA_NAME
)
/* Look through the defining statement of RHS for a strippable
   widening conversion.  */
2924 stmt
= SSA_NAME_DEF_STMT (rhs
);
2925 if (is_gimple_assign (stmt
))
2927 if (! widening_mult_conversion_strippable_p (type
, stmt
))
2931 rhs1
= gimple_assign_rhs1 (stmt
);
/* A converted integer constant: return it as-is, leaving *TYPE_OUT
   untouched (see the head comment).  */
2933 if (TREE_CODE (rhs1
) == INTEGER_CST
)
2935 *new_rhs_out
= rhs1
;
2944 type1
= TREE_TYPE (rhs1
);
/* The unconverted operand must match the target type's kind and be at
   most half its precision for the multiply to be widening.  */
2946 if (TREE_CODE (type1
) != TREE_CODE (type
)
2947 || TYPE_PRECISION (type1
) * 2 > TYPE_PRECISION (type
))
2950 *new_rhs_out
= rhs1
;
/* A bare integer constant operand.  */
2955 if (TREE_CODE (rhs
) == INTEGER_CST
)
2965 /* Return true if STMT performs a widening multiplication, assuming the
2966 output type is TYPE. If so, store the unwidened types of the operands
2967 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2968 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2969 and *TYPE2_OUT would give the operands of the multiplication. */
2972 is_widening_mult_p (gimple stmt
,
2973 tree
*type1_out
, tree
*rhs1_out
,
2974 tree
*type2_out
, tree
*rhs2_out
)
2976 tree type
= TREE_TYPE (gimple_assign_lhs (stmt
));
/* Widening multiplies only exist for integer and fixed-point types.  */
2978 if (TREE_CODE (type
) != INTEGER_TYPE
2979 && TREE_CODE (type
) != FIXED_POINT_TYPE
)
2982 if (!is_widening_mult_rhs_p (type
, gimple_assign_rhs1 (stmt
), type1_out
,
2986 if (!is_widening_mult_rhs_p (type
, gimple_assign_rhs2 (stmt
), type2_out
,
/* A NULL *TYPE1_OUT/*TYPE2_OUT means the corresponding operand was an
   integer constant; give it the other operand's type if it fits.  */
2990 if (*type1_out
== NULL
)
2992 if (*type2_out
== NULL
|| !int_fits_type_p (*rhs1_out
, *type2_out
))
2994 *type1_out
= *type2_out
;
2997 if (*type2_out
== NULL
)
2999 if (!int_fits_type_p (*rhs2_out
, *type1_out
))
3001 *type2_out
= *type1_out
;
3004 /* Ensure that the larger of the two operands comes first. */
3005 if (TYPE_PRECISION (*type1_out
) < TYPE_PRECISION (*type2_out
))
3007 std::swap (*type1_out
, *type2_out
);
3008 std::swap (*rhs1_out
, *rhs2_out
);
3014 /* Process a single gimple statement STMT, which has a MULT_EXPR as
3015 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
3016 value is true iff we converted the statement. */
3019 convert_mult_to_widen (gimple stmt
, gimple_stmt_iterator
*gsi
)
3021 tree lhs
, rhs1
, rhs2
, type
, type1
, type2
;
3022 enum insn_code handler
;
3023 machine_mode to_mode
, from_mode
, actual_mode
;
3025 int actual_precision
;
3026 location_t loc
= gimple_location (stmt
);
3027 bool from_unsigned1
, from_unsigned2
;
3029 lhs
= gimple_assign_lhs (stmt
);
3030 type
= TREE_TYPE (lhs
);
3031 if (TREE_CODE (type
) != INTEGER_TYPE
)
3034 if (!is_widening_mult_p (stmt
, &type1
, &rhs1
, &type2
, &rhs2
))
3037 to_mode
= TYPE_MODE (type
);
3038 from_mode
= TYPE_MODE (type1
);
3039 from_unsigned1
= TYPE_UNSIGNED (type1
);
3040 from_unsigned2
= TYPE_UNSIGNED (type2
);
/* Pick the widening-multiply optab matching the operands' signedness:
   unsigned*unsigned, signed*signed, or mixed.  */
3042 if (from_unsigned1
&& from_unsigned2
)
3043 op
= umul_widen_optab
;
3044 else if (!from_unsigned1
&& !from_unsigned2
)
3045 op
= smul_widen_optab
;
3047 op
= usmul_widen_optab
;
3049 handler
= find_widening_optab_handler_and_mode (op
, to_mode
, from_mode
,
/* No direct support: try falling back to a signed widening multiply in
   a wider source mode.  */
3052 if (handler
== CODE_FOR_nothing
)
3054 if (op
!= smul_widen_optab
)
3056 /* We can use a signed multiply with unsigned types as long as
3057 there is a wider mode to use, or it is the smaller of the two
3058 types that is unsigned. Note that type1 >= type2, always. */
3059 if ((TYPE_UNSIGNED (type1
)
3060 && TYPE_PRECISION (type1
) == GET_MODE_PRECISION (from_mode
))
3061 || (TYPE_UNSIGNED (type2
)
3062 && TYPE_PRECISION (type2
) == GET_MODE_PRECISION (from_mode
)))
3064 from_mode
= GET_MODE_WIDER_MODE (from_mode
);
3065 if (GET_MODE_SIZE (to_mode
) <= GET_MODE_SIZE (from_mode
))
3069 op
= smul_widen_optab
;
3070 handler
= find_widening_optab_handler_and_mode (op
, to_mode
,
3074 if (handler
== CODE_FOR_nothing
)
/* The signed fallback treats both operands as signed from here on.  */
3077 from_unsigned1
= from_unsigned2
= false;
3083 /* Ensure that the inputs to the handler are in the correct precison
3084 for the opcode. This will be the full mode size. */
3085 actual_precision
= GET_MODE_PRECISION (actual_mode
);
3086 if (2 * actual_precision
> TYPE_PRECISION (type
))
3088 if (actual_precision
!= TYPE_PRECISION (type1
)
3089 || from_unsigned1
!= TYPE_UNSIGNED (type1
))
3090 rhs1
= build_and_insert_cast (gsi
, loc
,
3091 build_nonstandard_integer_type
3092 (actual_precision
, from_unsigned1
), rhs1
);
3093 if (actual_precision
!= TYPE_PRECISION (type2
)
3094 || from_unsigned2
!= TYPE_UNSIGNED (type2
))
3095 rhs2
= build_and_insert_cast (gsi
, loc
,
3096 build_nonstandard_integer_type
3097 (actual_precision
, from_unsigned2
), rhs2
);
3099 /* Handle constants. */
3100 if (TREE_CODE (rhs1
) == INTEGER_CST
)
3101 rhs1
= fold_convert (type1
, rhs1
);
3102 if (TREE_CODE (rhs2
) == INTEGER_CST
)
3103 rhs2
= fold_convert (type2
, rhs2
);
/* Rewrite STMT in place as a WIDEN_MULT_EXPR on the narrow operands.  */
3105 gimple_assign_set_rhs1 (stmt
, rhs1
);
3106 gimple_assign_set_rhs2 (stmt
, rhs2
);
3107 gimple_assign_set_rhs_code (stmt
, WIDEN_MULT_EXPR
);
3109 widen_mul_stats
.widen_mults_inserted
++;
3113 /* Process a single gimple statement STMT, which is found at the
3114 iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
3115 rhs (given by CODE), and try to convert it into a
3116 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3117 is true iff we converted the statement. */
3120 convert_plusminus_to_widen (gimple_stmt_iterator
*gsi
, gimple stmt
,
3121 enum tree_code code
)
3123 gimple rhs1_stmt
= NULL
, rhs2_stmt
= NULL
;
3124 gimple conv1_stmt
= NULL
, conv2_stmt
= NULL
, conv_stmt
;
3125 tree type
, type1
, type2
, optype
;
3126 tree lhs
, rhs1
, rhs2
, mult_rhs1
, mult_rhs2
, add_rhs
;
3127 enum tree_code rhs1_code
= ERROR_MARK
, rhs2_code
= ERROR_MARK
;
3129 enum tree_code wmult_code
;
3130 enum insn_code handler
;
3131 machine_mode to_mode
, from_mode
, actual_mode
;
3132 location_t loc
= gimple_location (stmt
);
3133 int actual_precision
;
3134 bool from_unsigned1
, from_unsigned2
;
3136 lhs
= gimple_assign_lhs (stmt
);
3137 type
= TREE_TYPE (lhs
);
3138 if (TREE_CODE (type
) != INTEGER_TYPE
3139 && TREE_CODE (type
) != FIXED_POINT_TYPE
)
3142 if (code
== MINUS_EXPR
)
3143 wmult_code
= WIDEN_MULT_MINUS_EXPR
;
3145 wmult_code
= WIDEN_MULT_PLUS_EXPR
;
/* Record the defining statements and rhs codes of both addends so we
   can look for a multiply feeding the addition/subtraction.  */
3147 rhs1
= gimple_assign_rhs1 (stmt
);
3148 rhs2
= gimple_assign_rhs2 (stmt
);
3150 if (TREE_CODE (rhs1
) == SSA_NAME
)
3152 rhs1_stmt
= SSA_NAME_DEF_STMT (rhs1
);
3153 if (is_gimple_assign (rhs1_stmt
))
3154 rhs1_code
= gimple_assign_rhs_code (rhs1_stmt
);
3157 if (TREE_CODE (rhs2
) == SSA_NAME
)
3159 rhs2_stmt
= SSA_NAME_DEF_STMT (rhs2
);
3160 if (is_gimple_assign (rhs2_stmt
))
3161 rhs2_code
= gimple_assign_rhs_code (rhs2_stmt
);
3164 /* Allow for one conversion statement between the multiply
3165 and addition/subtraction statement. If there are more than
3166 one conversions then we assume they would invalidate this
3167 transformation. If that's not the case then they should have
3168 been folded before now. */
3169 if (CONVERT_EXPR_CODE_P (rhs1_code
))
3171 conv1_stmt
= rhs1_stmt
;
3172 rhs1
= gimple_assign_rhs1 (rhs1_stmt
);
3173 if (TREE_CODE (rhs1
) == SSA_NAME
)
3175 rhs1_stmt
= SSA_NAME_DEF_STMT (rhs1
);
3176 if (is_gimple_assign (rhs1_stmt
))
3177 rhs1_code
= gimple_assign_rhs_code (rhs1_stmt
);
3182 if (CONVERT_EXPR_CODE_P (rhs2_code
))
3184 conv2_stmt
= rhs2_stmt
;
3185 rhs2
= gimple_assign_rhs1 (rhs2_stmt
);
3186 if (TREE_CODE (rhs2
) == SSA_NAME
)
3188 rhs2_stmt
= SSA_NAME_DEF_STMT (rhs2
);
3189 if (is_gimple_assign (rhs2_stmt
))
3190 rhs2_code
= gimple_assign_rhs_code (rhs2_stmt
);
3196 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3197 is_widening_mult_p, but we still need the rhs returns.
3199 It might also appear that it would be sufficient to use the existing
3200 operands of the widening multiply, but that would limit the choice of
3201 multiply-and-accumulate instructions.
3203 If the widened-multiplication result has more than one uses, it is
3204 probably wiser not to do the conversion. */
3205 if (code
== PLUS_EXPR
3206 && (rhs1_code
== MULT_EXPR
|| rhs1_code
== WIDEN_MULT_EXPR
))
3208 if (!has_single_use (rhs1
)
3209 || !is_widening_mult_p (rhs1_stmt
, &type1
, &mult_rhs1
,
3210 &type2
, &mult_rhs2
))
3213 conv_stmt
= conv1_stmt
;
/* Otherwise look for the multiply on the second addend (valid for both
   PLUS and MINUS since it is not the subtrahend's minuend).  */
3215 else if (rhs2_code
== MULT_EXPR
|| rhs2_code
== WIDEN_MULT_EXPR
)
3217 if (!has_single_use (rhs2
)
3218 || !is_widening_mult_p (rhs2_stmt
, &type1
, &mult_rhs1
,
3219 &type2
, &mult_rhs2
))
3222 conv_stmt
= conv2_stmt
;
3227 to_mode
= TYPE_MODE (type
);
3228 from_mode
= TYPE_MODE (type1
);
3229 from_unsigned1
= TYPE_UNSIGNED (type1
);
3230 from_unsigned2
= TYPE_UNSIGNED (type2
);
3233 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3234 if (from_unsigned1
!= from_unsigned2
)
3236 if (!INTEGRAL_TYPE_P (type
))
3238 /* We can use a signed multiply with unsigned types as long as
3239 there is a wider mode to use, or it is the smaller of the two
3240 types that is unsigned. Note that type1 >= type2, always. */
3242 && TYPE_PRECISION (type1
) == GET_MODE_PRECISION (from_mode
))
3244 && TYPE_PRECISION (type2
) == GET_MODE_PRECISION (from_mode
)))
3246 from_mode
= GET_MODE_WIDER_MODE (from_mode
);
3247 if (GET_MODE_SIZE (from_mode
) >= GET_MODE_SIZE (to_mode
))
/* Treat both multiply operands as signed for the fallback.  */
3251 from_unsigned1
= from_unsigned2
= false;
3252 optype
= build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode
),
3256 /* If there was a conversion between the multiply and addition
3257 then we need to make sure it fits a multiply-and-accumulate.
3258 The should be a single mode change which does not change the
3262 /* We use the original, unmodified data types for this. */
3263 tree from_type
= TREE_TYPE (gimple_assign_rhs1 (conv_stmt
));
3264 tree to_type
= TREE_TYPE (gimple_assign_lhs (conv_stmt
));
3265 int data_size
= TYPE_PRECISION (type1
) + TYPE_PRECISION (type2
);
3266 bool is_unsigned
= TYPE_UNSIGNED (type1
) && TYPE_UNSIGNED (type2
);
3268 if (TYPE_PRECISION (from_type
) > TYPE_PRECISION (to_type
))
3270 /* Conversion is a truncate. */
3271 if (TYPE_PRECISION (to_type
) < data_size
)
3274 else if (TYPE_PRECISION (from_type
) < TYPE_PRECISION (to_type
))
3276 /* Conversion is an extend. Check it's the right sort. */
3277 if (TYPE_UNSIGNED (from_type
) != is_unsigned
3278 && !(is_unsigned
&& TYPE_PRECISION (from_type
) > data_size
))
3281 /* else convert is a no-op for our purposes. */
3284 /* Verify that the machine can perform a widening multiply
3285 accumulate in this mode/signedness combination, otherwise
3286 this transformation is likely to pessimize code. */
3287 this_optab
= optab_for_tree_code (wmult_code
, optype
, optab_default
);
3288 handler
= find_widening_optab_handler_and_mode (this_optab
, to_mode
,
3289 from_mode
, 0, &actual_mode
);
3291 if (handler
== CODE_FOR_nothing
)
3294 /* Ensure that the inputs to the handler are in the correct precison
3295 for the opcode. This will be the full mode size. */
3296 actual_precision
= GET_MODE_PRECISION (actual_mode
);
3297 if (actual_precision
!= TYPE_PRECISION (type1
)
3298 || from_unsigned1
!= TYPE_UNSIGNED (type1
))
3299 mult_rhs1
= build_and_insert_cast (gsi
, loc
,
3300 build_nonstandard_integer_type
3301 (actual_precision
, from_unsigned1
),
3303 if (actual_precision
!= TYPE_PRECISION (type2
)
3304 || from_unsigned2
!= TYPE_UNSIGNED (type2
))
3305 mult_rhs2
= build_and_insert_cast (gsi
, loc
,
3306 build_nonstandard_integer_type
3307 (actual_precision
, from_unsigned2
),
/* The accumulated operand must be converted to the result type too.  */
3310 if (!useless_type_conversion_p (type
, TREE_TYPE (add_rhs
)))
3311 add_rhs
= build_and_insert_cast (gsi
, loc
, type
, add_rhs
);
3313 /* Handle constants. */
3314 if (TREE_CODE (mult_rhs1
) == INTEGER_CST
)
3315 mult_rhs1
= fold_convert (type1
, mult_rhs1
);
3316 if (TREE_CODE (mult_rhs2
) == INTEGER_CST
)
3317 mult_rhs2
= fold_convert (type2
, mult_rhs2
);
/* Rewrite STMT in place as the widening multiply-accumulate.  */
3319 gimple_assign_set_rhs_with_ops (gsi
, wmult_code
, mult_rhs1
, mult_rhs2
,
3321 update_stmt (gsi_stmt (*gsi
));
3322 widen_mul_stats
.maccs_inserted
++;
3326 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3327 with uses in additions and subtractions to form fused multiply-add
3328 operations. Returns true if successful and MUL_STMT should be removed. */
3331 convert_mult_to_fma (gimple mul_stmt
, tree op1
, tree op2
)
3333 tree mul_result
= gimple_get_lhs (mul_stmt
);
3334 tree type
= TREE_TYPE (mul_result
);
3335 gimple use_stmt
, neguse_stmt
;
3337 use_operand_p use_p
;
3338 imm_use_iterator imm_iter
;
/* Honor -ffp-contract=off: never fuse floating-point mul/add then.  */
3340 if (FLOAT_TYPE_P (type
)
3341 && flag_fp_contract_mode
== FP_CONTRACT_OFF
)
3344 /* We don't want to do bitfield reduction ops. */
3345 if (INTEGRAL_TYPE_P (type
)
3346 && (TYPE_PRECISION (type
)
3347 != GET_MODE_PRECISION (TYPE_MODE (type
))))
3350 /* If the target doesn't support it, don't generate it. We assume that
3351 if fma isn't available then fms, fnma or fnms are not either. */
3352 if (optab_handler (fma_optab
, TYPE_MODE (type
)) == CODE_FOR_nothing
)
3355 /* If the multiplication has zero uses, it is kept around probably because
3356 of -fnon-call-exceptions. Don't optimize it away in that case,
3358 if (has_zero_uses (mul_result
))
3361 /* Make sure that the multiplication statement becomes dead after
3362 the transformation, thus that all uses are transformed to FMAs.
3363 This means we assume that an FMA operation has the same cost
/* First pass: validate every immediate use of the multiply result; any
   use we cannot turn into an FMA aborts the transformation.  */
3365 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, mul_result
)
3367 enum tree_code use_code
;
3368 tree result
= mul_result
;
3369 bool negate_p
= false;
3371 use_stmt
= USE_STMT (use_p
);
3373 if (is_gimple_debug (use_stmt
))
3376 /* For now restrict this operations to single basic blocks. In theory
3377 we would want to support sinking the multiplication in
3383 to form a fma in the then block and sink the multiplication to the
3385 if (gimple_bb (use_stmt
) != gimple_bb (mul_stmt
))
3388 if (!is_gimple_assign (use_stmt
))
3391 use_code
= gimple_assign_rhs_code (use_stmt
);
3393 /* A negate on the multiplication leads to FNMA. */
3394 if (use_code
== NEGATE_EXPR
)
3399 result
= gimple_assign_lhs (use_stmt
);
3401 /* Make sure the negate statement becomes dead with this
3402 single transformation. */
3403 if (!single_imm_use (gimple_assign_lhs (use_stmt
),
3404 &use_p
, &neguse_stmt
))
3407 /* Make sure the multiplication isn't also used on that stmt. */
3408 FOR_EACH_PHI_OR_STMT_USE (usep
, neguse_stmt
, iter
, SSA_OP_USE
)
3409 if (USE_FROM_PTR (usep
) == mul_result
)
/* Continue validation at the single user of the negate.  */
3413 use_stmt
= neguse_stmt
;
3414 if (gimple_bb (use_stmt
) != gimple_bb (mul_stmt
))
3416 if (!is_gimple_assign (use_stmt
))
3419 use_code
= gimple_assign_rhs_code (use_stmt
);
/* A negated product on the second operand flips the negation.  */
3426 if (gimple_assign_rhs2 (use_stmt
) == result
)
3427 negate_p
= !negate_p
;
3432 /* FMA can only be formed from PLUS and MINUS. */
3436 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3437 by a MULT_EXPR that we'll visit later, we might be able to
3438 get a more profitable match with fnma.
3439 OTOH, if we don't, a negate / fma pair has likely lower latency
3440 that a mult / subtract pair. */
3441 if (use_code
== MINUS_EXPR
&& !negate_p
3442 && gimple_assign_rhs1 (use_stmt
) == result
3443 && optab_handler (fms_optab
, TYPE_MODE (type
)) == CODE_FOR_nothing
3444 && optab_handler (fnma_optab
, TYPE_MODE (type
)) != CODE_FOR_nothing
)
3446 tree rhs2
= gimple_assign_rhs2 (use_stmt
);
3448 if (TREE_CODE (rhs2
) == SSA_NAME
)
3450 gimple stmt2
= SSA_NAME_DEF_STMT (rhs2
);
3451 if (has_single_use (rhs2
)
3452 && is_gimple_assign (stmt2
)
3453 && gimple_assign_rhs_code (stmt2
) == MULT_EXPR
)
3458 /* We can't handle a * b + a * b. */
3459 if (gimple_assign_rhs1 (use_stmt
) == gimple_assign_rhs2 (use_stmt
))
3462 /* While it is possible to validate whether or not the exact form
3463 that we've recognized is available in the backend, the assumption
3464 is that the transformation is never a loss. For instance, suppose
3465 the target only has the plain FMA pattern available. Consider
3466 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3467 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3468 still have 3 operations, but in the FMA form the two NEGs are
3469 independent and could be run in parallel. */
/* Second pass: all uses validated; now actually rewrite each use of the
   multiply result into an FMA_EXPR.  */
3472 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, mul_result
)
3474 gimple_stmt_iterator gsi
= gsi_for_stmt (use_stmt
);
3475 enum tree_code use_code
;
3476 tree addop
, mulop1
= op1
, result
= mul_result
;
3477 bool negate_p
= false;
3479 if (is_gimple_debug (use_stmt
))
3482 use_code
= gimple_assign_rhs_code (use_stmt
);
/* Fold an intervening negate into the FMA and delete it.  */
3483 if (use_code
== NEGATE_EXPR
)
3485 result
= gimple_assign_lhs (use_stmt
);
3486 single_imm_use (gimple_assign_lhs (use_stmt
), &use_p
, &neguse_stmt
);
3487 gsi_remove (&gsi
, true);
3488 release_defs (use_stmt
);
3490 use_stmt
= neguse_stmt
;
3491 gsi
= gsi_for_stmt (use_stmt
);
3492 use_code
= gimple_assign_rhs_code (use_stmt
);
3496 if (gimple_assign_rhs1 (use_stmt
) == result
)
3498 addop
= gimple_assign_rhs2 (use_stmt
);
3499 /* a * b - c -> a * b + (-c) */
3500 if (gimple_assign_rhs_code (use_stmt
) == MINUS_EXPR
)
3501 addop
= force_gimple_operand_gsi (&gsi
,
3502 build1 (NEGATE_EXPR
,
3504 true, NULL_TREE
, true,
3509 addop
= gimple_assign_rhs1 (use_stmt
);
3510 /* a - b * c -> (-b) * c + a */
3511 if (gimple_assign_rhs_code (use_stmt
) == MINUS_EXPR
)
3512 negate_p
= !negate_p
;
/* Apply any pending negation to the first multiply operand.  */
3516 mulop1
= force_gimple_operand_gsi (&gsi
,
3517 build1 (NEGATE_EXPR
,
3519 true, NULL_TREE
, true,
3522 fma_stmt
= gimple_build_assign (gimple_assign_lhs (use_stmt
),
3523 FMA_EXPR
, mulop1
, op2
, addop
);
3524 gsi_replace (&gsi
, fma_stmt
, true);
3525 widen_mul_stats
.fmas_inserted
++;
3531 /* Find integer multiplications where the operands are extended from
3532 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3533 where appropriate. */
3537 const pass_data pass_data_optimize_widening_mul
=
3539 GIMPLE_PASS
, /* type */
3540 "widening_mul", /* name */
3541 OPTGROUP_NONE
, /* optinfo_flags */
3542 TV_NONE
, /* tv_id */
3543 PROP_ssa
, /* properties_required */
3544 0, /* properties_provided */
3545 0, /* properties_destroyed */
3546 0, /* todo_flags_start */
3547 TODO_update_ssa
, /* todo_flags_finish */
3550 class pass_optimize_widening_mul
: public gimple_opt_pass
3553 pass_optimize_widening_mul (gcc::context
*ctxt
)
3554 : gimple_opt_pass (pass_data_optimize_widening_mul
, ctxt
)
3557 /* opt_pass methods: */
3558 virtual bool gate (function
*)
3560 return flag_expensive_optimizations
&& optimize
;
3563 virtual unsigned int execute (function
*);
3565 }; // class pass_optimize_widening_mul
3568 pass_optimize_widening_mul::execute (function
*fun
)
3571 bool cfg_changed
= false;
3573 memset (&widen_mul_stats
, 0, sizeof (widen_mul_stats
));
3575 FOR_EACH_BB_FN (bb
, fun
)
3577 gimple_stmt_iterator gsi
;
3579 for (gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);)
3581 gimple stmt
= gsi_stmt (gsi
);
3582 enum tree_code code
;
3584 if (is_gimple_assign (stmt
))
3586 code
= gimple_assign_rhs_code (stmt
);
3590 if (!convert_mult_to_widen (stmt
, &gsi
)
3591 && convert_mult_to_fma (stmt
,
3592 gimple_assign_rhs1 (stmt
),
3593 gimple_assign_rhs2 (stmt
)))
3595 gsi_remove (&gsi
, true);
3596 release_defs (stmt
);
3603 convert_plusminus_to_widen (&gsi
, stmt
, code
);
3609 else if (is_gimple_call (stmt
)
3610 && gimple_call_lhs (stmt
))
3612 tree fndecl
= gimple_call_fndecl (stmt
);
3614 && DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_NORMAL
)
3616 switch (DECL_FUNCTION_CODE (fndecl
))
3621 if (TREE_CODE (gimple_call_arg (stmt
, 1)) == REAL_CST
3622 && REAL_VALUES_EQUAL
3623 (TREE_REAL_CST (gimple_call_arg (stmt
, 1)),
3625 && convert_mult_to_fma (stmt
,
3626 gimple_call_arg (stmt
, 0),
3627 gimple_call_arg (stmt
, 0)))
3629 unlink_stmt_vdef (stmt
);
3630 if (gsi_remove (&gsi
, true)
3631 && gimple_purge_dead_eh_edges (bb
))
3633 release_defs (stmt
);
3646 statistics_counter_event (fun
, "widening multiplications inserted",
3647 widen_mul_stats
.widen_mults_inserted
);
3648 statistics_counter_event (fun
, "widening maccs inserted",
3649 widen_mul_stats
.maccs_inserted
);
3650 statistics_counter_event (fun
, "fused multiply-adds inserted",
3651 widen_mul_stats
.fmas_inserted
);
3653 return cfg_changed
? TODO_cleanup_cfg
: 0;
3659 make_pass_optimize_widening_mul (gcc::context
*ctxt
)
3661 return new pass_optimize_widening_mul (ctxt
);