/* Global, SSA-based optimizations using mathematical identities.
   Copyright (C) 2005-2015 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* Currently, the only mini-pass in this file tries to CSE reciprocal
   operations.  These are common in sequences such as this one:

	modulus = sqrt(x*x + y*y + z*z);
	x = x / modulus;
	y = y / modulus;
	z = z / modulus;

   that can be optimized to

	modulus = sqrt(x*x + y*y + z*z);
	rmodulus = 1.0 / modulus;
	x = x * rmodulus;
	y = y * rmodulus;
	z = z * rmodulus;

   We do this for loop invariant divisors, and with this pass whenever
   we notice that a division has the same divisor multiple times.

   Of course, like in PRE, we don't insert a division if a dominator
   already has one.  However, this cannot be done as an extension of
   PRE for several reasons.

   First of all, experiments have shown that the transformation is not
   always useful if there are only two divisions by the same divisor.
   This is probably because modern processors can pipeline the
   divisions; on older, in-order processors it should still be
   effective to optimize two divisions by the same number.  We make
   this a param, and it shall be called N in the remainder of this
   comment.

   Second, if trapping math is active, we have less freedom on where
   to insert divisions: we can only do so in basic blocks that already
   contain one.  (If divisions don't trap, instead, we can insert
   divisions elsewhere, which will be in blocks that are common dominators
   of those that have the division).

   We really don't want to compute the reciprocal unless a division will
   be found.  To do this, we won't insert the division in a basic block
   that has less than N divisions *post-dominating* it.

   The algorithm constructs a subset of the dominator tree, holding the
   blocks containing the divisions and the common dominators to them,
   and walks it twice.  The first walk is in post-order, and it annotates
   each block with the number of divisions that post-dominate it: this
   gives information on where divisions can be inserted profitably.
   The second walk is in pre-order, and it inserts divisions as explained
   above, and replaces divisions by multiplications.

   In the best case, the cost of the pass is O(n_statements).  In the
   worst case, the cost is due to creating the dominator tree subset,
   with a cost of O(n_basic_blocks ^ 2); however this can only happen
   for n_statements / n_basic_blocks statements.  So, the amortized cost
   of creating the dominator tree subset is O(n_basic_blocks) and the
   worst-case cost of the pass is O(n_statements * n_basic_blocks).

   More practically, the cost will be small because there are few
   divisions, and they tend to be in the same basic block, so insert_bb
   is called very few times.

   If we did this using domwalk.c, an efficient implementation would have
   to work on all the variables in a single pass, because we could not
   work on just a subset of the dominator tree, as we do now, and the
   cost would also be something like O(n_statements * n_basic_blocks).
   The data structures would be more complex in order to work on all the
   variables in a single pass.  */
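/* As a concrete illustration (a sketch; N is the target-dependent
   threshold, here assumed to be 2):

	a = x / d;			t = 1.0 / d;
	if (p)				a = x * t;
	  ...			-->	if (p)
	b = y / d;			  ...
					b = y * t;

   Both divisions by D post-dominate the block containing the first one,
   so the reciprocal T is inserted there, right in front of an existing
   division; this is valid even when trapping math is active.  */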
#include "coretypes.h"
#include "fold-const.h"
#include "hard-reg-set.h"
#include "dominance.h"
#include "basic-block.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-fold.h"
#include "gimple-expr.h"
#include "gimple-iterator.h"
#include "gimplify.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "gimple-ssa.h"
#include "tree-cfg.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "insn-config.h"
#include "emit-rtl.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
#include "tree-pass.h"
#include "alloc-pool.h"
#include "gimple-pretty-print.h"
#include "builtins.h"

/* FIXME: RTL headers have to be included here for optabs.  */
#include "rtl.h"		/* Because optabs.h wants enum rtx_code.  */
#include "expr.h"		/* Because optabs.h wants sepops.  */
#include "insn-codes.h"
/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  */
struct occurrence {
  /* The basic block represented by this structure.  */
  basic_block bb;

  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
     that was inserted in BB.  */
  tree recip_def;

  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
     was inserted in BB.  */
  gimple recip_def_stmt;

  /* Pointer to a list of "struct occurrence"s for blocks dominated
     by BB.  */
  struct occurrence *children;

  /* Pointer to the next "struct occurrence"s in the list of blocks
     sharing a common dominator.  */
  struct occurrence *next;

  /* The number of divisions that are in BB before compute_merit.  The
     number of divisions that are in BB or post-dominate it after
     compute_merit.  */
  int num_divisions;

  /* True if the basic block has a division, false if it is a common
     dominator for basic blocks that do.  If it is false and trapping
     math is active, BB is not a candidate for inserting a reciprocal.  */
  bool bb_has_division;
};
static struct
{
  /* Number of 1.0/X ops inserted.  */
  int rdivs_inserted;

  /* Number of 1.0/FUNC ops inserted.  */
  int rfuncs_inserted;
} reciprocal_stats;

static struct
{
  /* Number of cexpi calls inserted.  */
  int inserted;
} sincos_stats;

static struct
{
  /* Number of hand-written 16-bit nop / bswaps found.  */
  int found_16bit;

  /* Number of hand-written 32-bit nop / bswaps found.  */
  int found_32bit;

  /* Number of hand-written 64-bit nop / bswaps found.  */
  int found_64bit;
} nop_stats, bswap_stats;

static struct
{
  /* Number of widening multiplication ops inserted.  */
  int widen_mults_inserted;

  /* Number of integer multiply-and-accumulate ops inserted.  */
  int maccs_inserted;

  /* Number of fp fused multiply-add ops inserted.  */
  int fmas_inserted;
} widen_mul_stats;

/* The instance of "struct occurrence" representing the highest
   interesting block in the dominator tree.  */
static struct occurrence *occ_head;

/* Allocation pool for getting instances of "struct occurrence".  */
static pool_allocator<occurrence> *occ_pool;
/* Allocate and return a new struct occurrence for basic block BB, and
   whose children list is headed by CHILDREN.  */

static struct occurrence *
occ_new (basic_block bb, struct occurrence *children)
{
  struct occurrence *occ;

  bb->aux = occ = occ_pool->allocate ();
  memset (occ, 0, sizeof (struct occurrence));

  occ->bb = bb;
  occ->children = children;
  return occ;
}
242 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
243 list of "struct occurrence"s, one per basic block, having IDOM as
244 their common dominator.
246 We try to insert NEW_OCC as deep as possible in the tree, and we also
247 insert any other block that is a common dominator for BB and one
248 block already in the tree. */
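/* For instance (illustration): if divisions were registered in blocks B1
   and B2 and their nearest common dominator D holds no division, a node
   for D is created on the fly and B1 and B2 become its children:

	D   (bb_has_division = false)
	+-- B1
	+-- B2

   so the subset tree mirrors the relevant part of the dominator tree.  */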
251 insert_bb (struct occurrence
*new_occ
, basic_block idom
,
252 struct occurrence
**p_head
)
254 struct occurrence
*occ
, **p_occ
;
256 for (p_occ
= p_head
; (occ
= *p_occ
) != NULL
; )
258 basic_block bb
= new_occ
->bb
, occ_bb
= occ
->bb
;
259 basic_block dom
= nearest_common_dominator (CDI_DOMINATORS
, occ_bb
, bb
);
262 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
265 occ
->next
= new_occ
->children
;
266 new_occ
->children
= occ
;
268 /* Try the next block (it may as well be dominated by BB). */
271 else if (dom
== occ_bb
)
273 /* OCC_BB dominates BB. Tail recurse to look deeper. */
274 insert_bb (new_occ
, dom
, &occ
->children
);
278 else if (dom
!= idom
)
280 gcc_assert (!dom
->aux
);
282 /* There is a dominator between IDOM and BB, add it and make
283 two children out of NEW_OCC and OCC. First, remove OCC from
289 /* None of the previous blocks has DOM as a dominator: if we tail
290 recursed, we would reexamine them uselessly. Just switch BB with
291 DOM, and go on looking for blocks dominated by DOM. */
292 new_occ
= occ_new (dom
, new_occ
);
297 /* Nothing special, go on with the next element. */
302 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
303 new_occ
->next
= *p_head
;
/* Register that we found a division in BB.  */

static void
register_division_in (basic_block bb)
{
  struct occurrence *occ;

  occ = (struct occurrence *) bb->aux;
  if (!occ)
    {
      occ = occ_new (bb, NULL);
      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
    }

  occ->bb_has_division = true;
  occ->num_divisions++;
}
326 /* Compute the number of divisions that postdominate each block in OCC and
330 compute_merit (struct occurrence
*occ
)
332 struct occurrence
*occ_child
;
333 basic_block dom
= occ
->bb
;
335 for (occ_child
= occ
->children
; occ_child
; occ_child
= occ_child
->next
)
338 if (occ_child
->children
)
339 compute_merit (occ_child
);
342 bb
= single_noncomplex_succ (dom
);
346 if (dominated_by_p (CDI_POST_DOMINATORS
, bb
, occ_child
->bb
))
347 occ
->num_divisions
+= occ_child
->num_divisions
;
/* Return whether USE_STMT is a floating-point division by DEF.  */

static inline bool
is_division_by (gimple use_stmt, tree def)
{
  return is_gimple_assign (use_stmt)
	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
	 && gimple_assign_rhs2 (use_stmt) == def
	 /* Do not recognize x / x as valid division, as we are getting
	    confused later by replacing all immediate uses x in such
	    a stmt.  */
	 && gimple_assign_rhs1 (use_stmt) != def;
}
365 /* Walk the subset of the dominator tree rooted at OCC, setting the
366 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
367 the given basic block. The field may be left NULL, of course,
368 if it is not possible or profitable to do the optimization.
DEF_GSI is an iterator pointing at the statement defining DEF.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.  */
375 insert_reciprocals (gimple_stmt_iterator
*def_gsi
, struct occurrence
*occ
,
376 tree def
, tree recip_def
, int threshold
)
380 gimple_stmt_iterator gsi
;
381 struct occurrence
*occ_child
;
384 && (occ
->bb_has_division
|| !flag_trapping_math
)
385 && occ
->num_divisions
>= threshold
)
387 /* Make a variable with the replacement and substitute it. */
388 type
= TREE_TYPE (def
);
389 recip_def
= create_tmp_reg (type
, "reciptmp");
390 new_stmt
= gimple_build_assign (recip_def
, RDIV_EXPR
,
391 build_one_cst (type
), def
);
393 if (occ
->bb_has_division
)
395 /* Case 1: insert before an existing division. */
396 gsi
= gsi_after_labels (occ
->bb
);
397 while (!gsi_end_p (gsi
) && !is_division_by (gsi_stmt (gsi
), def
))
400 gsi_insert_before (&gsi
, new_stmt
, GSI_SAME_STMT
);
402 else if (def_gsi
&& occ
->bb
== def_gsi
->bb
)
404 /* Case 2: insert right after the definition. Note that this will
405 never happen if the definition statement can throw, because in
406 that case the sole successor of the statement's basic block will
407 dominate all the uses as well. */
408 gsi_insert_after (def_gsi
, new_stmt
, GSI_NEW_STMT
);
412 /* Case 3: insert in a basic block not containing defs/uses. */
413 gsi
= gsi_after_labels (occ
->bb
);
414 gsi_insert_before (&gsi
, new_stmt
, GSI_SAME_STMT
);
417 reciprocal_stats
.rdivs_inserted
++;
419 occ
->recip_def_stmt
= new_stmt
;
422 occ
->recip_def
= recip_def
;
423 for (occ_child
= occ
->children
; occ_child
; occ_child
= occ_child
->next
)
424 insert_reciprocals (def_gsi
, occ_child
, def
, recip_def
, threshold
);
/* Replace the division at USE_P with a multiplication by the reciprocal, if
   possible.  */
432 replace_reciprocal (use_operand_p use_p
)
434 gimple use_stmt
= USE_STMT (use_p
);
435 basic_block bb
= gimple_bb (use_stmt
);
436 struct occurrence
*occ
= (struct occurrence
*) bb
->aux
;
438 if (optimize_bb_for_speed_p (bb
)
439 && occ
->recip_def
&& use_stmt
!= occ
->recip_def_stmt
)
441 gimple_stmt_iterator gsi
= gsi_for_stmt (use_stmt
);
442 gimple_assign_set_rhs_code (use_stmt
, MULT_EXPR
);
443 SET_USE (use_p
, occ
->recip_def
);
444 fold_stmt_inplace (&gsi
);
445 update_stmt (use_stmt
);
450 /* Free OCC and return one more "struct occurrence" to be freed. */
452 static struct occurrence
*
453 free_bb (struct occurrence
*occ
)
455 struct occurrence
*child
, *next
;
457 /* First get the two pointers hanging off OCC. */
459 child
= occ
->children
;
461 occ_pool
->remove (occ
);
463 /* Now ensure that we don't recurse unless it is necessary. */
469 next
= free_bb (next
);
476 /* Look for floating-point divisions among DEF's uses, and try to
477 replace them by multiplications with the reciprocal. Add
478 as many statements computing the reciprocal as needed.
480 DEF must be a GIMPLE register of a floating-point type. */
483 execute_cse_reciprocals_1 (gimple_stmt_iterator
*def_gsi
, tree def
)
486 imm_use_iterator use_iter
;
487 struct occurrence
*occ
;
488 int count
= 0, threshold
;
490 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def
)) && is_gimple_reg (def
));
492 FOR_EACH_IMM_USE_FAST (use_p
, use_iter
, def
)
494 gimple use_stmt
= USE_STMT (use_p
);
495 if (is_division_by (use_stmt
, def
))
497 register_division_in (gimple_bb (use_stmt
));
502 /* Do the expensive part only if we can hope to optimize something. */
503 threshold
= targetm
.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def
)));
504 if (count
>= threshold
)
507 for (occ
= occ_head
; occ
; occ
= occ
->next
)
510 insert_reciprocals (def_gsi
, occ
, def
, NULL
, threshold
);
513 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, def
)
515 if (is_division_by (use_stmt
, def
))
517 FOR_EACH_IMM_USE_ON_STMT (use_p
, use_iter
)
518 replace_reciprocal (use_p
);
523 for (occ
= occ_head
; occ
; )
529 /* Go through all the floating-point SSA_NAMEs, and call
530 execute_cse_reciprocals_1 on each of them. */
533 const pass_data pass_data_cse_reciprocals
=
535 GIMPLE_PASS
, /* type */
537 OPTGROUP_NONE
, /* optinfo_flags */
539 PROP_ssa
, /* properties_required */
540 0, /* properties_provided */
541 0, /* properties_destroyed */
542 0, /* todo_flags_start */
543 TODO_update_ssa
, /* todo_flags_finish */
546 class pass_cse_reciprocals
: public gimple_opt_pass
549 pass_cse_reciprocals (gcc::context
*ctxt
)
550 : gimple_opt_pass (pass_data_cse_reciprocals
, ctxt
)
553 /* opt_pass methods: */
554 virtual bool gate (function
*) { return optimize
&& flag_reciprocal_math
; }
555 virtual unsigned int execute (function
*);
557 }; // class pass_cse_reciprocals
560 pass_cse_reciprocals::execute (function
*fun
)
565 occ_pool
= new pool_allocator
<occurrence
>
566 ("dominators for recip", n_basic_blocks_for_fn (fun
) / 3 + 1);
568 memset (&reciprocal_stats
, 0, sizeof (reciprocal_stats
));
569 calculate_dominance_info (CDI_DOMINATORS
);
570 calculate_dominance_info (CDI_POST_DOMINATORS
);
572 #ifdef ENABLE_CHECKING
573 FOR_EACH_BB_FN (bb
, fun
)
574 gcc_assert (!bb
->aux
);
577 for (arg
= DECL_ARGUMENTS (fun
->decl
); arg
; arg
= DECL_CHAIN (arg
))
578 if (FLOAT_TYPE_P (TREE_TYPE (arg
))
579 && is_gimple_reg (arg
))
581 tree name
= ssa_default_def (fun
, arg
);
583 execute_cse_reciprocals_1 (NULL
, name
);
586 FOR_EACH_BB_FN (bb
, fun
)
590 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
593 gphi
*phi
= gsi
.phi ();
594 def
= PHI_RESULT (phi
);
595 if (! virtual_operand_p (def
)
596 && FLOAT_TYPE_P (TREE_TYPE (def
)))
597 execute_cse_reciprocals_1 (NULL
, def
);
600 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
603 gimple stmt
= gsi_stmt (gsi
);
605 if (gimple_has_lhs (stmt
)
606 && (def
= SINGLE_SSA_TREE_OPERAND (stmt
, SSA_OP_DEF
)) != NULL
607 && FLOAT_TYPE_P (TREE_TYPE (def
))
608 && TREE_CODE (def
) == SSA_NAME
)
609 execute_cse_reciprocals_1 (&gsi
, def
);
612 if (optimize_bb_for_size_p (bb
))
615 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
616 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
619 gimple stmt
= gsi_stmt (gsi
);
622 if (is_gimple_assign (stmt
)
623 && gimple_assign_rhs_code (stmt
) == RDIV_EXPR
)
625 tree arg1
= gimple_assign_rhs2 (stmt
);
628 if (TREE_CODE (arg1
) != SSA_NAME
)
631 stmt1
= SSA_NAME_DEF_STMT (arg1
);
633 if (is_gimple_call (stmt1
)
634 && gimple_call_lhs (stmt1
)
635 && (fndecl
= gimple_call_fndecl (stmt1
))
636 && (DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_NORMAL
637 || DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_MD
))
639 enum built_in_function code
;
644 code
= DECL_FUNCTION_CODE (fndecl
);
645 md_code
= DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_MD
;
647 fndecl
= targetm
.builtin_reciprocal (code
, md_code
, false);
/* Check that all uses of the SSA name are divisions,
   otherwise replacing the defining statement will do
   the wrong thing.  */
655 FOR_EACH_IMM_USE_FAST (use_p
, ui
, arg1
)
657 gimple stmt2
= USE_STMT (use_p
);
658 if (is_gimple_debug (stmt2
))
660 if (!is_gimple_assign (stmt2
)
661 || gimple_assign_rhs_code (stmt2
) != RDIV_EXPR
662 || gimple_assign_rhs1 (stmt2
) == arg1
663 || gimple_assign_rhs2 (stmt2
) != arg1
)
672 gimple_replace_ssa_lhs (stmt1
, arg1
);
673 gimple_call_set_fndecl (stmt1
, fndecl
);
675 reciprocal_stats
.rfuncs_inserted
++;
677 FOR_EACH_IMM_USE_STMT (stmt
, ui
, arg1
)
679 gimple_stmt_iterator gsi
= gsi_for_stmt (stmt
);
680 gimple_assign_set_rhs_code (stmt
, MULT_EXPR
);
681 fold_stmt_inplace (&gsi
);
689 statistics_counter_event (fun
, "reciprocal divs inserted",
690 reciprocal_stats
.rdivs_inserted
);
691 statistics_counter_event (fun
, "reciprocal functions inserted",
692 reciprocal_stats
.rfuncs_inserted
);
694 free_dominance_info (CDI_DOMINATORS
);
695 free_dominance_info (CDI_POST_DOMINATORS
);
703 make_pass_cse_reciprocals (gcc::context
*ctxt
)
705 return new pass_cse_reciprocals (ctxt
);
708 /* Records an occurrence at statement USE_STMT in the vector of trees
709 STMTS if it is dominated by *TOP_BB or dominates it or this basic block
710 is not yet initialized. Returns true if the occurrence was pushed on
711 the vector. Adjusts *TOP_BB to be the basic block dominating all
712 statements in the vector. */
715 maybe_record_sincos (vec
<gimple
> *stmts
,
716 basic_block
*top_bb
, gimple use_stmt
)
718 basic_block use_bb
= gimple_bb (use_stmt
);
720 && (*top_bb
== use_bb
721 || dominated_by_p (CDI_DOMINATORS
, use_bb
, *top_bb
)))
722 stmts
->safe_push (use_stmt
);
724 || dominated_by_p (CDI_DOMINATORS
, *top_bb
, use_bb
))
726 stmts
->safe_push (use_stmt
);
735 /* Look for sin, cos and cexpi calls with the same argument NAME and
736 create a single call to cexpi CSEing the result in this case.
737 We first walk over all immediate uses of the argument collecting
738 statements that we can CSE in a vector and in a second pass replace
739 the statement rhs with a REALPART or IMAGPART expression on the
740 result of the cexpi call we insert before the use statement that
741 dominates all other candidates. */
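/* For example (sketch):

	s = sin (x);			t = cexpi (x);
	c = cos (x);		-->	s = IMAGPART_EXPR <t>;
					c = REALPART_EXPR <t>;

   with the cexpi call placed before the dominating use (or right after
   the definition of X, see below).  */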
744 execute_cse_sincos_1 (tree name
)
746 gimple_stmt_iterator gsi
;
747 imm_use_iterator use_iter
;
748 tree fndecl
, res
, type
;
749 gimple def_stmt
, use_stmt
, stmt
;
750 int seen_cos
= 0, seen_sin
= 0, seen_cexpi
= 0;
751 auto_vec
<gimple
> stmts
;
752 basic_block top_bb
= NULL
;
754 bool cfg_changed
= false;
756 type
= TREE_TYPE (name
);
757 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, name
)
759 if (gimple_code (use_stmt
) != GIMPLE_CALL
760 || !gimple_call_lhs (use_stmt
)
761 || !(fndecl
= gimple_call_fndecl (use_stmt
))
762 || DECL_BUILT_IN_CLASS (fndecl
) != BUILT_IN_NORMAL
)
765 switch (DECL_FUNCTION_CODE (fndecl
))
767 CASE_FLT_FN (BUILT_IN_COS
):
768 seen_cos
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
771 CASE_FLT_FN (BUILT_IN_SIN
):
772 seen_sin
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
775 CASE_FLT_FN (BUILT_IN_CEXPI
):
776 seen_cexpi
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
783 if (seen_cos
+ seen_sin
+ seen_cexpi
<= 1)
786 /* Simply insert cexpi at the beginning of top_bb but not earlier than
787 the name def statement. */
788 fndecl
= mathfn_built_in (type
, BUILT_IN_CEXPI
);
791 stmt
= gimple_build_call (fndecl
, 1, name
);
792 res
= make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl
)), stmt
, "sincostmp");
793 gimple_call_set_lhs (stmt
, res
);
795 def_stmt
= SSA_NAME_DEF_STMT (name
);
796 if (!SSA_NAME_IS_DEFAULT_DEF (name
)
797 && gimple_code (def_stmt
) != GIMPLE_PHI
798 && gimple_bb (def_stmt
) == top_bb
)
800 gsi
= gsi_for_stmt (def_stmt
);
801 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
805 gsi
= gsi_after_labels (top_bb
);
806 gsi_insert_before (&gsi
, stmt
, GSI_SAME_STMT
);
808 sincos_stats
.inserted
++;
810 /* And adjust the recorded old call sites. */
811 for (i
= 0; stmts
.iterate (i
, &use_stmt
); ++i
)
814 fndecl
= gimple_call_fndecl (use_stmt
);
816 switch (DECL_FUNCTION_CODE (fndecl
))
818 CASE_FLT_FN (BUILT_IN_COS
):
819 rhs
= fold_build1 (REALPART_EXPR
, type
, res
);
822 CASE_FLT_FN (BUILT_IN_SIN
):
823 rhs
= fold_build1 (IMAGPART_EXPR
, type
, res
);
826 CASE_FLT_FN (BUILT_IN_CEXPI
):
834 /* Replace call with a copy. */
835 stmt
= gimple_build_assign (gimple_call_lhs (use_stmt
), rhs
);
837 gsi
= gsi_for_stmt (use_stmt
);
838 gsi_replace (&gsi
, stmt
, true);
839 if (gimple_purge_dead_eh_edges (gimple_bb (stmt
)))
846 /* To evaluate powi(x,n), the floating point value x raised to the
847 constant integer exponent n, we use a hybrid algorithm that
848 combines the "window method" with look-up tables. For an
849 introduction to exponentiation algorithms and "addition chains",
850 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
851 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
852 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
853 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
855 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
856 multiplications to inline before calling the system library's pow
857 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
858 so this default never requires calling pow, powf or powl. */
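/* For instance, plain square-and-multiply exponentiation already meets the
   2*bits(n)-2 bound: n = 15 (binary 1111) needs six multiplications,
   x2 = x*x, x3 = x2*x, x6 = x3*x3, x7 = x6*x, x14 = x7*x7, x15 = x14*x,
   and 6 == 2*4 - 2.  */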
860 #ifndef POWI_MAX_MULTS
861 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
864 /* The size of the "optimal power tree" lookup table. All
865 exponents less than this value are simply looked up in the
866 powi_table below. This threshold is also used to size the
867 cache of pseudo registers that hold intermediate results. */
868 #define POWI_TABLE_SIZE 256
870 /* The size, in bits of the window, used in the "window method"
871 exponentiation algorithm. This is equivalent to a radix of
872 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
873 #define POWI_WINDOW_SIZE 3
875 /* The following table is an efficient representation of an
876 "optimal power tree". For each value, i, the corresponding
value, j, in the table states that an optimal evaluation
878 sequence for calculating pow(x,i) can be found by evaluating
879 pow(x,j)*pow(x,i-j). An optimal power tree for the first
880 100 integers is given in Knuth's "Seminumerical algorithms". */
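/* Worked example: for powi (x, 5) the table gives powi_table[5] = 3, so
   x**5 = x**3 * x**2; recursively, powi_table[3] = 2 gives x**3 = x**2 * x
   and powi_table[2] = 1 gives x**2 = x * x.  With the x**2 result reused,
   the whole expansion costs three multiplications.  */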
882 static const unsigned char powi_table
[POWI_TABLE_SIZE
] =
884 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
885 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
886 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
887 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
888 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
889 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
890 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
891 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
892 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
893 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
894 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
895 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
896 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
897 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
898 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
899 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
900 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
901 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
902 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
903 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
904 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
905 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
906 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
907 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
908 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
909 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
910 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
911 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
912 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
913 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
914 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
915 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
919 /* Return the number of multiplications required to calculate
920 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
921 subroutine of powi_cost. CACHE is an array indicating
922 which exponents have already been calculated. */
925 powi_lookup_cost (unsigned HOST_WIDE_INT n
, bool *cache
)
927 /* If we've already calculated this exponent, then this evaluation
928 doesn't require any additional multiplications. */
933 return powi_lookup_cost (n
- powi_table
[n
], cache
)
934 + powi_lookup_cost (powi_table
[n
], cache
) + 1;
937 /* Return the number of multiplications required to calculate
938 powi(x,n) for an arbitrary x, given the exponent N. This
939 function needs to be kept in sync with powi_as_mults below. */
942 powi_cost (HOST_WIDE_INT n
)
944 bool cache
[POWI_TABLE_SIZE
];
945 unsigned HOST_WIDE_INT digit
;
946 unsigned HOST_WIDE_INT val
;
952 /* Ignore the reciprocal when calculating the cost. */
953 val
= (n
< 0) ? -n
: n
;
955 /* Initialize the exponent cache. */
956 memset (cache
, 0, POWI_TABLE_SIZE
* sizeof (bool));
961 while (val
>= POWI_TABLE_SIZE
)
965 digit
= val
& ((1 << POWI_WINDOW_SIZE
) - 1);
966 result
+= powi_lookup_cost (digit
, cache
)
967 + POWI_WINDOW_SIZE
+ 1;
968 val
>>= POWI_WINDOW_SIZE
;
977 return result
+ powi_lookup_cost (val
, cache
);
980 /* Recursive subroutine of powi_as_mults. This function takes the
981 array, CACHE, of already calculated exponents and an exponent N and
982 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
985 powi_as_mults_1 (gimple_stmt_iterator
*gsi
, location_t loc
, tree type
,
986 HOST_WIDE_INT n
, tree
*cache
)
988 tree op0
, op1
, ssa_target
;
989 unsigned HOST_WIDE_INT digit
;
992 if (n
< POWI_TABLE_SIZE
&& cache
[n
])
995 ssa_target
= make_temp_ssa_name (type
, NULL
, "powmult");
997 if (n
< POWI_TABLE_SIZE
)
999 cache
[n
] = ssa_target
;
1000 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
- powi_table
[n
], cache
);
1001 op1
= powi_as_mults_1 (gsi
, loc
, type
, powi_table
[n
], cache
);
1005 digit
= n
& ((1 << POWI_WINDOW_SIZE
) - 1);
1006 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
- digit
, cache
);
1007 op1
= powi_as_mults_1 (gsi
, loc
, type
, digit
, cache
);
1011 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
>> 1, cache
);
1015 mult_stmt
= gimple_build_assign (ssa_target
, MULT_EXPR
, op0
, op1
);
1016 gimple_set_location (mult_stmt
, loc
);
1017 gsi_insert_before (gsi
, mult_stmt
, GSI_SAME_STMT
);
1022 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1023 This function needs to be kept in sync with powi_cost above. */
1026 powi_as_mults (gimple_stmt_iterator
*gsi
, location_t loc
,
1027 tree arg0
, HOST_WIDE_INT n
)
1029 tree cache
[POWI_TABLE_SIZE
], result
, type
= TREE_TYPE (arg0
);
1034 return build_real (type
, dconst1
);
1036 memset (cache
, 0, sizeof (cache
));
1039 result
= powi_as_mults_1 (gsi
, loc
, type
, (n
< 0) ? -n
: n
, cache
);
1043 /* If the original exponent was negative, reciprocate the result. */
1044 target
= make_temp_ssa_name (type
, NULL
, "powmult");
1045 div_stmt
= gimple_build_assign (target
, RDIV_EXPR
,
1046 build_real (type
, dconst1
), result
);
1047 gimple_set_location (div_stmt
, loc
);
1048 gsi_insert_before (gsi
, div_stmt
, GSI_SAME_STMT
);
1053 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1054 location info LOC. If the arguments are appropriate, create an
1055 equivalent sequence of statements prior to GSI using an optimal
number of multiplications, and return an expression holding the
   result.  */
1060 gimple_expand_builtin_powi (gimple_stmt_iterator
*gsi
, location_t loc
,
1061 tree arg0
, HOST_WIDE_INT n
)
1063 /* Avoid largest negative number. */
1065 && ((n
>= -1 && n
<= 2)
1066 || (optimize_function_for_speed_p (cfun
)
1067 && powi_cost (n
) <= POWI_MAX_MULTS
)))
1068 return powi_as_mults (gsi
, loc
, arg0
, n
);
1073 /* Build a gimple call statement that calls FN with argument ARG.
1074 Set the lhs of the call statement to a fresh SSA name. Insert the
statement prior to GSI's current position, and return the fresh
   SSA name.  */
1079 build_and_insert_call (gimple_stmt_iterator
*gsi
, location_t loc
,
1085 call_stmt
= gimple_build_call (fn
, 1, arg
);
1086 ssa_target
= make_temp_ssa_name (TREE_TYPE (arg
), NULL
, "powroot");
1087 gimple_set_lhs (call_stmt
, ssa_target
);
1088 gimple_set_location (call_stmt
, loc
);
1089 gsi_insert_before (gsi
, call_stmt
, GSI_SAME_STMT
);
1094 /* Build a gimple binary operation with the given CODE and arguments
1095 ARG0, ARG1, assigning the result to a new SSA name for variable
1096 TARGET. Insert the statement prior to GSI's current position, and
1097 return the fresh SSA name.*/
1100 build_and_insert_binop (gimple_stmt_iterator
*gsi
, location_t loc
,
1101 const char *name
, enum tree_code code
,
1102 tree arg0
, tree arg1
)
1104 tree result
= make_temp_ssa_name (TREE_TYPE (arg0
), NULL
, name
);
1105 gassign
*stmt
= gimple_build_assign (result
, code
, arg0
, arg1
);
1106 gimple_set_location (stmt
, loc
);
1107 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
1111 /* Build a gimple reference operation with the given CODE and argument
1112 ARG, assigning the result to a new SSA name of TYPE with NAME.
1113 Insert the statement prior to GSI's current position, and return
1114 the fresh SSA name. */
1117 build_and_insert_ref (gimple_stmt_iterator
*gsi
, location_t loc
, tree type
,
1118 const char *name
, enum tree_code code
, tree arg0
)
1120 tree result
= make_temp_ssa_name (type
, NULL
, name
);
1121 gimple stmt
= gimple_build_assign (result
, build1 (code
, type
, arg0
));
1122 gimple_set_location (stmt
, loc
);
1123 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
1127 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1128 prior to GSI's current position, and return the fresh SSA name. */
1131 build_and_insert_cast (gimple_stmt_iterator
*gsi
, location_t loc
,
1132 tree type
, tree val
)
1134 tree result
= make_ssa_name (type
);
1135 gassign
*stmt
= gimple_build_assign (result
, NOP_EXPR
, val
);
1136 gimple_set_location (stmt
, loc
);
1137 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
1141 struct pow_synth_sqrt_info
1144 unsigned int deepest
;
1145 unsigned int num_mults
;
1148 /* Return true iff the real value C can be represented as a
1149 sum of powers of 0.5 up to N. That is:
1150 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1151 Record in INFO the various parameters of the synthesis algorithm such
1152 as the factors a[i], the maximum 0.5 power and the number of
1153 multiplications that will be required. */
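/* Worked example: c = 0.625 = 0.5 + 0.125 = 0.5**1 + 0.5**3, so the recorded
   factors are {1, 0, 1} with deepest = 3, and x**0.625 is later synthesized
   as sqrt (x) * sqrt (sqrt (sqrt (x))), one multiplication combining the two
   sqrt chains.  A fractional part such as 0.3 never reaches zero within the
   allowed depth and is rejected.  */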
1156 representable_as_half_series_p (REAL_VALUE_TYPE c
, unsigned n
,
1157 struct pow_synth_sqrt_info
*info
)
1159 REAL_VALUE_TYPE factor
= dconsthalf
;
1160 REAL_VALUE_TYPE remainder
= c
;
1163 info
->num_mults
= 0;
1164 memset (info
->factors
, 0, n
* sizeof (bool));
1166 for (unsigned i
= 0; i
< n
; i
++)
1168 REAL_VALUE_TYPE res
;
1170 /* If something inexact happened bail out now. */
1171 if (REAL_ARITHMETIC (res
, MINUS_EXPR
, remainder
, factor
))
1174 /* We have hit zero. The number is representable as a sum
1175 of powers of 0.5. */
1176 if (REAL_VALUES_EQUAL (res
, dconst0
))
1178 info
->factors
[i
] = true;
1179 info
->deepest
= i
+ 1;
1182 else if (!REAL_VALUE_NEGATIVE (res
))
1185 info
->factors
[i
] = true;
1189 info
->factors
[i
] = false;
1191 REAL_ARITHMETIC (factor
, MULT_EXPR
, factor
, dconsthalf
);
1196 /* Return the tree corresponding to FN being applied
1197 to ARG N times at GSI and LOC.
1198 Look up previous results from CACHE if need be.
1199 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
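/* E.g. get_fn_chain (x, 2, gsi, sqrtfn, loc, cache) returns an SSA name
   holding sqrt (sqrt (x)), building sqrt (x) on the way and caching both
   results so that later requests for the same depths reuse them.  */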
1202 get_fn_chain (tree arg
, unsigned int n
, gimple_stmt_iterator
*gsi
,
1203 tree fn
, location_t loc
, tree
*cache
)
1205 tree res
= cache
[n
];
1208 tree prev
= get_fn_chain (arg
, n
- 1, gsi
, fn
, loc
, cache
);
1209 res
= build_and_insert_call (gsi
, loc
, fn
, prev
);
/* Print to STREAM the repeated application of function FNAME to ARG
   N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
   foo (foo (x))  */
1221 print_nested_fn (FILE* stream
, const char *fname
, const char* arg
,
1225 fprintf (stream
, "%s", arg
);
1228 fprintf (stream
, "%s (", fname
);
1229 print_nested_fn (stream
, fname
, arg
, n
- 1);
1230 fprintf (stream
, ")");
1234 /* Print to STREAM the fractional sequence of sqrt chains
1235 applied to ARG, described by INFO. Used for the dump file. */
1238 dump_fractional_sqrt_sequence (FILE *stream
, const char *arg
,
1239 struct pow_synth_sqrt_info
*info
)
1241 for (unsigned int i
= 0; i
< info
->deepest
; i
++)
1243 bool is_set
= info
->factors
[i
];
1246 print_nested_fn (stream
, "sqrt", arg
, i
+ 1);
1247 if (i
!= info
->deepest
- 1)
1248 fprintf (stream
, " * ");
1253 /* Print to STREAM a representation of raising ARG to an integer
1254 power N. Used for the dump file. */
1257 dump_integer_part (FILE *stream
, const char* arg
, HOST_WIDE_INT n
)
1260 fprintf (stream
, "powi (%s, " HOST_WIDE_INT_PRINT_DEC
")", arg
, n
);
1262 fprintf (stream
, "%s", arg
);
1265 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1266 square roots. Place at GSI and LOC. Limit the maximum depth
1267 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1268 result of the expanded sequence or NULL_TREE if the expansion failed.
1270 This routine assumes that ARG1 is a real number with a fractional part
1271 (the integer exponent case will have been handled earlier in
1272 gimple_expand_builtin_pow).
1275 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1276 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1277 FRAC_PART == ARG1 - WHOLE_PART:
1278 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1279 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1280 if it can be expressed as such, that is if FRAC_PART satisfies:
1281 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1282 where integer a[i] is either 0 or 1.
1285 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1286 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1288 For ARG1 < 0.0 there are two approaches:
1289 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1290 is calculated as above.
1293 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1294 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1296 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1297 FRAC_PART := ARG1 - WHOLE_PART
1298 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1300 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1301 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1303 For ARG1 < 0.0 we choose between (A) and (B) depending on
1304 how many multiplications we'd have to do.
1305 So, for the example in (B): POW (x, -5.875), if we were to
1306 follow algorithm (A) we would produce:
1307 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1308 which contains more multiplications than approach (B).
1310 Hopefully, this approach will eliminate potentially expensive POW library
1311 calls when unsafe floating point math is enabled and allow the compiler to
further optimise the multiplies, square roots and divides produced by this
   function.  */
1316 expand_pow_as_sqrts (gimple_stmt_iterator
*gsi
, location_t loc
,
1317 tree arg0
, tree arg1
, HOST_WIDE_INT max_depth
)
1319 tree type
= TREE_TYPE (arg0
);
1320 machine_mode mode
= TYPE_MODE (type
);
1321 tree sqrtfn
= mathfn_built_in (type
, BUILT_IN_SQRT
);
1322 bool one_over
= true;
1327 if (TREE_CODE (arg1
) != REAL_CST
)
1330 REAL_VALUE_TYPE exp_init
= TREE_REAL_CST (arg1
);
1332 gcc_assert (max_depth
> 0);
1333 tree
*cache
= XALLOCAVEC (tree
, max_depth
+ 1);
1335 struct pow_synth_sqrt_info synth_info
;
1336 synth_info
.factors
= XALLOCAVEC (bool, max_depth
+ 1);
1337 synth_info
.deepest
= 0;
1338 synth_info
.num_mults
= 0;
1340 bool neg_exp
= REAL_VALUE_NEGATIVE (exp_init
);
1341 REAL_VALUE_TYPE exp
= real_value_abs (&exp_init
);
1343 /* The whole and fractional parts of exp. */
1344 REAL_VALUE_TYPE whole_part
;
1345 REAL_VALUE_TYPE frac_part
;
1347 real_floor (&whole_part
, mode
, &exp
);
1348 REAL_ARITHMETIC (frac_part
, MINUS_EXPR
, exp
, whole_part
);
1351 REAL_VALUE_TYPE ceil_whole
= dconst0
;
1352 REAL_VALUE_TYPE ceil_fract
= dconst0
;
1356 real_ceil (&ceil_whole
, mode
, &exp
);
1357 REAL_ARITHMETIC (ceil_fract
, MINUS_EXPR
, ceil_whole
, exp
);
1360 if (!representable_as_half_series_p (frac_part
, max_depth
, &synth_info
))
1363 /* Check whether it's more profitable to not use 1.0 / ... */
1366 struct pow_synth_sqrt_info alt_synth_info
;
1367 alt_synth_info
.factors
= XALLOCAVEC (bool, max_depth
+ 1);
1368 alt_synth_info
.deepest
= 0;
1369 alt_synth_info
.num_mults
= 0;
1371 if (representable_as_half_series_p (ceil_fract
, max_depth
,
1373 && alt_synth_info
.deepest
<= synth_info
.deepest
1374 && alt_synth_info
.num_mults
< synth_info
.num_mults
)
1376 whole_part
= ceil_whole
;
1377 frac_part
= ceil_fract
;
1378 synth_info
.deepest
= alt_synth_info
.deepest
;
1379 synth_info
.num_mults
= alt_synth_info
.num_mults
;
1380 memcpy (synth_info
.factors
, alt_synth_info
.factors
,
1381 (max_depth
+ 1) * sizeof (bool));
1386 HOST_WIDE_INT n
= real_to_integer (&whole_part
);
1387 REAL_VALUE_TYPE cint
;
1388 real_from_integer (&cint
, VOIDmode
, n
, SIGNED
);
1390 if (!real_identical (&whole_part
, &cint
))
1393 if (powi_cost (n
) + synth_info
.num_mults
> POWI_MAX_MULTS
)
1396 memset (cache
, 0, (max_depth
+ 1) * sizeof (tree
));
1398 tree integer_res
= n
== 0 ? build_real (type
, dconst1
) : arg0
;
1400 /* Calculate the integer part of the exponent. */
1403 integer_res
= gimple_expand_builtin_powi (gsi
, loc
, arg0
, n
);
1412 real_to_decimal (string
, &exp_init
, sizeof (string
), 0, 1);
1413 fprintf (dump_file
, "synthesizing pow (x, %s) as:\n", string
);
1419 fprintf (dump_file
, "1.0 / (");
1420 dump_integer_part (dump_file
, "x", n
);
1422 fprintf (dump_file
, " * ");
1423 dump_fractional_sqrt_sequence (dump_file
, "x", &synth_info
);
1424 fprintf (dump_file
, ")");
1428 dump_fractional_sqrt_sequence (dump_file
, "x", &synth_info
);
1429 fprintf (dump_file
, " / (");
1430 dump_integer_part (dump_file
, "x", n
);
1431 fprintf (dump_file
, ")");
1436 dump_fractional_sqrt_sequence (dump_file
, "x", &synth_info
);
1438 fprintf (dump_file
, " * ");
1439 dump_integer_part (dump_file
, "x", n
);
1442 fprintf (dump_file
, "\ndeepest sqrt chain: %d\n", synth_info
.deepest
);
1446 tree fract_res
= NULL_TREE
;
1449 /* Calculate the fractional part of the exponent. */
1450 for (unsigned i
= 0; i
< synth_info
.deepest
; i
++)
1452 if (synth_info
.factors
[i
])
1454 tree sqrt_chain
= get_fn_chain (arg0
, i
+ 1, gsi
, sqrtfn
, loc
, cache
);
1457 fract_res
= sqrt_chain
;
1460 fract_res
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1461 fract_res
, sqrt_chain
);
1465 tree res
= NULL_TREE
;
1472 res
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1473 fract_res
, integer_res
);
1477 res
= build_and_insert_binop (gsi
, loc
, "powrootrecip", RDIV_EXPR
,
1478 build_real (type
, dconst1
), res
);
1482 res
= build_and_insert_binop (gsi
, loc
, "powroot", RDIV_EXPR
,
1483 fract_res
, integer_res
);
1487 res
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1488 fract_res
, integer_res
);
1492 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1493 with location info LOC. If possible, create an equivalent and
1494 less expensive sequence of statements prior to GSI, and return an
expression holding the result.  */
1498 gimple_expand_builtin_pow (gimple_stmt_iterator
*gsi
, location_t loc
,
1499 tree arg0
, tree arg1
)
1501 REAL_VALUE_TYPE c
, cint
, dconst1_3
, dconst1_4
, dconst1_6
;
1502 REAL_VALUE_TYPE c2
, dconst3
;
1504 tree type
, sqrtfn
, cbrtfn
, sqrt_arg0
, result
, cbrt_x
, powi_cbrt_x
;
1506 bool speed_p
= optimize_bb_for_speed_p (gsi_bb (*gsi
));
1507 bool hw_sqrt_exists
, c_is_int
, c2_is_int
;
1509 dconst1_4
= dconst1
;
1510 SET_REAL_EXP (&dconst1_4
, REAL_EXP (&dconst1_4
) - 2);
/* If the exponent isn't a constant, there's nothing of interest
   to be done.  */
1514 if (TREE_CODE (arg1
) != REAL_CST
)
1517 /* If the exponent is equivalent to an integer, expand to an optimal
1518 multiplication sequence when profitable. */
1519 c
= TREE_REAL_CST (arg1
);
1520 n
= real_to_integer (&c
);
1521 real_from_integer (&cint
, VOIDmode
, n
, SIGNED
);
1522 c_is_int
= real_identical (&c
, &cint
);
1525 && ((n
>= -1 && n
<= 2)
1526 || (flag_unsafe_math_optimizations
1528 && powi_cost (n
) <= POWI_MAX_MULTS
)))
1529 return gimple_expand_builtin_powi (gsi
, loc
, arg0
, n
);
1531 /* Attempt various optimizations using sqrt and cbrt. */
1532 type
= TREE_TYPE (arg0
);
1533 mode
= TYPE_MODE (type
);
1534 sqrtfn
= mathfn_built_in (type
, BUILT_IN_SQRT
);
1536 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
   sqrt(-0) = -0.  */
1540 && REAL_VALUES_EQUAL (c
, dconsthalf
)
1541 && !HONOR_SIGNED_ZEROS (mode
))
1542 return build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1544 hw_sqrt_exists
= optab_handler (sqrt_optab
, mode
) != CODE_FOR_nothing
;
1546 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1547 optimizations since 1./3. is not exactly representable. If x
1548 is negative and finite, the correct value of pow(x,1./3.) is
1549 a NaN with the "invalid" exception raised, because the value
1550 of 1./3. actually has an even denominator. The correct value
1551 of cbrt(x) is a negative real value. */
1552 cbrtfn
= mathfn_built_in (type
, BUILT_IN_CBRT
);
1553 dconst1_3
= real_value_truncate (mode
, dconst_third ());
1555 if (flag_unsafe_math_optimizations
1557 && (gimple_val_nonnegative_real_p (arg0
) || !HONOR_NANS (mode
))
1558 && REAL_VALUES_EQUAL (c
, dconst1_3
))
1559 return build_and_insert_call (gsi
, loc
, cbrtfn
, arg0
);
1561 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1562 if we don't have a hardware sqrt insn. */
1563 dconst1_6
= dconst1_3
;
1564 SET_REAL_EXP (&dconst1_6
, REAL_EXP (&dconst1_6
) - 1);
1566 if (flag_unsafe_math_optimizations
1569 && (gimple_val_nonnegative_real_p (arg0
) || !HONOR_NANS (mode
))
1572 && REAL_VALUES_EQUAL (c
, dconst1_6
))
1575 sqrt_arg0
= build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1578 return build_and_insert_call (gsi
, loc
, cbrtfn
, sqrt_arg0
);
1582 /* Attempt to expand the POW as a product of square root chains.
Expand the 0.25 case even when optimising for size.  */
1584 if (flag_unsafe_math_optimizations
1587 && (speed_p
|| REAL_VALUES_EQUAL (c
, dconst1_4
))
1588 && !HONOR_SIGNED_ZEROS (mode
))
1590 unsigned int max_depth
= speed_p
1591 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH
)
1594 tree expand_with_sqrts
1595 = expand_pow_as_sqrts (gsi
, loc
, arg0
, arg1
, max_depth
);
1597 if (expand_with_sqrts
)
1598 return expand_with_sqrts
;
1601 real_arithmetic (&c2
, MULT_EXPR
, &c
, &dconst2
);
1602 n
= real_to_integer (&c2
);
1603 real_from_integer (&cint
, VOIDmode
, n
, SIGNED
);
1604 c2_is_int
= real_identical (&c2
, &cint
);
1606 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1608 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1609 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1611 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1612 different from pow(x, 1./3.) due to rounding and behavior with
1613 negative x, we need to constrain this transformation to unsafe
1614 math and positive x or finite math. */
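/* E.g. for an exponent equal to 5./3. (when the round-trip check below
   succeeds) we get n = 5 and the expansion
   powi (x, 1) * powi (cbrt (x), 2), i.e. x * cbrt (x) * cbrt (x);
   for a negative exponent the same product is built and then
   reciprocated.  */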
1615 real_from_integer (&dconst3
, VOIDmode
, 3, SIGNED
);
1616 real_arithmetic (&c2
, MULT_EXPR
, &c
, &dconst3
);
1617 real_round (&c2
, mode
, &c2
);
1618 n
= real_to_integer (&c2
);
1619 real_from_integer (&cint
, VOIDmode
, n
, SIGNED
);
1620 real_arithmetic (&c2
, RDIV_EXPR
, &cint
, &dconst3
);
1621 real_convert (&c2
, mode
, &c2
);
1623 if (flag_unsafe_math_optimizations
1625 && (gimple_val_nonnegative_real_p (arg0
) || !HONOR_NANS (mode
))
1626 && real_identical (&c2
, &c
)
1628 && optimize_function_for_speed_p (cfun
)
1629 && powi_cost (n
/ 3) <= POWI_MAX_MULTS
)
1631 tree powi_x_ndiv3
= NULL_TREE
;
1633 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1634 possible or profitable, give up. Skip the degenerate case when
1635 abs(n) < 3, where the result is always 1. */
1636 if (absu_hwi (n
) >= 3)
1638 powi_x_ndiv3
= gimple_expand_builtin_powi (gsi
, loc
, arg0
,
1644 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1645 as that creates an unnecessary variable. Instead, just produce
1646 either cbrt(x) or cbrt(x) * cbrt(x). */
1647 cbrt_x
= build_and_insert_call (gsi
, loc
, cbrtfn
, arg0
);
1649 if (absu_hwi (n
) % 3 == 1)
1650 powi_cbrt_x
= cbrt_x
;
1652 powi_cbrt_x
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1655 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1656 if (absu_hwi (n
) < 3)
1657 result
= powi_cbrt_x
;
1659 result
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1660 powi_x_ndiv3
, powi_cbrt_x
);
1662 /* If n is negative, reciprocate the result. */
1664 result
= build_and_insert_binop (gsi
, loc
, "powroot", RDIV_EXPR
,
1665 build_real (type
, dconst1
), result
);
1670 /* No optimizations succeeded. */
1674 /* ARG is the argument to a cabs builtin call in GSI with location info
1675 LOC. Create a sequence of statements prior to GSI that calculates
1676 sqrt(R*R + I*I), where R and I are the real and imaginary components
1677 of ARG, respectively. Return an expression holding the result. */
1680 gimple_expand_builtin_cabs (gimple_stmt_iterator
*gsi
, location_t loc
, tree arg
)
1682 tree real_part
, imag_part
, addend1
, addend2
, sum
, result
;
1683 tree type
= TREE_TYPE (TREE_TYPE (arg
));
1684 tree sqrtfn
= mathfn_built_in (type
, BUILT_IN_SQRT
);
1685 machine_mode mode
= TYPE_MODE (type
);
1687 if (!flag_unsafe_math_optimizations
1688 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi
)))
1690 || optab_handler (sqrt_optab
, mode
) == CODE_FOR_nothing
)
1693 real_part
= build_and_insert_ref (gsi
, loc
, type
, "cabs",
1694 REALPART_EXPR
, arg
);
1695 addend1
= build_and_insert_binop (gsi
, loc
, "cabs", MULT_EXPR
,
1696 real_part
, real_part
);
1697 imag_part
= build_and_insert_ref (gsi
, loc
, type
, "cabs",
1698 IMAGPART_EXPR
, arg
);
1699 addend2
= build_and_insert_binop (gsi
, loc
, "cabs", MULT_EXPR
,
1700 imag_part
, imag_part
);
1701 sum
= build_and_insert_binop (gsi
, loc
, "cabs", PLUS_EXPR
, addend1
, addend2
);
1702 result
= build_and_insert_call (gsi
, loc
, sqrtfn
, sum
);
1707 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1708 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1709 an optimal number of multiplies, when n is a constant. */
1713 const pass_data pass_data_cse_sincos
=
1715 GIMPLE_PASS
, /* type */
1716 "sincos", /* name */
1717 OPTGROUP_NONE
, /* optinfo_flags */
1718 TV_NONE
, /* tv_id */
1719 PROP_ssa
, /* properties_required */
1720 0, /* properties_provided */
1721 0, /* properties_destroyed */
1722 0, /* todo_flags_start */
1723 TODO_update_ssa
, /* todo_flags_finish */
1726 class pass_cse_sincos
: public gimple_opt_pass
1729 pass_cse_sincos (gcc::context
*ctxt
)
1730 : gimple_opt_pass (pass_data_cse_sincos
, ctxt
)
1733 /* opt_pass methods: */
1734 virtual bool gate (function
*)
1736 /* We no longer require either sincos or cexp, since powi expansion
1737 piggybacks on this pass. */
1741 virtual unsigned int execute (function
*);
1743 }; // class pass_cse_sincos
1746 pass_cse_sincos::execute (function
*fun
)
1749 bool cfg_changed
= false;
1751 calculate_dominance_info (CDI_DOMINATORS
);
1752 memset (&sincos_stats
, 0, sizeof (sincos_stats
));
1754 FOR_EACH_BB_FN (bb
, fun
)
1756 gimple_stmt_iterator gsi
;
1757 bool cleanup_eh
= false;
1759 for (gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
1761 gimple stmt
= gsi_stmt (gsi
);
1764 /* Only the last stmt in a bb could throw, no need to call
1765 gimple_purge_dead_eh_edges if we change something in the middle
1766 of a basic block. */
1769 if (is_gimple_call (stmt
)
1770 && gimple_call_lhs (stmt
)
1771 && (fndecl
= gimple_call_fndecl (stmt
))
1772 && DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_NORMAL
)
1774 tree arg
, arg0
, arg1
, result
;
1778 switch (DECL_FUNCTION_CODE (fndecl
))
1780 CASE_FLT_FN (BUILT_IN_COS
):
1781 CASE_FLT_FN (BUILT_IN_SIN
):
1782 CASE_FLT_FN (BUILT_IN_CEXPI
):
1783 /* Make sure we have either sincos or cexp. */
1784 if (!targetm
.libc_has_function (function_c99_math_complex
)
1785 && !targetm
.libc_has_function (function_sincos
))
1788 arg
= gimple_call_arg (stmt
, 0);
1789 if (TREE_CODE (arg
) == SSA_NAME
)
1790 cfg_changed
|= execute_cse_sincos_1 (arg
);
1793 CASE_FLT_FN (BUILT_IN_POW
):
1794 arg0
= gimple_call_arg (stmt
, 0);
1795 arg1
= gimple_call_arg (stmt
, 1);
1797 loc
= gimple_location (stmt
);
1798 result
= gimple_expand_builtin_pow (&gsi
, loc
, arg0
, arg1
);
1802 tree lhs
= gimple_get_lhs (stmt
);
1803 gassign
*new_stmt
= gimple_build_assign (lhs
, result
);
1804 gimple_set_location (new_stmt
, loc
);
1805 unlink_stmt_vdef (stmt
);
1806 gsi_replace (&gsi
, new_stmt
, true);
1808 if (gimple_vdef (stmt
))
1809 release_ssa_name (gimple_vdef (stmt
));
1813 CASE_FLT_FN (BUILT_IN_POWI
):
1814 arg0
= gimple_call_arg (stmt
, 0);
1815 arg1
= gimple_call_arg (stmt
, 1);
1816 loc
= gimple_location (stmt
);
1818 if (real_minus_onep (arg0
))
1820 tree t0
, t1
, cond
, one
, minus_one
;
1823 t0
= TREE_TYPE (arg0
);
1824 t1
= TREE_TYPE (arg1
);
1825 one
= build_real (t0
, dconst1
);
1826 minus_one
= build_real (t0
, dconstm1
);
1828 cond
= make_temp_ssa_name (t1
, NULL
, "powi_cond");
1829 stmt
= gimple_build_assign (cond
, BIT_AND_EXPR
,
1830 arg1
, build_int_cst (t1
, 1));
1831 gimple_set_location (stmt
, loc
);
1832 gsi_insert_before (&gsi
, stmt
, GSI_SAME_STMT
);
1834 result
= make_temp_ssa_name (t0
, NULL
, "powi");
1835 stmt
= gimple_build_assign (result
, COND_EXPR
, cond
,
1837 gimple_set_location (stmt
, loc
);
1838 gsi_insert_before (&gsi
, stmt
, GSI_SAME_STMT
);
1842 if (!tree_fits_shwi_p (arg1
))
1845 n
= tree_to_shwi (arg1
);
1846 result
= gimple_expand_builtin_powi (&gsi
, loc
, arg0
, n
);
1851 tree lhs
= gimple_get_lhs (stmt
);
1852 gassign
*new_stmt
= gimple_build_assign (lhs
, result
);
1853 gimple_set_location (new_stmt
, loc
);
1854 unlink_stmt_vdef (stmt
);
1855 gsi_replace (&gsi
, new_stmt
, true);
1857 if (gimple_vdef (stmt
))
1858 release_ssa_name (gimple_vdef (stmt
));
1862 CASE_FLT_FN (BUILT_IN_CABS
):
1863 arg0
= gimple_call_arg (stmt
, 0);
1864 loc
= gimple_location (stmt
);
1865 result
= gimple_expand_builtin_cabs (&gsi
, loc
, arg0
);
1869 tree lhs
= gimple_get_lhs (stmt
);
1870 gassign
*new_stmt
= gimple_build_assign (lhs
, result
);
1871 gimple_set_location (new_stmt
, loc
);
1872 unlink_stmt_vdef (stmt
);
1873 gsi_replace (&gsi
, new_stmt
, true);
1875 if (gimple_vdef (stmt
))
1876 release_ssa_name (gimple_vdef (stmt
));
1885 cfg_changed
|= gimple_purge_dead_eh_edges (bb
);
1888 statistics_counter_event (fun
, "sincos statements inserted",
1889 sincos_stats
.inserted
);
1891 free_dominance_info (CDI_DOMINATORS
);
1892 return cfg_changed
? TODO_cleanup_cfg
: 0;
1898 make_pass_cse_sincos (gcc::context
*ctxt
)
1900 return new pass_cse_sincos (ctxt
);
1903 /* A symbolic number is used to detect byte permutation and selection
1904 patterns. Therefore the field N contains an artificial number
1905 consisting of octet sized markers:
1907 0 - target byte has the value 0
1908 FF - target byte has an unknown value (eg. due to sign extension)
1909 1..size - marker value is the target byte index minus one.
To detect permutations on memory sources (arrays and structures), a symbolic
   number is also associated with a base address (the array or structure the
   load is made from), an offset from that base address, and a range giving
   the difference between the highest and lowest accessed memory locations
   that make up the symbolic number.  The range is thus different from the
   size, which reflects the size of the type of the current expression.  Note
   that for a non-memory source, range holds the same value as size.
1919 For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1920 a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1921 still have a size of 2 but this time a range of 1. */
1923 struct symbolic_number
{
1928 HOST_WIDE_INT bytepos
;
1931 unsigned HOST_WIDE_INT range
;
1934 #define BITS_PER_MARKER 8
1935 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1936 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1937 #define HEAD_MARKER(n, size) \
1938 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1940 /* The number which the find_bswap_or_nop_1 result should match in
1941 order to have a nop. The number is masked according to the size of
1942 the symbolic number before using it. */
1943 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1944 (uint64_t)0x08070605 << 32 | 0x04030201)
1946 /* The number which the find_bswap_or_nop_1 result should match in
1947 order to have a byte swap. The number is masked according to the
1948 size of the symbolic number before using it. */
1949 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1950 (uint64_t)0x01020304 << 32 | 0x05060708)
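/* Worked example: for a 32-bit unsigned X the initial symbolic number is
   0x04030201 (marker 1 = least significant source byte).  A hand-written
   byte swap such as

	(x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24)

   combines to 0x01020304, the pattern CMPXCHG reduces to for a four-byte
   value, while code that merely reassembles the bytes in their original
   order combines to 0x04030201, the corresponding slice of CMPNOP.  */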
1952 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1953 number N. Return false if the requested operation is not permitted
1954 on a symbolic number. */
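/* E.g. with a four-byte unsigned value whose symbolic number is 0x04030201,
   a left shift by 8 yields 0x03020100 (the new low byte is known to be zero)
   and a right shift by 24 yields 0x00000004.  Shift counts that are not a
   multiple of BITS_PER_UNIT cannot be tracked and make the function
   return false.  */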
1957 do_shift_rotate (enum tree_code code
,
1958 struct symbolic_number
*n
,
1961 int i
, size
= TYPE_PRECISION (n
->type
) / BITS_PER_UNIT
;
1962 unsigned head_marker
;
1964 if (count
% BITS_PER_UNIT
!= 0)
1966 count
= (count
/ BITS_PER_UNIT
) * BITS_PER_MARKER
;
1968 /* Zero out the extra bits of N in order to avoid them being shifted
1969 into the significant bits. */
1970 if (size
< 64 / BITS_PER_MARKER
)
1971 n
->n
&= ((uint64_t) 1 << (size
* BITS_PER_MARKER
)) - 1;
1979 head_marker
= HEAD_MARKER (n
->n
, size
);
1981 /* Arithmetic shift of signed type: result is dependent on the value. */
1982 if (!TYPE_UNSIGNED (n
->type
) && head_marker
)
1983 for (i
= 0; i
< count
/ BITS_PER_MARKER
; i
++)
1984 n
->n
|= (uint64_t) MARKER_BYTE_UNKNOWN
1985 << ((size
- 1 - i
) * BITS_PER_MARKER
);
1988 n
->n
= (n
->n
<< count
) | (n
->n
>> ((size
* BITS_PER_MARKER
) - count
));
1991 n
->n
= (n
->n
>> count
) | (n
->n
<< ((size
* BITS_PER_MARKER
) - count
));
1996 /* Zero unused bits for size. */
1997 if (size
< 64 / BITS_PER_MARKER
)
1998 n
->n
&= ((uint64_t) 1 << (size
* BITS_PER_MARKER
)) - 1;
/* Perform sanity checking for the symbolic number N and the gimple
   statement STMT.  */
2006 verify_symbolic_number_p (struct symbolic_number
*n
, gimple stmt
)
2010 lhs_type
= gimple_expr_type (stmt
);
2012 if (TREE_CODE (lhs_type
) != INTEGER_TYPE
)
2015 if (TYPE_PRECISION (lhs_type
) != TYPE_PRECISION (n
->type
))
2021 /* Initialize the symbolic number N for the bswap pass from the base element
2022 SRC manipulated by the bitwise OR expression. */
2025 init_symbolic_number (struct symbolic_number
*n
, tree src
)
2029 n
->base_addr
= n
->offset
= n
->alias_set
= n
->vuse
= NULL_TREE
;
2031 /* Set up the symbolic number N by setting each byte to a value between 1 and
2032 the byte size of rhs1. The highest order byte is set to n->size and the
2033 lowest order byte to 1. */
2034 n
->type
= TREE_TYPE (src
);
2035 size
= TYPE_PRECISION (n
->type
);
2036 if (size
% BITS_PER_UNIT
!= 0)
2038 size
/= BITS_PER_UNIT
;
2039 if (size
> 64 / BITS_PER_MARKER
)
2044 if (size
< 64 / BITS_PER_MARKER
)
2045 n
->n
&= ((uint64_t) 1 << (size
* BITS_PER_MARKER
)) - 1;
/* Check if STMT might be a byte swap or a nop from a memory source and return
   the answer.  If so, REF is that memory source and the base of the memory
   area accessed and the offset of the access from that base are recorded
   in N.  */
2055 find_bswap_or_nop_load (gimple stmt
, tree ref
, struct symbolic_number
*n
)
2057 /* Leaf node is an array or component ref. Memorize its base and
2058 offset from base to compare to other such leaf node. */
2059 HOST_WIDE_INT bitsize
, bitpos
;
2061 int unsignedp
, volatilep
;
2062 tree offset
, base_addr
;
2064 /* Not prepared to handle PDP endian. */
2065 if (BYTES_BIG_ENDIAN
!= WORDS_BIG_ENDIAN
)
2068 if (!gimple_assign_load_p (stmt
) || gimple_has_volatile_ops (stmt
))
2071 base_addr
= get_inner_reference (ref
, &bitsize
, &bitpos
, &offset
, &mode
,
2072 &unsignedp
, &volatilep
, false);
2074 if (TREE_CODE (base_addr
) == MEM_REF
)
2076 offset_int bit_offset
= 0;
2077 tree off
= TREE_OPERAND (base_addr
, 1);
2079 if (!integer_zerop (off
))
2081 offset_int boff
, coff
= mem_ref_offset (base_addr
);
2082 boff
= wi::lshift (coff
, LOG2_BITS_PER_UNIT
);
2086 base_addr
= TREE_OPERAND (base_addr
, 0);
2088 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2089 if (wi::neg_p (bit_offset
))
2091 offset_int mask
= wi::mask
<offset_int
> (LOG2_BITS_PER_UNIT
, false);
2092 offset_int tem
= bit_offset
.and_not (mask
);
2093 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2094 Subtract it to BIT_OFFSET and add it (scaled) to OFFSET. */
2096 tem
= wi::arshift (tem
, LOG2_BITS_PER_UNIT
);
2098 offset
= size_binop (PLUS_EXPR
, offset
,
2099 wide_int_to_tree (sizetype
, tem
));
2101 offset
= wide_int_to_tree (sizetype
, tem
);
2104 bitpos
+= bit_offset
.to_shwi ();
2107 if (bitpos
% BITS_PER_UNIT
)
2109 if (bitsize
% BITS_PER_UNIT
)
2112 if (!init_symbolic_number (n
, ref
))
2114 n
->base_addr
= base_addr
;
2116 n
->bytepos
= bitpos
/ BITS_PER_UNIT
;
2117 n
->alias_set
= reference_alias_ptr_type (ref
);
2118 n
->vuse
= gimple_vuse (stmt
);
/* Compute the symbolic number N representing the result of a bitwise OR on
   the two symbolic numbers N1 and N2 whose source statements are respectively
   SOURCE_STMT1 and SOURCE_STMT2.  */

static gimple
perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
                        gimple source_stmt2, struct symbolic_number *n2,
                        struct symbolic_number *n)
{
  int i, size;
  uint64_t mask;
  gimple source_stmt;
  struct symbolic_number *n_start;

  /* Sources are different, cancel bswap if they are not memory location with
     the same base (array, structure, ...).  */
  if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
    {
      uint64_t inc;
      HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
      struct symbolic_number *toinc_n_ptr, *n_end;

      if (!n1->base_addr || !n2->base_addr
          || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
        return NULL;

      if (!n1->offset != !n2->offset
          || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
        return NULL;

      if (n1->bytepos < n2->bytepos)
        {
          n_start = n1;
          start_sub = n2->bytepos - n1->bytepos;
          source_stmt = source_stmt1;
        }
      else
        {
          n_start = n2;
          start_sub = n1->bytepos - n2->bytepos;
          source_stmt = source_stmt2;
        }

      /* Find the highest address at which a load is performed and
         compute related info.  */
      end1 = n1->bytepos + (n1->range - 1);
      end2 = n2->bytepos + (n2->range - 1);
      if (end1 < end2)
        {
          end = end2;
          end_sub = end2 - end1;
        }
      else
        {
          end = end1;
          end_sub = end1 - end2;
        }
      n_end = (end2 > end1) ? n2 : n1;

      /* Find symbolic number whose lsb is the most significant.  */
      if (BYTES_BIG_ENDIAN)
        toinc_n_ptr = (n_end == n1) ? n2 : n1;
      else
        toinc_n_ptr = (n_start == n1) ? n2 : n1;

      n->range = end - n_start->bytepos + 1;

      /* Check that the range of memory covered can be represented by
         a symbolic number.  */
      if (n->range > 64 / BITS_PER_MARKER)
        return NULL;

      /* Reinterpret byte marks in symbolic number holding the value of
         bigger weight according to target endianness.  */
      inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
      size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
      for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
        {
          uint64_t marker
            = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
          if (marker && marker != MARKER_BYTE_UNKNOWN)
            toinc_n_ptr->n += inc;
        }
    }
  else
    {
      n->range = n1->range;
      n_start = n1;
      source_stmt = source_stmt1;
    }

  if (!n1->alias_set
      || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
    n->alias_set = n1->alias_set;
  else
    n->alias_set = ptr_type_node;
  n->vuse = n_start->vuse;
  n->base_addr = n_start->base_addr;
  n->offset = n_start->offset;
  n->bytepos = n_start->bytepos;
  n->type = n_start->type;
  size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;

  for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
    {
      uint64_t masked1, masked2;

      masked1 = n1->n & mask;
      masked2 = n2->n & mask;
      if (masked1 && masked2 && masked1 != masked2)
        return NULL;
    }
  n->n = n1->n | n2->n;

  return source_stmt;
}

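/* Illustrative example (little-endian target assumed): for

     uint16_t *p;
     uint32_t x = (uint32_t) p[0] | ((uint32_t) p[1] << 16);

   the operand of the OR loading p[0] has markers 0x00000201 at bytepos 0
   and the operand loading p[1] has markers 0x02010000 at bytepos 2.  The
   merge re-weights the markers of the higher load to 0x04030000 and ORs
   the two, giving 0x04030201 with range 4, i.e. a plain 32-bit load.  */
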
/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
   the operation given by the rhs of STMT on the result.  If the operation
   can be executed successfully, the function returns a gimple stmt whose
   rhs's first tree is the expression of the source operand, and NULL
   otherwise.  */

static gimple
find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
{
  enum tree_code code;
  tree rhs1, rhs2 = NULL;
  gimple rhs1_stmt, rhs2_stmt, source_stmt1;
  enum gimple_rhs_class rhs_class;

  if (!limit || !is_gimple_assign (stmt))
    return NULL;

  rhs1 = gimple_assign_rhs1 (stmt);

  if (find_bswap_or_nop_load (stmt, rhs1, n))
    return stmt;

  if (TREE_CODE (rhs1) != SSA_NAME)
    return NULL;

  code = gimple_assign_rhs_code (stmt);
  rhs_class = gimple_assign_rhs_class (stmt);
  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);

  if (rhs_class == GIMPLE_BINARY_RHS)
    rhs2 = gimple_assign_rhs2 (stmt);

  /* Handle unary rhs and binary rhs with integer constants as second
     operand.  */

  if (rhs_class == GIMPLE_UNARY_RHS
      || (rhs_class == GIMPLE_BINARY_RHS
          && TREE_CODE (rhs2) == INTEGER_CST))
    {
      if (code != BIT_AND_EXPR
          && code != LSHIFT_EXPR
          && code != RSHIFT_EXPR
          && code != LROTATE_EXPR
          && code != RROTATE_EXPR
          && !CONVERT_EXPR_CODE_P (code))
        return NULL;

      source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);

      /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
         we have to initialize the symbolic number.  */
      if (!source_stmt1)
        {
          if (gimple_assign_load_p (stmt)
              || !init_symbolic_number (n, rhs1))
            return NULL;
          source_stmt1 = stmt;
        }

      switch (code)
        {
        case BIT_AND_EXPR:
          {
            int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
            uint64_t val = int_cst_value (rhs2), mask = 0;
            uint64_t tmp = (1 << BITS_PER_UNIT) - 1;

            /* Only constants masking full bytes are allowed.  */
            for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
              if ((val & tmp) != 0 && (val & tmp) != tmp)
                return NULL;
              else if (val & tmp)
                mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);

            n->n &= mask;
          }
          break;
        case LSHIFT_EXPR:
        case RSHIFT_EXPR:
        case LROTATE_EXPR:
        case RROTATE_EXPR:
          if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
            return NULL;
          break;
        CASE_CONVERT:
          {
            int i, type_size, old_type_size;
            tree type;

            type = gimple_expr_type (stmt);
            type_size = TYPE_PRECISION (type);
            if (type_size % BITS_PER_UNIT != 0)
              return NULL;
            type_size /= BITS_PER_UNIT;
            if (type_size > 64 / BITS_PER_MARKER)
              return NULL;

            /* Sign extension: result is dependent on the value.  */
            old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
            if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
                && HEAD_MARKER (n->n, old_type_size))
              for (i = 0; i < type_size - old_type_size; i++)
                n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
                        << ((type_size - 1 - i) * BITS_PER_MARKER);

            if (type_size < 64 / BITS_PER_MARKER)
              {
                /* If STMT casts to a smaller type mask out the bits not
                   belonging to the target type.  */
                n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
              }
            n->type = type;
            if (!n->base_addr)
              n->range = type_size;
          }
          break;
        default:
          return NULL;
        };
      return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
    }

  /* Handle binary rhs.  */

  if (rhs_class == GIMPLE_BINARY_RHS)
    {
      struct symbolic_number n1, n2;
      gimple source_stmt, source_stmt2;

      if (code != BIT_IOR_EXPR)
        return NULL;

      if (TREE_CODE (rhs2) != SSA_NAME)
        return NULL;

      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);

      switch (code)
        {
        case BIT_IOR_EXPR:
          source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);

          if (!source_stmt1)
            return NULL;

          source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);

          if (!source_stmt2)
            return NULL;

          if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
            return NULL;

          if (!n1.vuse != !n2.vuse
              || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
            return NULL;

          source_stmt
            = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);

          if (!source_stmt)
            return NULL;

          if (!verify_symbolic_number_p (n, stmt))
            return NULL;

          break;
        default:
          return NULL;
        }
      return source_stmt;
    }
  return NULL;
}

/* Check if STMT completes a bswap implementation or a read in a given
   endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
   accordingly.  It also sets N to represent the kind of operations
   performed: size of the resulting expression and whether it works on
   a memory source, and if so alias-set and vuse.  At last, the
   function returns a stmt whose rhs's first tree is the source
   expression.  */

static gimple
find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
{
  /* The number which the find_bswap_or_nop_1 result should match in order
     to have a full byte swap.  The number is shifted to the right
     according to the size of the symbolic number before using it.  */
  uint64_t cmpxchg = CMPXCHG;
  uint64_t cmpnop = CMPNOP;

  gimple source_stmt;
  int limit;

  /* The last parameter determines the depth search limit.  It usually
     correlates directly to the number n of bytes to be touched.  We
     increase that number by log2(n) + 1 here in order to also
     cover signed -> unsigned conversions of the src operand as can be seen
     in libgcc, and for initial shift/and operation of the src operand.  */
  limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
  limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
  source_stmt = find_bswap_or_nop_1 (stmt, n, limit);

  if (!source_stmt)
    return NULL;

  /* Find real size of result (highest non-zero byte).  */
  if (n->base_addr)
    {
      int rsize;
      uint64_t tmpn;

      for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
      n->range = rsize;
    }

  /* Zero out the extra bits of N and CMP*.  */
  if (n->range < (int) sizeof (int64_t))
    {
      uint64_t mask;

      mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
      cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
      cmpnop &= mask;
    }

  /* A complete byte swap should make the symbolic number to start with
     the largest digit in the highest order byte.  Unchanged symbolic
     number indicates a read with same endianness as target architecture.  */
  if (n->n == cmpnop)
    *bswap = false;
  else if (n->n == cmpxchg)
    *bswap = true;
  else
    return NULL;

  /* Useless bit manipulation performed by code.  */
  if (!n->base_addr && n->n == cmpnop)
    return NULL;

  n->range *= BITS_PER_UNIT;
  return source_stmt;
}

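/* Illustrative example (not part of the code): for a 32-bit unsigned X, the
   classic open-coded swap

     (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24)

   leaves the symbolic number equal to CMPXCHG masked to four markers
   (0x01020304), so *BSWAP is set to true and n->range becomes 32; a
   sequence that reproduces CMPNOP (0x04030201) is reported as a nop or,
   for a memory source, as an endian-native load.  */
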
const pass_data pass_data_optimize_bswap =
{
  GIMPLE_PASS, /* type */
  "bswap", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_optimize_bswap : public gimple_opt_pass
{
public:
  pass_optimize_bswap (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_expensive_optimizations && optimize;
    }

  virtual unsigned int execute (function *);

}; // class pass_optimize_bswap

/* Perform the bswap optimization: replace the expression computed in the rhs
   of CUR_STMT by an equivalent bswap, load or load + bswap expression.
   Which of these alternatives replaces the rhs is given by N->base_addr (non
   null if a load is needed) and BSWAP.  The type, VUSE and alias-set of the
   load to perform are also given in N while the builtin bswap invocation is
   given in FNDECL.  Finally, if a load is involved, SRC_STMT refers to one of
   the load statements involved to construct the rhs in CUR_STMT and N->range
   gives the size of the rhs expression for maintaining some statistics.

   Note that if the replacement involves a load, CUR_STMT is moved just after
   SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
   changing basic block.  */

static bool
bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
               tree load_type, struct symbolic_number *n, bool bswap)
{
  gimple_stmt_iterator gsi;
  tree src, tmp, tgt;
  gimple bswap_stmt;

  gsi = gsi_for_stmt (cur_stmt);
  src = gimple_assign_rhs1 (src_stmt);
  tgt = gimple_assign_lhs (cur_stmt);

  /* Need to load the value from memory first.  */
  if (n->base_addr)
    {
      gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
      tree addr_expr, addr_tmp, val_expr, val_tmp;
      tree load_offset_ptr, aligned_load_type;
      gimple addr_stmt, load_stmt;
      unsigned align;
      HOST_WIDE_INT load_offset = 0;

      align = get_object_alignment (src);
      /* If the new access is smaller than the original one, we need
         to perform big endian adjustment.  */
      if (BYTES_BIG_ENDIAN)
        {
          HOST_WIDE_INT bitsize, bitpos;
          machine_mode mode;
          int unsignedp, volatilep;
          tree offset;

          get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
                               &unsignedp, &volatilep, false);
          if (n->range < (unsigned HOST_WIDE_INT) bitsize)
            {
              load_offset = (bitsize - n->range) / BITS_PER_UNIT;
              unsigned HOST_WIDE_INT l
                = (load_offset * BITS_PER_UNIT) & (align - 1);
              if (l)
                align = l & -l;
            }
        }

      if (bswap
          && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
          && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
        return false;

      /* Move cur_stmt just before one of the loads of the original
         to ensure it has the same VUSE.  See PR61517 for what could
         go wrong.  */
      gsi_move_before (&gsi, &gsi_ins);
      gsi = gsi_for_stmt (cur_stmt);

      /* Compute address to load from and cast according to the size
         of the load.  */
      addr_expr = build_fold_addr_expr (unshare_expr (src));
      if (is_gimple_mem_ref_addr (addr_expr))
        addr_tmp = addr_expr;
      else
        {
          addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
                                         "load_src");
          addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
          gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
        }

      /* Perform the load.  */
      aligned_load_type = load_type;
      if (align < TYPE_ALIGN (load_type))
        aligned_load_type = build_aligned_type (load_type, align);
      load_offset_ptr = build_int_cst (n->alias_set, load_offset);
      val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
                              load_offset_ptr);

      if (!bswap)
        {
          if (n->range == 16)
            nop_stats.found_16bit++;
          else if (n->range == 32)
            nop_stats.found_32bit++;
          else
            {
              gcc_assert (n->range == 64);
              nop_stats.found_64bit++;
            }

          /* Convert the result of load if necessary.  */
          if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
            {
              val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
                                            "load_dst");
              load_stmt = gimple_build_assign (val_tmp, val_expr);
              gimple_set_vuse (load_stmt, n->vuse);
              gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
              gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
            }
          else
            {
              gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
              gimple_set_vuse (cur_stmt, n->vuse);
            }
          update_stmt (cur_stmt);

          if (dump_file)
            {
              fprintf (dump_file,
                       "%d bit load in target endianness found at: ",
                       (int) n->range);
              print_gimple_stmt (dump_file, cur_stmt, 0, 0);
            }
          return true;
        }
      else
        {
          val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
          load_stmt = gimple_build_assign (val_tmp, val_expr);
          gimple_set_vuse (load_stmt, n->vuse);
          gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
          src = val_tmp;
        }
    }

  if (n->range == 16)
    bswap_stats.found_16bit++;
  else if (n->range == 32)
    bswap_stats.found_32bit++;
  else
    {
      gcc_assert (n->range == 64);
      bswap_stats.found_64bit++;
    }

  tmp = src;

  /* Convert the src expression if necessary.  */
  if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
    {
      gimple convert_stmt;

      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
      convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
      gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
    }

  /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
     are considered as rotation of 2N bit values by N bits is generally not
     equivalent to a bswap.  Consider for instance 0x01020304 r>> 16 which
     gives 0x03040102 while a bswap for that value is 0x04030201.  */
  if (bswap && n->range == 16)
    {
      tree count = build_int_cst (NULL, BITS_PER_UNIT);
      src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
      bswap_stmt = gimple_build_assign (NULL, src);
    }
  else
    bswap_stmt = gimple_build_call (fndecl, 1, tmp);

  tmp = tgt;

  /* Convert the result if necessary.  */
  if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
    {
      gimple convert_stmt;

      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
      convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
    }

  gimple_set_lhs (bswap_stmt, tmp);

  if (dump_file)
    {
      fprintf (dump_file, "%d bit bswap implementation found at: ",
               (int) n->range);
      print_gimple_stmt (dump_file, cur_stmt, 0, 0);
    }

  gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
  gsi_remove (&gsi, true);
  return true;
}

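/* For example (illustrative GIMPLE, temporary names hypothetical): a 32-bit
   manual byte swap of a memory source ends up as

     load_dst_1 = MEM[(uint32_t *)&src];
     result_2 = __builtin_bswap32 (load_dst_1);

   while the same pattern read in target endianness keeps only the load.
   16-bit swaps are instead emitted as a rotate by 8 bits, the canonical
   form mentioned above.  */
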
/* Find manual byte swap implementations as well as loads in a given
   endianness.  Byte swaps are turned into a bswap builtin invocation
   while endian loads are converted to a bswap builtin invocation or
   a simple load according to the target endianness.  */

unsigned int
pass_optimize_bswap::execute (function *fun)
{
  basic_block bb;
  bool bswap32_p, bswap64_p;
  bool changed = false;
  tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;

  if (BITS_PER_UNIT != 8)
    return 0;

  bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
               && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
  bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
               && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
                   || (bswap32_p && word_mode == SImode)));

  /* Determine the argument type of the builtins.  The code later on
     assumes that the return and argument type are the same.  */
  if (bswap32_p)
    {
      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
      bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
    }

  if (bswap64_p)
    {
      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
      bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
    }

  memset (&nop_stats, 0, sizeof (nop_stats));
  memset (&bswap_stats, 0, sizeof (bswap_stats));

  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;

      /* We do a reverse scan for bswap patterns to make sure we get the
         widest match.  As bswap pattern matching doesn't handle previously
         inserted smaller bswap replacements as sub-patterns, the wider
         variant wouldn't be detected.  */
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
        {
          gimple src_stmt, cur_stmt = gsi_stmt (gsi);
          tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
          enum tree_code code;
          struct symbolic_number n;
          bool bswap;

          /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
             might be moved to a different basic block by bswap_replace and gsi
             must not point to it if that's the case.  Moving the gsi_prev
             there makes sure that gsi points to the statement previous to
             cur_stmt while still making sure that all statements are
             considered in this basic block.  */
          gsi_prev (&gsi);

          if (!is_gimple_assign (cur_stmt))
            continue;

          code = gimple_assign_rhs_code (cur_stmt);
          switch (code)
            {
            case LROTATE_EXPR:
            case RROTATE_EXPR:
              if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
                  || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
                     % BITS_PER_UNIT)
                continue;
              /* Fall through.  */
            case BIT_IOR_EXPR:
              break;
            default:
              continue;
            }

          src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);

          if (!src_stmt)
            continue;

          switch (n.range)
            {
            case 16:
              /* Already in canonical form, nothing to do.  */
              if (code == LROTATE_EXPR || code == RROTATE_EXPR)
                continue;
              load_type = bswap_type = uint16_type_node;
              break;
            case 32:
              load_type = uint32_type_node;
              if (bswap32_p)
                {
                  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
                  bswap_type = bswap32_type;
                }
              break;
            case 64:
              load_type = uint64_type_node;
              if (bswap64_p)
                {
                  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
                  bswap_type = bswap64_type;
                }
              break;
            default:
              continue;
            }

          if (bswap && !fndecl && n.range != 16)
            continue;

          if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
                             &n, bswap))
            changed = true;
        }
    }

  statistics_counter_event (fun, "16-bit nop implementations found",
                            nop_stats.found_16bit);
  statistics_counter_event (fun, "32-bit nop implementations found",
                            nop_stats.found_32bit);
  statistics_counter_event (fun, "64-bit nop implementations found",
                            nop_stats.found_64bit);
  statistics_counter_event (fun, "16-bit bswap implementations found",
                            bswap_stats.found_16bit);
  statistics_counter_event (fun, "32-bit bswap implementations found",
                            bswap_stats.found_32bit);
  statistics_counter_event (fun, "64-bit bswap implementations found",
                            bswap_stats.found_64bit);

  return (changed ? TODO_update_ssa : 0);
}

gimple_opt_pass *
make_pass_optimize_bswap (gcc::context *ctxt)
{
  return new pass_optimize_bswap (ctxt);
}

/* Return true if stmt is a type conversion operation that can be stripped
   when used in a widening multiply operation.  */

static bool
widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
{
  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);

  if (TREE_CODE (result_type) == INTEGER_TYPE)
    {
      tree op_type;
      tree inner_op_type;

      if (!CONVERT_EXPR_CODE_P (rhs_code))
        return false;

      op_type = TREE_TYPE (gimple_assign_lhs (stmt));

      /* If the type of OP has the same precision as the result, then
         we can strip this conversion.  The multiply operation will be
         selected to create the correct extension as a by-product.  */
      if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
        return true;

      /* We can also strip a conversion if it preserves the signed-ness of
         the operation and doesn't narrow the range.  */
      inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      /* If the inner-most type is unsigned, then we can strip any
         intermediate widening operation.  If it's signed, then the
         intermediate widening operation must also be signed.  */
      if ((TYPE_UNSIGNED (inner_op_type)
           || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
          && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
        return true;

      return false;
    }

  return rhs_code == FIXED_CONVERT_EXPR;
}

/* Return true if RHS is a suitable operand for a widening multiplication,
   assuming a target type of TYPE.
   There are two cases:

     - RHS makes some value at least twice as wide.  Store that value
       in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.

     - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
       but leave *TYPE_OUT untouched.  */

static bool
is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
                        tree *new_rhs_out)
{
  gimple stmt;
  tree type1, rhs1;

  if (TREE_CODE (rhs) == SSA_NAME)
    {
      stmt = SSA_NAME_DEF_STMT (rhs);
      if (is_gimple_assign (stmt))
        {
          if (! widening_mult_conversion_strippable_p (type, stmt))
            rhs1 = rhs;
          else
            {
              rhs1 = gimple_assign_rhs1 (stmt);

              if (TREE_CODE (rhs1) == INTEGER_CST)
                {
                  *new_rhs_out = rhs1;
                  *type_out = NULL;
                  return true;
                }
            }
        }
      else
        rhs1 = rhs;

      type1 = TREE_TYPE (rhs1);

      if (TREE_CODE (type1) != TREE_CODE (type)
          || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
        return false;

      *new_rhs_out = rhs1;
      *type_out = type1;
      return true;
    }

  if (TREE_CODE (rhs) == INTEGER_CST)
    {
      *new_rhs_out = rhs;
      *type_out = NULL;
      return true;
    }

  return false;
}

/* Return true if STMT performs a widening multiplication, assuming the
   output type is TYPE.  If so, store the unwidened types of the operands
   in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
   *RHS2_OUT such that converting those operands to types *TYPE1_OUT
   and *TYPE2_OUT would give the operands of the multiplication.  */

static bool
is_widening_mult_p (gimple stmt,
                    tree *type1_out, tree *rhs1_out,
                    tree *type2_out, tree *rhs2_out)
{
  tree type = TREE_TYPE (gimple_assign_lhs (stmt));

  if (TREE_CODE (type) != INTEGER_TYPE
      && TREE_CODE (type) != FIXED_POINT_TYPE)
    return false;

  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
                               rhs1_out))
    return false;

  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
                               rhs2_out))
    return false;

  if (*type1_out == NULL)
    {
      if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
        return false;
      *type1_out = *type2_out;
    }

  if (*type2_out == NULL)
    {
      if (!int_fits_type_p (*rhs2_out, *type1_out))
        return false;
      *type2_out = *type1_out;
    }

  /* Ensure that the larger of the two operands comes first.  */
  if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
    {
      std::swap (*type1_out, *type2_out);
      std::swap (*rhs1_out, *rhs2_out);
    }

  return true;
}

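/* For instance (illustrative only): with 32-bit ints A and B,

     long long r = (long long) a * (long long) b;

   is a widening multiplication: both conversions are strippable, so
   *TYPE1_OUT and *TYPE2_OUT are set to int and *RHS1_OUT/*RHS2_OUT to A
   and B.  A multiplication by an INTEGER_CST also qualifies provided the
   constant fits the other operand's unwidened type.  */
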
/* Process a single gimple statement STMT, which has a MULT_EXPR as
   its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
   value is true iff we converted the statement.  */

static bool
convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
{
  tree lhs, rhs1, rhs2, type, type1, type2;
  enum insn_code handler;
  machine_mode to_mode, from_mode, actual_mode;
  optab op;
  int actual_precision;
  location_t loc = gimple_location (stmt);
  bool from_unsigned1, from_unsigned2;

  lhs = gimple_assign_lhs (stmt);
  type = TREE_TYPE (lhs);
  if (TREE_CODE (type) != INTEGER_TYPE)
    return false;

  if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
    return false;

  to_mode = TYPE_MODE (type);
  from_mode = TYPE_MODE (type1);
  from_unsigned1 = TYPE_UNSIGNED (type1);
  from_unsigned2 = TYPE_UNSIGNED (type2);

  if (from_unsigned1 && from_unsigned2)
    op = umul_widen_optab;
  else if (!from_unsigned1 && !from_unsigned2)
    op = smul_widen_optab;
  else
    op = usmul_widen_optab;

  handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
                                                  0, &actual_mode);

  if (handler == CODE_FOR_nothing)
    {
      if (op != smul_widen_optab)
        {
          /* We can use a signed multiply with unsigned types as long as
             there is a wider mode to use, or it is the smaller of the two
             types that is unsigned.  Note that type1 >= type2, always.  */
          if ((TYPE_UNSIGNED (type1)
               && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
              || (TYPE_UNSIGNED (type2)
                  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
            {
              from_mode = GET_MODE_WIDER_MODE (from_mode);
              if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
                return false;
            }

          op = smul_widen_optab;
          handler = find_widening_optab_handler_and_mode (op, to_mode,
                                                          from_mode, 0,
                                                          &actual_mode);

          if (handler == CODE_FOR_nothing)
            return false;

          from_unsigned1 = from_unsigned2 = false;
        }
      else
        return false;
    }

  /* Ensure that the inputs to the handler are in the correct precision
     for the opcode.  This will be the full mode size.  */
  actual_precision = GET_MODE_PRECISION (actual_mode);
  if (2 * actual_precision > TYPE_PRECISION (type))
    return false;
  if (actual_precision != TYPE_PRECISION (type1)
      || from_unsigned1 != TYPE_UNSIGNED (type1))
    rhs1 = build_and_insert_cast (gsi, loc,
                                  build_nonstandard_integer_type
                                    (actual_precision, from_unsigned1), rhs1);
  if (actual_precision != TYPE_PRECISION (type2)
      || from_unsigned2 != TYPE_UNSIGNED (type2))
    rhs2 = build_and_insert_cast (gsi, loc,
                                  build_nonstandard_integer_type
                                    (actual_precision, from_unsigned2), rhs2);

  /* Handle constants.  */
  if (TREE_CODE (rhs1) == INTEGER_CST)
    rhs1 = fold_convert (type1, rhs1);
  if (TREE_CODE (rhs2) == INTEGER_CST)
    rhs2 = fold_convert (type2, rhs2);

  gimple_assign_set_rhs1 (stmt, rhs1);
  gimple_assign_set_rhs2 (stmt, rhs2);
  gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
  update_stmt (stmt);
  widen_mul_stats.widen_mults_inserted++;
  return true;
}

/* Process a single gimple statement STMT, which is found at the
   iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
   rhs (given by CODE), and try to convert it into a
   WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
   is true iff we converted the statement.  */

static bool
convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
                            enum tree_code code)
{
  gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
  gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
  tree type, type1, type2, optype;
  tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
  enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
  optab this_optab;
  enum tree_code wmult_code;
  enum insn_code handler;
  machine_mode to_mode, from_mode, actual_mode;
  location_t loc = gimple_location (stmt);
  int actual_precision;
  bool from_unsigned1, from_unsigned2;

  lhs = gimple_assign_lhs (stmt);
  type = TREE_TYPE (lhs);
  if (TREE_CODE (type) != INTEGER_TYPE
      && TREE_CODE (type) != FIXED_POINT_TYPE)
    return false;

  if (code == MINUS_EXPR)
    wmult_code = WIDEN_MULT_MINUS_EXPR;
  else
    wmult_code = WIDEN_MULT_PLUS_EXPR;

  rhs1 = gimple_assign_rhs1 (stmt);
  rhs2 = gimple_assign_rhs2 (stmt);

  if (TREE_CODE (rhs1) == SSA_NAME)
    {
      rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
      if (is_gimple_assign (rhs1_stmt))
        rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
    }

  if (TREE_CODE (rhs2) == SSA_NAME)
    {
      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
      if (is_gimple_assign (rhs2_stmt))
        rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
    }

  /* Allow for one conversion statement between the multiply
     and addition/subtraction statement.  If there are more than
     one conversions then we assume they would invalidate this
     transformation.  If that's not the case then they should have
     been folded before now.  */
  if (CONVERT_EXPR_CODE_P (rhs1_code))
    {
      conv1_stmt = rhs1_stmt;
      rhs1 = gimple_assign_rhs1 (rhs1_stmt);
      if (TREE_CODE (rhs1) == SSA_NAME)
        {
          rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
          if (is_gimple_assign (rhs1_stmt))
            rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
        }
      else
        return false;
    }
  if (CONVERT_EXPR_CODE_P (rhs2_code))
    {
      conv2_stmt = rhs2_stmt;
      rhs2 = gimple_assign_rhs1 (rhs2_stmt);
      if (TREE_CODE (rhs2) == SSA_NAME)
        {
          rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
          if (is_gimple_assign (rhs2_stmt))
            rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
        }
      else
        return false;
    }

  /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
     is_widening_mult_p, but we still need the rhs returns.

     It might also appear that it would be sufficient to use the existing
     operands of the widening multiply, but that would limit the choice of
     multiply-and-accumulate instructions.

     If the widened-multiplication result has more than one use, it is
     probably wiser not to do the conversion.  */
  if (code == PLUS_EXPR
      && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
    {
      if (!has_single_use (rhs1)
          || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;
      add_rhs = rhs2;
      conv_stmt = conv1_stmt;
    }
  else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
    {
      if (!has_single_use (rhs2)
          || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;
      add_rhs = rhs1;
      conv_stmt = conv2_stmt;
    }
  else
    return false;

  to_mode = TYPE_MODE (type);
  from_mode = TYPE_MODE (type1);
  from_unsigned1 = TYPE_UNSIGNED (type1);
  from_unsigned2 = TYPE_UNSIGNED (type2);
  optype = type1;

  /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
  if (from_unsigned1 != from_unsigned2)
    {
      if (!INTEGRAL_TYPE_P (type))
        return false;
      /* We can use a signed multiply with unsigned types as long as
         there is a wider mode to use, or it is the smaller of the two
         types that is unsigned.  Note that type1 >= type2, always.  */
      if ((from_unsigned1
           && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
          || (from_unsigned2
              && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
        {
          from_mode = GET_MODE_WIDER_MODE (from_mode);
          if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
            return false;
        }

      from_unsigned1 = from_unsigned2 = false;
      optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
                                               false);
    }

  /* If there was a conversion between the multiply and addition
     then we need to make sure it fits a multiply-and-accumulate.
     There should be a single mode change which does not change the
     value.  */
  if (conv_stmt)
    {
      /* We use the original, unmodified data types for this.  */
      tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
      tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
      int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
      bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);

      if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
        {
          /* Conversion is a truncate.  */
          if (TYPE_PRECISION (to_type) < data_size)
            return false;
        }
      else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
        {
          /* Conversion is an extend.  Check it's the right sort.  */
          if (TYPE_UNSIGNED (from_type) != is_unsigned
              && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
            return false;
        }
      /* else convert is a no-op for our purposes.  */
    }

  /* Verify that the machine can perform a widening multiply
     accumulate in this mode/signedness combination, otherwise
     this transformation is likely to pessimize code.  */
  this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
  handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
                                                  from_mode, 0, &actual_mode);

  if (handler == CODE_FOR_nothing)
    return false;

  /* Ensure that the inputs to the handler are in the correct precision
     for the opcode.  This will be the full mode size.  */
  actual_precision = GET_MODE_PRECISION (actual_mode);
  if (actual_precision != TYPE_PRECISION (type1)
      || from_unsigned1 != TYPE_UNSIGNED (type1))
    mult_rhs1 = build_and_insert_cast (gsi, loc,
                                       build_nonstandard_integer_type
                                         (actual_precision, from_unsigned1),
                                       mult_rhs1);
  if (actual_precision != TYPE_PRECISION (type2)
      || from_unsigned2 != TYPE_UNSIGNED (type2))
    mult_rhs2 = build_and_insert_cast (gsi, loc,
                                       build_nonstandard_integer_type
                                         (actual_precision, from_unsigned2),
                                       mult_rhs2);

  if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
    add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);

  /* Handle constants.  */
  if (TREE_CODE (mult_rhs1) == INTEGER_CST)
    mult_rhs1 = fold_convert (type1, mult_rhs1);
  if (TREE_CODE (mult_rhs2) == INTEGER_CST)
    mult_rhs2 = fold_convert (type2, mult_rhs2);

  gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
                                  add_rhs);
  update_stmt (gsi_stmt (*gsi));
  widen_mul_stats.maccs_inserted++;
  return true;
}

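/* For example (illustrative only): with 32-bit ints A and B accumulated
   into a 64-bit SUM,

     sum = sum + (long long) a * (long long) b;

   becomes WIDEN_MULT_PLUS_EXPR <a, b, sum> when the target provides a
   widening multiply-accumulate pattern for that mode and signedness.  */
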
/* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
   with uses in additions and subtractions to form fused multiply-add
   operations.  Returns true if successful and MUL_STMT should be removed.  */

static bool
convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
{
  tree mul_result = gimple_get_lhs (mul_stmt);
  tree type = TREE_TYPE (mul_result);
  gimple use_stmt, neguse_stmt;
  gimple fma_stmt;
  use_operand_p use_p;
  imm_use_iterator imm_iter;

  if (FLOAT_TYPE_P (type)
      && flag_fp_contract_mode == FP_CONTRACT_OFF)
    return false;

  /* We don't want to do bitfield reduction ops.  */
  if (INTEGRAL_TYPE_P (type)
      && (TYPE_PRECISION (type)
          != GET_MODE_PRECISION (TYPE_MODE (type))))
    return false;

  /* If the target doesn't support it, don't generate it.  We assume that
     if fma isn't available then fms, fnma or fnms are not either.  */
  if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
    return false;

  /* If the multiplication has zero uses, it is kept around probably because
     of -fnon-call-exceptions.  Don't optimize it away in that case,
     it is DCE's job.  */
  if (has_zero_uses (mul_result))
    return false;

  /* Make sure that the multiplication statement becomes dead after
     the transformation, thus that all uses are transformed to FMAs.
     This means we assume that an FMA operation has the same cost
     as an addition.  */
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
    {
      enum tree_code use_code;
      tree result = mul_result;
      bool negate_p = false;

      use_stmt = USE_STMT (use_p);

      if (is_gimple_debug (use_stmt))
        continue;

      /* For now restrict this operation to single basic blocks.  In theory
         we would want to support sinking the multiplication in
         m = a*b;
         if ()
           ma = m + c;
         else
           d = m;
         to form a fma in the then block and sink the multiplication to the
         else block.  */
      if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
        return false;

      if (!is_gimple_assign (use_stmt))
        return false;

      use_code = gimple_assign_rhs_code (use_stmt);

      /* A negate on the multiplication leads to FNMA.  */
      if (use_code == NEGATE_EXPR)
        {
          ssa_op_iter iter;
          use_operand_p usep;

          result = gimple_assign_lhs (use_stmt);

          /* Make sure the negate statement becomes dead with this
             single transformation.  */
          if (!single_imm_use (gimple_assign_lhs (use_stmt),
                               &use_p, &neguse_stmt))
            return false;

          /* Make sure the multiplication isn't also used on that stmt.  */
          FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
            if (USE_FROM_PTR (usep) == mul_result)
              return false;

          /* Re-validate.  */
          use_stmt = neguse_stmt;
          if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
            return false;
          if (!is_gimple_assign (use_stmt))
            return false;

          use_code = gimple_assign_rhs_code (use_stmt);
          negate_p = true;
        }

      switch (use_code)
        {
        case MINUS_EXPR:
          if (gimple_assign_rhs2 (use_stmt) == result)
            negate_p = !negate_p;
          break;
        case PLUS_EXPR:
          break;
        default:
          /* FMA can only be formed from PLUS and MINUS.  */
          return false;
        }

      /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
         by a MULT_EXPR that we'll visit later, we might be able to
         get a more profitable match with fnma.
         OTOH, if we don't, a negate / fma pair has likely lower latency
         than a mult / subtract pair.  */
      if (use_code == MINUS_EXPR && !negate_p
          && gimple_assign_rhs1 (use_stmt) == result
          && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
          && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
        {
          tree rhs2 = gimple_assign_rhs2 (use_stmt);

          if (TREE_CODE (rhs2) == SSA_NAME)
            {
              gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
              if (has_single_use (rhs2)
                  && is_gimple_assign (stmt2)
                  && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
                return false;
            }
        }

      /* We can't handle a * b + a * b.  */
      if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
        return false;

      /* While it is possible to validate whether or not the exact form
         that we've recognized is available in the backend, the assumption
         is that the transformation is never a loss.  For instance, suppose
         the target only has the plain FMA pattern available.  Consider
         a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
         is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
         still have 3 operations, but in the FMA form the two NEGs are
         independent and could be run in parallel.  */
    }

  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      enum tree_code use_code;
      tree addop, mulop1 = op1, result = mul_result;
      bool negate_p = false;

      if (is_gimple_debug (use_stmt))
        continue;

      use_code = gimple_assign_rhs_code (use_stmt);
      if (use_code == NEGATE_EXPR)
        {
          result = gimple_assign_lhs (use_stmt);
          single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
          gsi_remove (&gsi, true);
          release_defs (use_stmt);

          use_stmt = neguse_stmt;
          gsi = gsi_for_stmt (use_stmt);
          use_code = gimple_assign_rhs_code (use_stmt);
          negate_p = true;
        }

      if (gimple_assign_rhs1 (use_stmt) == result)
        {
          addop = gimple_assign_rhs2 (use_stmt);
          /* a * b - c -> a * b + (-c)  */
          if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
            addop = force_gimple_operand_gsi (&gsi,
                                              build1 (NEGATE_EXPR,
                                                      type, addop),
                                              true, NULL_TREE, true,
                                              GSI_SAME_STMT);
        }
      else
        {
          addop = gimple_assign_rhs1 (use_stmt);
          /* a - b * c -> (-b) * c + a */
          if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
            negate_p = !negate_p;
        }

      if (negate_p)
        mulop1 = force_gimple_operand_gsi (&gsi,
                                           build1 (NEGATE_EXPR,
                                                   type, mulop1),
                                           true, NULL_TREE, true,
                                           GSI_SAME_STMT);

      fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
                                      FMA_EXPR, mulop1, op2, addop);
      gsi_replace (&gsi, fma_stmt, true);
      widen_mul_stats.fmas_inserted++;
    }

  return true;
}

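/* For example (illustrative only): with -ffp-contract=fast and a target
   providing an fma pattern,

     tmp = a * b;
     d = tmp + c;

   is rewritten as d = FMA <a, b, c> and the multiplication becomes dead;
   a negate on the product or on the addend is folded by negating the
   corresponding FMA operand, which the expander can then match to the
   FNMA/FMS forms where available.  */
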
/* Find integer multiplications where the operands are extended from
   smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
   where appropriate.  */

const pass_data pass_data_optimize_widening_mul =
{
  GIMPLE_PASS, /* type */
  "widening_mul", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_optimize_widening_mul : public gimple_opt_pass
{
public:
  pass_optimize_widening_mul (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_expensive_optimizations && optimize;
    }

  virtual unsigned int execute (function *);

}; // class pass_optimize_widening_mul

unsigned int
pass_optimize_widening_mul::execute (function *fun)
{
  basic_block bb;
  bool cfg_changed = false;

  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));

  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
        {
          gimple stmt = gsi_stmt (gsi);
          enum tree_code code;

          if (is_gimple_assign (stmt))
            {
              code = gimple_assign_rhs_code (stmt);
              switch (code)
                {
                case MULT_EXPR:
                  if (!convert_mult_to_widen (stmt, &gsi)
                      && convert_mult_to_fma (stmt,
                                              gimple_assign_rhs1 (stmt),
                                              gimple_assign_rhs2 (stmt)))
                    {
                      gsi_remove (&gsi, true);
                      release_defs (stmt);
                      continue;
                    }
                  break;

                case PLUS_EXPR:
                case MINUS_EXPR:
                  convert_plusminus_to_widen (&gsi, stmt, code);
                  break;

                default:;
                }
            }
          else if (is_gimple_call (stmt)
                   && gimple_call_lhs (stmt))
            {
              tree fndecl = gimple_call_fndecl (stmt);
              if (fndecl
                  && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
                {
                  switch (DECL_FUNCTION_CODE (fndecl))
                    {
                    case BUILT_IN_POWF:
                    case BUILT_IN_POW:
                    case BUILT_IN_POWL:
                      if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
                          && REAL_VALUES_EQUAL
                               (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
                                dconst2)
                          && convert_mult_to_fma (stmt,
                                                  gimple_call_arg (stmt, 0),
                                                  gimple_call_arg (stmt, 0)))
                        {
                          unlink_stmt_vdef (stmt);
                          if (gsi_remove (&gsi, true)
                              && gimple_purge_dead_eh_edges (bb))
                            cfg_changed = true;
                          release_defs (stmt);
                          continue;
                        }
                      break;

                    default:;
                    }
                }
            }
          gsi_next (&gsi);
        }
    }

  statistics_counter_event (fun, "widening multiplications inserted",
                            widen_mul_stats.widen_mults_inserted);
  statistics_counter_event (fun, "widening maccs inserted",
                            widen_mul_stats.maccs_inserted);
  statistics_counter_event (fun, "fused multiply-adds inserted",
                            widen_mul_stats.fmas_inserted);

  return cfg_changed ? TODO_cleanup_cfg : 0;
}

gimple_opt_pass *
make_pass_optimize_widening_mul (gcc::context *ctxt)
{
  return new pass_optimize_widening_mul (ctxt);
}