/* Loop distribution.
   Copyright (C) 2006-2023 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |    A(I) = B(I) + C
   |    D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |   A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |   D(I) = A(I-1)*E
   |ENDDO

   Loop distribution is the dual of loop fusion.  It separates statements
   of a loop (or loop nest) into multiple loops (or loop nests) with the
   same loop header.  The major goal is to separate statements which may
   be vectorized from those that can't.  This pass implements distribution
   in the following steps:

     1) Seed partitions with specific types of statements.  For now we
        support two types of seed statements: statements defining a
        variable used outside of the loop, and statements storing to
        memory.
     2) Build a reduced dependence graph (RDG) for the loop to be
        distributed.  The vertices (RDG:V) model all statements in the
        loop and the edges (RDG:E) model flow and control dependences
        between statements.
     3) Apart from the RDG, compute data dependences between memory
        references.
     4) Starting from a seed statement, build up a partition by adding
        dependent statements according to the RDG's dependence
        information.  A partition is classified as parallel if it can be
        executed in parallel, or as sequential if it can't.  A parallel
        partition is further classified as a builtin kind if it can be
        implemented as builtin function calls.
     5) Build a partition dependence graph (PG) based on data dependences.
        The vertices (PG:V) model all partitions and the edges (PG:E)
        model all data dependences between every pair of partitions.  In
        general, a data dependence is either known or unknown at
        compilation time.  In C family languages there are quite a few
        compile-time-unknown dependences because of possible aliasing
        between data references.  We categorize PG's edges into two types:
        "true" edges, which represent compile-time-known data dependences,
        and "alias" edges for all other data dependences.
     6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
        Merge the partitions in each strongly connected component (SCC)
        accordingly.  Build a new PG for the merged partitions.
     7) Traverse PG again, this time with both "true" and "alias" edges
        included.  We try to break SCCs by removing some edges.  Because
        SCCs formed by "true" edges were all fused in step 6), we can
        break the remaining SCCs by removing "alias" edges only.  Choosing
        the optimal edge set is NP-hard; fortunately a simple approximation
        is good enough for us given the small problem scale.
     8) Collect all data dependences of the removed "alias" edges.  Create
        runtime alias checks for the collected data dependences.
     9) Version the loop under the condition of the runtime alias checks.
        Given that loop distribution generally introduces additional
        overhead, it is only useful if vectorization is achieved in a
        distributed loop.  We version the loop with the internal function
        call IFN_LOOP_DIST_ALIAS.  If no distributed loop can be
        vectorized, we simply remove the distributed loops and recover the
        original one.

   TODO:
     1) We only distribute innermost two-level loop nests now.  We should
        extend this to arbitrary loop nests in the future.
     2) We only fuse partitions in an SCC now.  A better fusion algorithm
        is desired to minimize loop overhead, maximize parallelism and
        maximize data reuse.  */
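/* As an illustrative sketch (not part of the original sources): with
   -ftree-loop-distribution and -ftree-loop-distribute-patterns the steps
   above can split a loop such as

   |  for (i = 0; i < n; i++)
   |    {
   |      a[i] = 0;
   |      b[i] = c[i] + d[i];
   |    }

   into a memset call for the first statement and a separately
   vectorizable loop for the second, versioned under a runtime alias
   check when the compiler cannot prove that A does not overlap B, C or
   D.  The exact code generated depends on the target and the options in
   effect.  */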
#include "coretypes.h"
#include "tree-pass.h"
#include "gimple-pretty-print.h"
#include "fold-const.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "tree-affine.h"
#include "memmodel.h"
#include "tree-ssa-loop-niter.h"
#define MAX_DATAREFS_NUM \
	((unsigned) param_loop_max_datarefs_for_datadeps)

/* Threshold controlling number of distributed partitions.  Given it may
   be unnecessary if a memory stream cost model is invented in the future,
   we define it as a temporary macro rather than a parameter.  */
#define NUM_PARTITION_THRESHOLD (4)
/* Hashtable helpers.  */

struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
{
  static inline hashval_t hash (const data_dependence_relation *);
  static inline bool equal (const data_dependence_relation *,
			    const data_dependence_relation *);
};

/* Hash function for data dependence.  */

inline hashval_t
ddr_hasher::hash (const data_dependence_relation *ddr)
{
  inchash::hash h;
  h.add_ptr (DDR_A (ddr));
  h.add_ptr (DDR_B (ddr));
  return h.end ();
}

/* Hash table equality function for data dependence.  */

inline bool
ddr_hasher::equal (const data_dependence_relation *ddr1,
		   const data_dependence_relation *ddr2)
{
  return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
}
#define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)

/* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
struct rdg_vertex
{
  /* The statement represented by this vertex.  */
  gimple *stmt;

  /* Vector of data-references in this statement.  */
  vec<data_reference_p> datarefs;

  /* True when the statement contains a write to memory.  */
  bool has_mem_write;

  /* True when the statement contains a read from memory.  */
  bool has_mem_reads;
};

#define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
#define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
#define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
/* Data dependence type.  */

enum rdg_dep_type
{
  /* Read After Write (RAW).  */
  flow_dd = 'f',

  /* Control dependence (execute conditional on).  */
  control_dd = 'c'
};

/* Dependence information attached to an edge of the RDG.  */

struct rdg_edge
{
  /* Type of the dependence.  */
  enum rdg_dep_type type;
};

#define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type
/* Kind of distributed loop.  */
enum partition_kind {
    PKIND_NORMAL,
    /* Partial memset stands for a partition that can be distributed into a
       loop of memset calls, rather than a single memset call.  It's handled
       just like a normal partition, i.e., distributed as a separate loop;
       no memset call is generated.

       Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
       loop nest as deep as possible.  As a result, parloop achieves better
       parallelization by parallelizing a deeper loop nest.  This hack should
       be unnecessary and removed once distributed memset can be understood
       and analyzed in data reference analysis.  See PR82604 for more.  */
    PKIND_PARTIAL_MEMSET,
    PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
};
/* Type of distributed loop.  */
enum partition_type {
    /* The distributed loop can be executed in parallel.  */
    PTYPE_PARALLEL = 0,
    /* The distributed loop has to be executed sequentially.  */
    PTYPE_SEQUENTIAL
};
/* Builtin info for loop distribution.  */
struct builtin_info
{
  /* data-references a kind != PKIND_NORMAL partition is about.  */
  data_reference_p dst_dr;
  data_reference_p src_dr;
  /* Base address and size of memory objects operated by the builtin.  Note
     both dest and source memory objects must have the same size.  */
  tree dst_base;
  tree src_base;
  tree size;
  /* Base and offset part of dst_base after stripping constant offset.  This
     is only used in memset builtin distribution for now.  */
  tree dst_base_base;
  unsigned HOST_WIDE_INT dst_base_offset;
};
/* Partition for loop distribution.  */
struct partition
{
  /* Statements of the partition.  */
  bitmap stmts;
  /* True if the partition defines a variable which is used outside of the
     loop.  */
  bool reduction_p;
  location_t loc;
  enum partition_kind kind;
  enum partition_type type;
  /* Data references in the partition.  */
  bitmap datarefs;
  /* Information of builtin partition.  */
  struct builtin_info *builtin;
};
/* Partitions are fused because of different reasons.  */
enum fuse_type
{
  FUSE_NON_BUILTIN = 0,
  FUSE_REDUCTION = 1,
  FUSE_SHARE_REF = 2,
  FUSE_SAME_SCC = 3,
  FUSE_FINALIZE = 4
};

/* Description of the different fusing reasons.  */
static const char *fuse_message[] = {
  "they are non-builtins",
  "they have reductions",
  "they have shared memory refs",
  "they are in the same dependence scc",
  "there is no point to distribute loop"};
/* Dump vertex I in RDG to FILE.  */

static void
dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
{
  struct vertex *v = &(rdg->vertices[i]);
  struct graph_edge *e;

  fprintf (file, "(vertex %d: (%s%s) (in:", i,
	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      fprintf (file, " %d", e->src);

  fprintf (file, ") (out:");

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      fprintf (file, " %d", e->dest);

  fprintf (file, ")\n");
  print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
  fprintf (file, ")\n");
}
/* Call dump_rdg_vertex on stderr.  */

DEBUG_FUNCTION void
debug_rdg_vertex (struct graph *rdg, int i)
{
  dump_rdg_vertex (stderr, rdg, i);
}
/* Dump the reduced dependence graph RDG to FILE.  */

static void
dump_rdg (FILE *file, struct graph *rdg)
{
  fprintf (file, "(rdg\n");
  for (int i = 0; i < rdg->n_vertices; i++)
    dump_rdg_vertex (file, rdg, i);
  fprintf (file, ")\n");
}
/* Call dump_rdg on stderr.  */

DEBUG_FUNCTION void
debug_rdg (struct graph *rdg)
{
  dump_rdg (stderr, rdg);
}
static void
dot_rdg_1 (FILE *file, struct graph *rdg)
{
  int i;
  pretty_printer buffer;
  pp_needs_newline (&buffer) = false;
  buffer.buffer->stream = file;

  fprintf (file, "digraph RDG {\n");

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      fprintf (file, "%d [label=\"[%d] ", i, i);
      pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
      pp_write_text_to_stream (&buffer);
      fprintf (file, "\"]\n");

      /* Highlight reads from memory.  */
      if (RDG_MEM_READS_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=green]\n", i);

      /* Highlight stores to memory.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=red]\n", i);

      if (v->succ)
	for (e = v->succ; e; e = e->succ_next)
	  switch (RDGE_TYPE (e))
	    {
	    case flow_dd:
	      /* These are the most common dependences: don't print these.  */
	      fprintf (file, "%d -> %d \n", i, e->dest);
	      break;

	    case control_dd:
	      fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
	      break;

	    default:
	      gcc_unreachable ();
	    }
    }

  fprintf (file, "}\n\n");
}
/* Display the Reduced Dependence Graph using dotty.  */

DEBUG_FUNCTION void
dot_rdg (struct graph *rdg)
{
  /* When debugging, you may want to enable the following code.  */
#ifdef HAVE_POPEN
  FILE *file = popen ("dot -Tx11", "w");
  if (!file)
    return;
  dot_rdg_1 (file, rdg);
  fflush (file);
  close (fileno (file));
  pclose (file);
#else
  dot_rdg_1 (stderr, rdg);
#endif
}
/* Returns the index of STMT in RDG.  */

static int
rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
{
  int index = gimple_uid (stmt);
  gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
  return index;
}
/* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
   the index of DEF in RDG.  */

static void
create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
{
  use_operand_p imm_use_p;
  imm_use_iterator iterator;

  FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
    {
      struct graph_edge *e;
      int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
      if (use < 0)
	/* There can be uses outside the loop.  */
	continue;

      e = add_edge (rdg, idef, use);
      e->data = XNEW (struct rdg_edge);
      RDGE_TYPE (e) = flow_dd;
    }
}
/* Creates an edge for the control dependences of BB to the vertex V.  */

static void
create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
				    int v, control_dependences *cd)
{
  bitmap_iterator bi;
  unsigned edge_n;
  EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
			    0, edge_n, bi)
    {
      basic_block cond_bb = cd->get_edge_src (edge_n);
      gimple *stmt = *gsi_last_bb (cond_bb);
      if (stmt && is_ctrl_stmt (stmt))
	{
	  struct graph_edge *e;
	  int c = rdg_vertex_for_stmt (rdg, stmt);
	  if (c < 0)
	    continue;

	  e = add_edge (rdg, c, v);
	  e->data = XNEW (struct rdg_edge);
	  RDGE_TYPE (e) = control_dd;
	}
    }
}
/* Creates the edges of the reduced dependence graph RDG.  */

static void
create_rdg_flow_edges (struct graph *rdg)
{
  int i;
  def_operand_p def_p;
  ssa_op_iter iter;

  for (i = 0; i < rdg->n_vertices; i++)
    FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
			      iter, SSA_OP_DEF)
      create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
}
/* Creates the control dependence edges of the reduced dependence graph
   RDG.  */

static void
create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  edge_iterator ei;
	  edge e;
	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
	    if (flow_bb_inside_loop_p (loop, e->src))
	      create_edge_for_control_dependence (rdg, e->src, i, cd);
	}
      else
	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
    }
}
class loop_distribution
{
 private:
  /* The loop (nest) to be distributed.  */
  vec<loop_p> loop_nest;

  /* Vector of data references in the loop to be distributed.  */
  vec<data_reference_p> datarefs_vec;

  /* If there is a nonaddressable data reference in the above vector.  */
  bool has_nonaddressable_dataref_p;

  /* Store index of data reference in aux field.  */

  /* Hash table for data dependence relations in the loop to be
     distributed.  */
  hash_table<ddr_hasher> *ddrs_table;

  /* Array mapping basic block's index to its topological order.  */
  int *bb_top_order_index;
  /* And size of the array.  */
  int bb_top_order_index_size;

  /* Build the vertices of the reduced dependence graph RDG.  Return false
     if that failed.  */
  bool create_rdg_vertices (struct graph *rdg, const vec<gimple *> &stmts,
			    loop_p loop);

  /* Initialize STMTS with all the statements of LOOP.  We use topological
     order to discover all statements.  The order is important because
     generate_loops_for_partition is using the same traversal for identifying
     statements in loop copies.  */
  void stmts_from_loop (class loop *loop, vec<gimple *> *stmts);

  /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
     LOOP, and one edge per flow dependence or control dependence from control
     dependence CD.  During visiting each statement, data references are also
     collected and recorded in global data DATAREFS_VEC.  */
  struct graph * build_rdg (class loop *loop, control_dependences *cd);

  /* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
     graph and we update the type for the result partition if it is
     non-NULL.  */
  void partition_merge_into (struct graph *rdg,
			     partition *dest, partition *partition,
			     enum fuse_type ft);

  /* Return the data dependence relation for data references A and B.  The
     two data references must be in lexicographic order with respect to the
     reduced dependence graph RDG.  We first try to find the ddr in the
     global ddr hash table.  If it doesn't exist, compute the ddr and cache
     it.  */
  data_dependence_relation * get_data_dependence (struct graph *rdg,
						   data_reference_p a,
						   data_reference_p b);

  /* In reduced dependence graph RDG for loop distribution, return true if
     the dependence between references DR1 and DR2 leads to a dependence
     cycle and such dependence cycle can't be resolved by runtime alias
     checks.  */
  bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1,
			    data_reference_p dr2);

  /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
     PARTITION1's type after merging PARTITION2 into PARTITION1.  */
  void update_type_for_merge (struct graph *rdg,
			      partition *partition1, partition *partition2);

  /* Returns a partition with all the statements needed for computing
     the vertex V of the RDG, also including the loop exit conditions.  */
  partition *build_rdg_partition_for_vertex (struct graph *rdg, int v);

  /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
     classify if it forms a builtin memcpy or memmove call.  */
  void classify_builtin_ldst (loop_p loop, struct graph *rdg,
			      partition *partition,
			      data_reference_p dst_dr, data_reference_p src_dr);

  /* Classifies the builtin kind we can generate for PARTITION of RDG and
     LOOP.  For the moment we detect memset, memcpy and memmove patterns.
     Bitmap STMT_IN_ALL_PARTITIONS contains statements belonging to all
     partitions.  Returns true if there is a reduction in all partitions and
     we possibly did not mark PARTITION as having one for this reason.  */
  bool classify_partition (loop_p loop,
			   struct graph *rdg, partition *partition,
			   bitmap stmt_in_all_partitions);

  /* Returns true when PARTITION1 and PARTITION2 access the same memory
     object, in which case they have to be merged to preserve the order of
     their accesses.  */
  bool share_memory_accesses (struct graph *rdg,
			      partition *partition1, partition *partition2);

  /* For each seed statement in STARTING_STMTS, this function builds a
     partition for it by adding dependent statements according to RDG.
     All partitions are recorded in PARTITIONS.  */
  void rdg_build_partitions (struct graph *rdg,
			     vec<gimple *> starting_stmts,
			     vec<partition *> *partitions);

  /* Compute the partition dependence created by the data references in DRS1
     and DRS2, and modify and return DIR according to that.  If ALIAS_DDRS is
     not NULL, we record dependences introduced by possible aliasing between
     two data references in ALIAS_DDRS; otherwise, we simply ignore such
     dependences as if they didn't exist at all.  */
  int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1,
			       bitmap drs2, vec<ddr_p> *alias_ddrs);

  /* Build and return the partition dependence graph for PARTITIONS.  RDG is
     the reduced dependence graph for the loop to be distributed.  If
     IGNORE_ALIAS_P is true, data dependences caused by possible aliasing
     between references are ignored, as if they didn't exist at all;
     otherwise all dependences are considered.  */
  struct graph *build_partition_graph (struct graph *rdg,
				       vec<struct partition *> *partitions,
				       bool ignore_alias_p);

  /* Given reduced dependence graph RDG, merge strongly connected components
     of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependences caused by
     possible aliasing between references are ignored, as if they didn't
     exist at all; otherwise all dependences are considered.  */
  void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *>
				 *partitions, bool ignore_alias_p);

  /* This is the main function breaking strongly connected components in
     PARTITIONS given reduced dependence graph RDG.  Store data dependence
     relations for runtime alias checks in ALIAS_DDRS.  */
  void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *>
				   *partitions, vec<ddr_p> *alias_ddrs);

  /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
     ALIAS_DDRS contains ddrs which need runtime alias checks.  */
  void finalize_partitions (class loop *loop, vec<struct partition *>
			    *partitions, vec<ddr_p> *alias_ddrs);

  /* Distributes the code from LOOP in such a way that producer statements
     are placed before consumer statements.  Tries to separate only the
     statements from STMTS into separate loops.  Returns the number of
     distributed loops.  Set NB_CALLS to the number of generated builtin
     calls.  Set *DESTROY_P to whether LOOP needs to be destroyed.  */
  int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
		       control_dependences *cd, int *nb_calls, bool *destroy_p,
		       bool only_patterns_p);

  /* Transform loops which mimic the effects of the builtins rawmemchr or
     strlen and replace them accordingly.  */
  bool transform_reduction_loop (loop_p loop);

  /* Compute topological order for basic blocks.  Topological order is
     needed because data dependence is computed for data references in
     lexicographical order.  */
  void bb_top_order_init (void);

  void bb_top_order_destroy (void);

  public:

  /* Getter for bb_top_order.  */

  inline int get_bb_top_order_index_size (void)
    {
      return bb_top_order_index_size;
    }

  inline int get_bb_top_order_index (int i)
    {
      return bb_top_order_index[i];
    }

  unsigned int execute (function *fun);
};
/* If X has a smaller topological sort number than Y, returns -1;
   if greater, returns 1.  */
static int
bb_top_order_cmp_r (const void *x, const void *y, void *loop)
{
  loop_distribution *_loop =
    (loop_distribution *) loop;

  basic_block bb1 = *(const basic_block *) x;
  basic_block bb2 = *(const basic_block *) y;

  int bb_top_order_index_size = _loop->get_bb_top_order_index_size ();

  gcc_assert (bb1->index < bb_top_order_index_size
	      && bb2->index < bb_top_order_index_size);
  gcc_assert (bb1 == bb2
	      || _loop->get_bb_top_order_index(bb1->index)
		 != _loop->get_bb_top_order_index(bb2->index));

  return (_loop->get_bb_top_order_index(bb1->index) -
	  _loop->get_bb_top_order_index(bb2->index));
}
bool
loop_distribution::create_rdg_vertices (struct graph *rdg,
					const vec<gimple *> &stmts,
					loop_p loop)
{
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      struct vertex *v = &(rdg->vertices[i]);

      /* Record statement to vertex mapping.  */
      gimple_set_uid (stmt, i);

      v->data = XNEW (struct rdg_vertex);
      RDGV_STMT (v) = stmt;
      RDGV_DATAREFS (v).create (0);
      RDGV_HAS_MEM_WRITE (v) = false;
      RDGV_HAS_MEM_READS (v) = false;
      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      unsigned drp = datarefs_vec.length ();
      if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
	return false;
      for (unsigned j = drp; j < datarefs_vec.length (); ++j)
	{
	  data_reference_p dr = datarefs_vec[j];
	  if (DR_IS_READ (dr))
	    RDGV_HAS_MEM_READS (v) = true;
	  else
	    RDGV_HAS_MEM_WRITE (v) = true;
	  RDGV_DATAREFS (v).safe_push (dr);
	  has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
	}
    }
  return true;
}
void
loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts)
{
  unsigned int i;
  basic_block *bbs = get_loop_body_in_custom_order (loop, this,
						    bb_top_order_cmp_r);

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
	  stmts->safe_push (bsi.phi ());

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
	    stmts->safe_push (stmt);
	}
    }

  free (bbs);
}
/* Free the reduced dependence graph RDG.  */

static void
free_rdg (struct graph *rdg)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      for (e = v->succ; e; e = e->succ_next)
	free (e->data);

      if (v->data)
	{
	  gimple_set_uid (RDGV_STMT (v), -1);
	  (RDGV_DATAREFS (v)).release ();
	  free (v->data);
	}
    }

  free_graph (rdg);
}
struct graph *
loop_distribution::build_rdg (class loop *loop, control_dependences *cd)
{
  struct graph *rdg;

  /* Create the RDG vertices from the stmts of the loop nest.  */
  auto_vec<gimple *, 10> stmts;
  stmts_from_loop (loop, &stmts);
  rdg = new_graph (stmts.length ());
  if (!create_rdg_vertices (rdg, stmts, loop))
    {
      free_rdg (rdg);
      return NULL;
    }
  stmts.release ();

  create_rdg_flow_edges (rdg);
  if (cd)
    create_rdg_cd_edges (rdg, cd, loop);

  return rdg;
}
/* Allocate and initialize a partition from BITMAP.  */

static partition *
partition_alloc (void)
{
  partition *partition = XCNEW (struct partition);
  partition->stmts = BITMAP_ALLOC (NULL);
  partition->reduction_p = false;
  partition->loc = UNKNOWN_LOCATION;
  partition->kind = PKIND_NORMAL;
  partition->type = PTYPE_PARALLEL;
  partition->datarefs = BITMAP_ALLOC (NULL);
  return partition;
}
/* Free PARTITION.  */

static void
partition_free (partition *partition)
{
  BITMAP_FREE (partition->stmts);
  BITMAP_FREE (partition->datarefs);
  if (partition->builtin)
    free (partition->builtin);

  free (partition);
}
/* Returns true if the partition can be generated as a builtin.  */

static bool
partition_builtin_p (partition *partition)
{
  return partition->kind > PKIND_PARTIAL_MEMSET;
}

/* Returns true if the partition contains a reduction.  */

static bool
partition_reduction_p (partition *partition)
{
  return partition->reduction_p;
}
void
loop_distribution::partition_merge_into (struct graph *rdg,
					 partition *dest, partition *partition,
					 enum fuse_type ft)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
      fprintf (dump_file, "  Part 1: ");
      dump_bitmap (dump_file, dest->stmts);
      fprintf (dump_file, "  Part 2: ");
      dump_bitmap (dump_file, partition->stmts);
    }

  dest->kind = PKIND_NORMAL;
  if (dest->type == PTYPE_PARALLEL)
    dest->type = partition->type;

  bitmap_ior_into (dest->stmts, partition->stmts);
  if (partition_reduction_p (partition))
    dest->reduction_p = true;

  /* Further check if any data dependence prevents us from executing the
     new partition in parallel.  */
  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
    update_type_for_merge (rdg, dest, partition);

  bitmap_ior_into (dest->datarefs, partition->datarefs);
}
/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
   the LOOP.  */

static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    {
      if (is_gimple_debug (USE_STMT (use_p)))
	continue;

      basic_block use_bb = gimple_bb (USE_STMT (use_p));
      if (!flow_bb_inside_loop_p (loop, use_bb))
	return true;
    }

  return false;
}

/* Returns true when STMT defines a scalar variable used after the
   loop LOOP.  */

static bool
stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
{
  def_operand_p def_p;
  ssa_op_iter op_iter;

  if (gimple_code (stmt) == GIMPLE_PHI)
    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);

  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
      return true;

  return false;
}
943 /* Return a copy of LOOP placed before LOOP. */
946 copy_loop_before (class loop
*loop
, bool redirect_lc_phi_defs
)
949 edge preheader
= loop_preheader_edge (loop
);
951 initialize_original_copy_tables ();
952 res
= slpeel_tree_duplicate_loop_to_edge_cfg (loop
, NULL
, preheader
);
953 gcc_assert (res
!= NULL
);
955 /* When a not last partition is supposed to keep the LC PHIs computed
956 adjust their definitions. */
957 if (redirect_lc_phi_defs
)
959 edge exit
= single_exit (loop
);
960 for (gphi_iterator si
= gsi_start_phis (exit
->dest
); !gsi_end_p (si
);
963 gphi
*phi
= si
.phi ();
964 if (virtual_operand_p (gimple_phi_result (phi
)))
966 use_operand_p use_p
= PHI_ARG_DEF_PTR_FROM_EDGE (phi
, exit
);
967 tree new_def
= get_current_def (USE_FROM_PTR (use_p
));
968 SET_USE (use_p
, new_def
);
972 free_original_copy_tables ();
973 delete_update_ssa ();
978 /* Creates an empty basic block after LOOP. */
981 create_bb_after_loop (class loop
*loop
)
983 edge exit
= single_exit (loop
);
991 /* Generate code for PARTITION from the code in LOOP. The loop is
992 copied when COPY_P is true. All the statements not flagged in the
993 PARTITION bitmap are removed from the loop or from its copy. The
994 statements are indexed in sequence inside a basic block, and the
995 basic blocks of a loop are taken in dom order. */
998 generate_loops_for_partition (class loop
*loop
, partition
*partition
,
999 bool copy_p
, bool keep_lc_phis_p
)
1006 int orig_loop_num
= loop
->orig_loop_num
;
1007 loop
= copy_loop_before (loop
, keep_lc_phis_p
);
1008 gcc_assert (loop
!= NULL
);
1009 loop
->orig_loop_num
= orig_loop_num
;
1010 create_preheader (loop
, CP_SIMPLE_PREHEADERS
);
1011 create_bb_after_loop (loop
);
1015 /* Origin number is set to the new versioned loop's num. */
1016 gcc_assert (loop
->orig_loop_num
!= loop
->num
);
1019 /* Remove stmts not in the PARTITION bitmap. */
1020 bbs
= get_loop_body_in_dom_order (loop
);
1022 if (MAY_HAVE_DEBUG_BIND_STMTS
)
1023 for (i
= 0; i
< loop
->num_nodes
; i
++)
1025 basic_block bb
= bbs
[i
];
1027 for (gphi_iterator bsi
= gsi_start_phis (bb
); !gsi_end_p (bsi
);
1030 gphi
*phi
= bsi
.phi ();
1031 if (!virtual_operand_p (gimple_phi_result (phi
))
1032 && !bitmap_bit_p (partition
->stmts
, gimple_uid (phi
)))
1033 reset_debug_uses (phi
);
1036 for (gimple_stmt_iterator bsi
= gsi_start_bb (bb
); !gsi_end_p (bsi
); gsi_next (&bsi
))
1038 gimple
*stmt
= gsi_stmt (bsi
);
1039 if (gimple_code (stmt
) != GIMPLE_LABEL
1040 && !is_gimple_debug (stmt
)
1041 && !bitmap_bit_p (partition
->stmts
, gimple_uid (stmt
)))
1042 reset_debug_uses (stmt
);
1046 for (i
= 0; i
< loop
->num_nodes
; i
++)
1048 basic_block bb
= bbs
[i
];
1049 edge inner_exit
= NULL
;
1051 if (loop
!= bb
->loop_father
)
1052 inner_exit
= single_exit (bb
->loop_father
);
1054 for (gphi_iterator bsi
= gsi_start_phis (bb
); !gsi_end_p (bsi
);)
1056 gphi
*phi
= bsi
.phi ();
1057 if (!virtual_operand_p (gimple_phi_result (phi
))
1058 && !bitmap_bit_p (partition
->stmts
, gimple_uid (phi
)))
1059 remove_phi_node (&bsi
, true);
1064 for (gimple_stmt_iterator bsi
= gsi_start_bb (bb
); !gsi_end_p (bsi
);)
1066 gimple
*stmt
= gsi_stmt (bsi
);
1067 if (gimple_code (stmt
) != GIMPLE_LABEL
1068 && !is_gimple_debug (stmt
)
1069 && !bitmap_bit_p (partition
->stmts
, gimple_uid (stmt
)))
1071 /* In distribution of loop nest, if bb is inner loop's exit_bb,
1072 we choose its exit edge/path in order to avoid generating
1073 infinite loop. For all other cases, we choose an arbitrary
1074 path through the empty CFG part that this unnecessary
1075 control stmt controls. */
1076 if (gcond
*cond_stmt
= dyn_cast
<gcond
*> (stmt
))
1078 if (inner_exit
&& inner_exit
->flags
& EDGE_TRUE_VALUE
)
1079 gimple_cond_make_true (cond_stmt
);
1081 gimple_cond_make_false (cond_stmt
);
1084 else if (gimple_code (stmt
) == GIMPLE_SWITCH
)
1086 gswitch
*switch_stmt
= as_a
<gswitch
*> (stmt
);
1087 gimple_switch_set_index
1088 (switch_stmt
, CASE_LOW (gimple_switch_label (switch_stmt
, 1)));
1093 unlink_stmt_vdef (stmt
);
1094 gsi_remove (&bsi
, true);
1095 release_defs (stmt
);
1106 /* If VAL memory representation contains the same value in all bytes,
1107 return that value, otherwise return -1.
1108 E.g. for 0x24242424 return 0x24, for IEEE double
1109 747708026454360457216.0 return 0x44, etc. */
1112 const_with_all_bytes_same (tree val
)
1114 unsigned char buf
[64];
1117 if (integer_zerop (val
)
1118 || (TREE_CODE (val
) == CONSTRUCTOR
1119 && !TREE_CLOBBER_P (val
)
1120 && CONSTRUCTOR_NELTS (val
) == 0))
1123 if (real_zerop (val
))
1125 /* Only return 0 for +0.0, not for -0.0, which doesn't have
1126 an all bytes same memory representation. Don't transform
1127 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */
1128 switch (TREE_CODE (val
))
1131 if (!real_isneg (TREE_REAL_CST_PTR (val
)))
1135 if (!const_with_all_bytes_same (TREE_REALPART (val
))
1136 && !const_with_all_bytes_same (TREE_IMAGPART (val
)))
1141 unsigned int count
= vector_cst_encoded_nelts (val
);
1143 for (j
= 0; j
< count
; ++j
)
1144 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val
, j
)))
1155 if (CHAR_BIT
!= 8 || BITS_PER_UNIT
!= 8)
1158 len
= native_encode_expr (val
, buf
, sizeof (buf
));
1161 for (i
= 1; i
< len
; i
++)
1162 if (buf
[i
] != buf
[0])
1167 /* Generate a call to memset for PARTITION in LOOP. */
1170 generate_memset_builtin (class loop
*loop
, partition
*partition
)
1172 gimple_stmt_iterator gsi
;
1173 tree mem
, fn
, nb_bytes
;
1175 struct builtin_info
*builtin
= partition
->builtin
;
1178 /* The new statements will be placed before LOOP. */
1179 gsi
= gsi_last_bb (loop_preheader_edge (loop
)->src
);
1181 nb_bytes
= rewrite_to_non_trapping_overflow (builtin
->size
);
1182 nb_bytes
= force_gimple_operand_gsi (&gsi
, nb_bytes
, true, NULL_TREE
,
1183 false, GSI_CONTINUE_LINKING
);
1184 mem
= rewrite_to_non_trapping_overflow (builtin
->dst_base
);
1185 mem
= force_gimple_operand_gsi (&gsi
, mem
, true, NULL_TREE
,
1186 false, GSI_CONTINUE_LINKING
);
1188 /* This exactly matches the pattern recognition in classify_partition. */
1189 val
= gimple_assign_rhs1 (DR_STMT (builtin
->dst_dr
));
1190 /* Handle constants like 0x15151515 and similarly
1191 floating point constants etc. where all bytes are the same. */
1192 int bytev
= const_with_all_bytes_same (val
);
1194 val
= build_int_cst (integer_type_node
, bytev
);
1195 else if (TREE_CODE (val
) == INTEGER_CST
)
1196 val
= fold_convert (integer_type_node
, val
);
1197 else if (!useless_type_conversion_p (integer_type_node
, TREE_TYPE (val
)))
1199 tree tem
= make_ssa_name (integer_type_node
);
1200 gimple
*cstmt
= gimple_build_assign (tem
, NOP_EXPR
, val
);
1201 gsi_insert_after (&gsi
, cstmt
, GSI_CONTINUE_LINKING
);
1205 fn
= build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET
));
1206 fn_call
= gimple_build_call (fn
, 3, mem
, val
, nb_bytes
);
1207 gimple_set_location (fn_call
, partition
->loc
);
1208 gsi_insert_after (&gsi
, fn_call
, GSI_CONTINUE_LINKING
);
1211 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1213 fprintf (dump_file
, "generated memset");
1215 fprintf (dump_file
, " zero\n");
1217 fprintf (dump_file
, "\n");
1221 /* Generate a call to memcpy for PARTITION in LOOP. */
1224 generate_memcpy_builtin (class loop
*loop
, partition
*partition
)
1226 gimple_stmt_iterator gsi
;
1228 tree dest
, src
, fn
, nb_bytes
;
1229 enum built_in_function kind
;
1230 struct builtin_info
*builtin
= partition
->builtin
;
1232 /* The new statements will be placed before LOOP. */
1233 gsi
= gsi_last_bb (loop_preheader_edge (loop
)->src
);
1235 nb_bytes
= rewrite_to_non_trapping_overflow (builtin
->size
);
1236 nb_bytes
= force_gimple_operand_gsi (&gsi
, nb_bytes
, true, NULL_TREE
,
1237 false, GSI_CONTINUE_LINKING
);
1238 dest
= rewrite_to_non_trapping_overflow (builtin
->dst_base
);
1239 src
= rewrite_to_non_trapping_overflow (builtin
->src_base
);
1240 if (partition
->kind
== PKIND_MEMCPY
1241 || ! ptr_derefs_may_alias_p (dest
, src
))
1242 kind
= BUILT_IN_MEMCPY
;
1244 kind
= BUILT_IN_MEMMOVE
;
1245 /* Try harder if we're copying a constant size. */
1246 if (kind
== BUILT_IN_MEMMOVE
&& poly_int_tree_p (nb_bytes
))
1248 aff_tree asrc
, adest
;
1249 tree_to_aff_combination (src
, ptr_type_node
, &asrc
);
1250 tree_to_aff_combination (dest
, ptr_type_node
, &adest
);
1251 aff_combination_scale (&adest
, -1);
1252 aff_combination_add (&asrc
, &adest
);
1253 if (aff_comb_cannot_overlap_p (&asrc
, wi::to_poly_widest (nb_bytes
),
1254 wi::to_poly_widest (nb_bytes
)))
1255 kind
= BUILT_IN_MEMCPY
;
1258 dest
= force_gimple_operand_gsi (&gsi
, dest
, true, NULL_TREE
,
1259 false, GSI_CONTINUE_LINKING
);
1260 src
= force_gimple_operand_gsi (&gsi
, src
, true, NULL_TREE
,
1261 false, GSI_CONTINUE_LINKING
);
1262 fn
= build_fold_addr_expr (builtin_decl_implicit (kind
));
1263 fn_call
= gimple_build_call (fn
, 3, dest
, src
, nb_bytes
);
1264 gimple_set_location (fn_call
, partition
->loc
);
1265 gsi_insert_after (&gsi
, fn_call
, GSI_CONTINUE_LINKING
);
1268 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1270 if (kind
== BUILT_IN_MEMCPY
)
1271 fprintf (dump_file
, "generated memcpy\n");
1273 fprintf (dump_file
, "generated memmove\n");
1277 /* Remove and destroy the loop LOOP. */
1280 destroy_loop (class loop
*loop
)
1282 unsigned nbbs
= loop
->num_nodes
;
1283 edge exit
= single_exit (loop
);
1284 basic_block src
= loop_preheader_edge (loop
)->src
, dest
= exit
->dest
;
1288 bbs
= get_loop_body_in_dom_order (loop
);
1290 gimple_stmt_iterator dst_gsi
= gsi_after_labels (exit
->dest
);
1291 bool safe_p
= single_pred_p (exit
->dest
);
1292 for (unsigned i
= 0; i
< nbbs
; ++i
)
1294 /* We have made sure to not leave any dangling uses of SSA
1295 names defined in the loop. With the exception of virtuals.
1296 Make sure we replace all uses of virtual defs that will remain
1297 outside of the loop with the bare symbol as delete_basic_block
1298 will release them. */
1299 for (gphi_iterator gsi
= gsi_start_phis (bbs
[i
]); !gsi_end_p (gsi
);
1302 gphi
*phi
= gsi
.phi ();
1303 if (virtual_operand_p (gimple_phi_result (phi
)))
1304 mark_virtual_phi_result_for_renaming (phi
);
1306 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]); !gsi_end_p (gsi
);)
1308 gimple
*stmt
= gsi_stmt (gsi
);
1309 tree vdef
= gimple_vdef (stmt
);
1310 if (vdef
&& TREE_CODE (vdef
) == SSA_NAME
)
1311 mark_virtual_operand_for_renaming (vdef
);
1312 /* Also move and eventually reset debug stmts. We can leave
1313 constant values in place in case the stmt dominates the exit.
1314 ??? Non-constant values from the last iteration can be
1315 replaced with final values if we can compute them. */
1316 if (gimple_debug_bind_p (stmt
))
1318 tree val
= gimple_debug_bind_get_value (stmt
);
1319 gsi_move_before (&gsi
, &dst_gsi
);
1322 || !is_gimple_min_invariant (val
)
1323 || !dominated_by_p (CDI_DOMINATORS
, exit
->src
, bbs
[i
])))
1325 gimple_debug_bind_reset_value (stmt
);
1334 redirect_edge_pred (exit
, src
);
1335 exit
->flags
&= ~(EDGE_TRUE_VALUE
|EDGE_FALSE_VALUE
);
1336 exit
->flags
|= EDGE_FALLTHRU
;
1337 cancel_loop_tree (loop
);
1338 rescan_loop_exit (exit
, false, true);
1344 delete_basic_block (bbs
[i
]);
1350 set_immediate_dominator (CDI_DOMINATORS
, dest
,
1351 recompute_dominator (CDI_DOMINATORS
, dest
));
1354 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
1357 generate_code_for_partition (class loop
*loop
,
1358 partition
*partition
, bool copy_p
,
1359 bool keep_lc_phis_p
)
1361 switch (partition
->kind
)
1364 case PKIND_PARTIAL_MEMSET
:
1365 /* Reductions all have to be in the last partition. */
1366 gcc_assert (!partition_reduction_p (partition
)
1368 generate_loops_for_partition (loop
, partition
, copy_p
,
1373 generate_memset_builtin (loop
, partition
);
1378 generate_memcpy_builtin (loop
, partition
);
1385 /* Common tail for partitions we turn into a call. If this was the last
1386 partition for which we generate code, we have to destroy the loop. */
1392 data_dependence_relation
*
1393 loop_distribution::get_data_dependence (struct graph
*rdg
, data_reference_p a
,
1396 struct data_dependence_relation ent
, **slot
;
1397 struct data_dependence_relation
*ddr
;
1399 gcc_assert (DR_IS_WRITE (a
) || DR_IS_WRITE (b
));
1400 gcc_assert (rdg_vertex_for_stmt (rdg
, DR_STMT (a
))
1401 <= rdg_vertex_for_stmt (rdg
, DR_STMT (b
)));
1404 slot
= ddrs_table
->find_slot (&ent
, INSERT
);
1407 ddr
= initialize_data_dependence_relation (a
, b
, loop_nest
);
1408 compute_affine_dependence (ddr
, loop_nest
[0]);
1416 loop_distribution::data_dep_in_cycle_p (struct graph
*rdg
,
1417 data_reference_p dr1
,
1418 data_reference_p dr2
)
1420 struct data_dependence_relation
*ddr
;
1422 /* Re-shuffle data-refs to be in topological order. */
1423 if (rdg_vertex_for_stmt (rdg
, DR_STMT (dr1
))
1424 > rdg_vertex_for_stmt (rdg
, DR_STMT (dr2
)))
1425 std::swap (dr1
, dr2
);
1427 ddr
= get_data_dependence (rdg
, dr1
, dr2
);
1429 /* In case of no data dependence. */
1430 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1432 /* For unknown data dependence or known data dependence which can't be
1433 expressed in classic distance vector, we check if it can be resolved
1434 by runtime alias check. If yes, we still consider data dependence
1435 as won't introduce data dependence cycle. */
1436 else if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
1437 || DDR_NUM_DIST_VECTS (ddr
) == 0)
1438 return !runtime_alias_check_p (ddr
, NULL
, true);
1439 else if (DDR_NUM_DIST_VECTS (ddr
) > 1)
1441 else if (DDR_REVERSED_P (ddr
)
1442 || lambda_vector_zerop (DDR_DIST_VECT (ddr
, 0), 1))
1449 loop_distribution::update_type_for_merge (struct graph
*rdg
,
1450 partition
*partition1
,
1451 partition
*partition2
)
1454 bitmap_iterator bi
, bj
;
1455 data_reference_p dr1
, dr2
;
1457 EXECUTE_IF_SET_IN_BITMAP (partition1
->datarefs
, 0, i
, bi
)
1459 unsigned start
= (partition1
== partition2
) ? i
+ 1 : 0;
1461 dr1
= datarefs_vec
[i
];
1462 EXECUTE_IF_SET_IN_BITMAP (partition2
->datarefs
, start
, j
, bj
)
1464 dr2
= datarefs_vec
[j
];
1465 if (DR_IS_READ (dr1
) && DR_IS_READ (dr2
))
1468 /* Partition can only be executed sequentially if there is any
1469 data dependence cycle. */
1470 if (data_dep_in_cycle_p (rdg
, dr1
, dr2
))
1472 partition1
->type
= PTYPE_SEQUENTIAL
;
1480 loop_distribution::build_rdg_partition_for_vertex (struct graph
*rdg
, int v
)
1482 partition
*partition
= partition_alloc ();
1483 auto_vec
<int, 3> nodes
;
1486 data_reference_p dr
;
1488 graphds_dfs (rdg
, &v
, 1, &nodes
, false, NULL
);
1490 FOR_EACH_VEC_ELT (nodes
, i
, x
)
1492 bitmap_set_bit (partition
->stmts
, x
);
1494 for (j
= 0; RDG_DATAREFS (rdg
, x
).iterate (j
, &dr
); ++j
)
1496 unsigned idx
= (unsigned) DR_INDEX (dr
);
1497 gcc_assert (idx
< datarefs_vec
.length ());
1499 /* Partition can only be executed sequentially if there is any
1500 unknown data reference. */
1501 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
)
1502 || !DR_INIT (dr
) || !DR_STEP (dr
))
1503 partition
->type
= PTYPE_SEQUENTIAL
;
1505 bitmap_set_bit (partition
->datarefs
, idx
);
1509 if (partition
->type
== PTYPE_SEQUENTIAL
)
1512 /* Further check if any data dependence prevents us from executing the
1513 partition parallelly. */
1514 update_type_for_merge (rdg
, partition
, partition
);
1519 /* Given PARTITION of LOOP and RDG, record single load/store data references
1520 for builtin partition in SRC_DR/DST_DR, return false if there is no such
1524 find_single_drs (class loop
*loop
, struct graph
*rdg
, const bitmap
&partition_stmts
,
1525 data_reference_p
*dst_dr
, data_reference_p
*src_dr
)
1528 data_reference_p single_ld
= NULL
, single_st
= NULL
;
1531 EXECUTE_IF_SET_IN_BITMAP (partition_stmts
, 0, i
, bi
)
1533 gimple
*stmt
= RDG_STMT (rdg
, i
);
1534 data_reference_p dr
;
1536 if (gimple_code (stmt
) == GIMPLE_PHI
)
1539 /* Any scalar stmts are ok. */
1540 if (!gimple_vuse (stmt
))
1543 /* Otherwise just regular loads/stores. */
1544 if (!gimple_assign_single_p (stmt
))
1547 /* But exactly one store and/or load. */
1548 for (unsigned j
= 0; RDG_DATAREFS (rdg
, i
).iterate (j
, &dr
); ++j
)
1550 tree type
= TREE_TYPE (DR_REF (dr
));
1552 /* The memset, memcpy and memmove library calls are only
1553 able to deal with generic address space. */
1554 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type
)))
1557 if (DR_IS_READ (dr
))
1559 if (single_ld
!= NULL
)
1565 if (single_st
!= NULL
)
1572 if (!single_ld
&& !single_st
)
1575 basic_block bb_ld
= NULL
;
1576 basic_block bb_st
= NULL
;
1580 /* Bail out if this is a bitfield memory reference. */
1581 if (TREE_CODE (DR_REF (single_ld
)) == COMPONENT_REF
1582 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld
), 1)))
1585 /* Data reference must be executed exactly once per iteration of each
1586 loop in the loop nest. We only need to check dominance information
1587 against the outermost one in a perfect loop nest because a bb can't
1588 dominate outermost loop's latch without dominating inner loop's. */
1589 bb_ld
= gimple_bb (DR_STMT (single_ld
));
1590 if (!dominated_by_p (CDI_DOMINATORS
, loop
->latch
, bb_ld
))
1596 /* Bail out if this is a bitfield memory reference. */
1597 if (TREE_CODE (DR_REF (single_st
)) == COMPONENT_REF
1598 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st
), 1)))
1601 /* Data reference must be executed exactly once per iteration.
1602 Same as single_ld, we only need to check against the outermost
1604 bb_st
= gimple_bb (DR_STMT (single_st
));
1605 if (!dominated_by_p (CDI_DOMINATORS
, loop
->latch
, bb_st
))
1609 if (single_ld
&& single_st
)
1611 /* Load and store must be in the same loop nest. */
1612 if (bb_st
->loop_father
!= bb_ld
->loop_father
)
1615 edge e
= single_exit (bb_st
->loop_father
);
1616 bool dom_ld
= dominated_by_p (CDI_DOMINATORS
, e
->src
, bb_ld
);
1617 bool dom_st
= dominated_by_p (CDI_DOMINATORS
, e
->src
, bb_st
);
1618 if (dom_ld
!= dom_st
)
1622 *src_dr
= single_ld
;
1623 *dst_dr
= single_st
;
1627 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1628 loops from inner to outer to see if loop's step equals to access size at
1629 each level of loop. Return 2 if we can prove this at all level loops;
1630 record access base and size in BASE and SIZE; save loop's step at each
1631 level of loop in STEPS if it is not null. For example:
1633 int arr[100][100][100];
1634 for (i = 0; i < 100; i++) ;steps[2] = 40000
1635 for (j = 100; j > 0; j--) ;steps[1] = -400
1636 for (k = 0; k < 100; k++) ;steps[0] = 4
1637 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1639 Return 1 if we can prove the equality at the innermost loop, but not all
1640 level loops. In this case, no information is recorded.
1642 Return 0 if no equality can be proven at any level loops. */
1645 compute_access_range (loop_p loop_nest
, data_reference_p dr
, tree
*base
,
1646 tree
*size
, vec
<tree
> *steps
= NULL
)
1648 location_t loc
= gimple_location (DR_STMT (dr
));
1649 basic_block bb
= gimple_bb (DR_STMT (dr
));
1650 class loop
*loop
= bb
->loop_father
;
1651 tree ref
= DR_REF (dr
);
1652 tree access_base
= build_fold_addr_expr (ref
);
1653 tree access_size
= TYPE_SIZE_UNIT (TREE_TYPE (ref
));
1657 tree scev_fn
= analyze_scalar_evolution (loop
, access_base
);
1658 if (TREE_CODE (scev_fn
) != POLYNOMIAL_CHREC
)
1661 access_base
= CHREC_LEFT (scev_fn
);
1662 if (tree_contains_chrecs (access_base
, NULL
))
1665 tree scev_step
= CHREC_RIGHT (scev_fn
);
1666 /* Only support constant steps. */
1667 if (TREE_CODE (scev_step
) != INTEGER_CST
)
1670 enum ev_direction access_dir
= scev_direction (scev_fn
);
1671 if (access_dir
== EV_DIR_UNKNOWN
)
1675 steps
->safe_push (scev_step
);
1677 scev_step
= fold_convert_loc (loc
, sizetype
, scev_step
);
1678 /* Compute absolute value of scev step. */
1679 if (access_dir
== EV_DIR_DECREASES
)
1680 scev_step
= fold_build1_loc (loc
, NEGATE_EXPR
, sizetype
, scev_step
);
1682 /* At each level of loop, scev step must equal to access size. In other
1683 words, DR must access consecutive memory between loop iterations. */
1684 if (!operand_equal_p (scev_step
, access_size
, 0))
1687 /* Access stride can be computed for data reference at least for the
1691 /* Compute DR's execution times in loop. */
1692 tree niters
= number_of_latch_executions (loop
);
1693 niters
= fold_convert_loc (loc
, sizetype
, niters
);
1694 if (dominated_by_p (CDI_DOMINATORS
, single_exit (loop
)->src
, bb
))
1695 niters
= size_binop_loc (loc
, PLUS_EXPR
, niters
, size_one_node
);
1697 /* Compute DR's overall access size in loop. */
1698 access_size
= fold_build2_loc (loc
, MULT_EXPR
, sizetype
,
1700 /* Adjust base address in case of negative step. */
1701 if (access_dir
== EV_DIR_DECREASES
)
1703 tree adj
= fold_build2_loc (loc
, MINUS_EXPR
, sizetype
,
1704 scev_step
, access_size
);
1705 access_base
= fold_build_pointer_plus_loc (loc
, access_base
, adj
);
1707 } while (loop
!= loop_nest
&& (loop
= loop_outer (loop
)) != NULL
);
1709 *base
= access_base
;
1710 *size
= access_size
;
1711 /* Access stride can be computed for data reference at each level loop. */
1715 /* Allocate and return builtin struct. Record information like DST_DR,
1716 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1718 static struct builtin_info
*
1719 alloc_builtin (data_reference_p dst_dr
, data_reference_p src_dr
,
1720 tree dst_base
, tree src_base
, tree size
)
1722 struct builtin_info
*builtin
= XNEW (struct builtin_info
);
1723 builtin
->dst_dr
= dst_dr
;
1724 builtin
->src_dr
= src_dr
;
1725 builtin
->dst_base
= dst_base
;
1726 builtin
->src_base
= src_base
;
1727 builtin
->size
= size
;
1731 /* Given data reference DR in loop nest LOOP, classify if it forms builtin
1735 classify_builtin_st (loop_p loop
, partition
*partition
, data_reference_p dr
)
1737 gimple
*stmt
= DR_STMT (dr
);
1738 tree base
, size
, rhs
= gimple_assign_rhs1 (stmt
);
1740 if (const_with_all_bytes_same (rhs
) == -1
1741 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs
))
1742 || (TYPE_MODE (TREE_TYPE (rhs
))
1743 != TYPE_MODE (unsigned_char_type_node
))))
1746 if (TREE_CODE (rhs
) == SSA_NAME
1747 && !SSA_NAME_IS_DEFAULT_DEF (rhs
)
1748 && flow_bb_inside_loop_p (loop
, gimple_bb (SSA_NAME_DEF_STMT (rhs
))))
1751 int res
= compute_access_range (loop
, dr
, &base
, &size
);
1756 partition
->kind
= PKIND_PARTIAL_MEMSET
;
1762 split_constant_offset (base
, &base_base
, &base_offset
);
1763 if (!cst_and_fits_in_hwi (base_offset
))
1765 unsigned HOST_WIDE_INT const_base_offset
= int_cst_value (base_offset
);
1767 struct builtin_info
*builtin
;
1768 builtin
= alloc_builtin (dr
, NULL
, base
, NULL_TREE
, size
);
1769 builtin
->dst_base_base
= base_base
;
1770 builtin
->dst_base_offset
= const_base_offset
;
1771 partition
->builtin
= builtin
;
1772 partition
->kind
= PKIND_MEMSET
;
1775 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1776 if it forms builtin memcpy or memmove call. */
1779 loop_distribution::classify_builtin_ldst (loop_p loop
, struct graph
*rdg
,
1780 partition
*partition
,
1781 data_reference_p dst_dr
,
1782 data_reference_p src_dr
)
1784 tree base
, size
, src_base
, src_size
;
1785 auto_vec
<tree
> dst_steps
, src_steps
;
1787 /* Compute access range of both load and store. */
1788 int res
= compute_access_range (loop
, dst_dr
, &base
, &size
, &dst_steps
);
1791 res
= compute_access_range (loop
, src_dr
, &src_base
, &src_size
, &src_steps
);
1795 /* They must have the same access size. */
1796 if (!operand_equal_p (size
, src_size
, 0))
1799 /* They must have the same storage order. */
1800 if (reverse_storage_order_for_component_p (DR_REF (dst_dr
))
1801 != reverse_storage_order_for_component_p (DR_REF (src_dr
)))
1804 /* Load and store in loop nest must access memory in the same way, i.e,
1805 their must have the same steps in each loop of the nest. */
1806 if (dst_steps
.length () != src_steps
.length ())
1808 for (unsigned i
= 0; i
< dst_steps
.length (); ++i
)
1809 if (!operand_equal_p (dst_steps
[i
], src_steps
[i
], 0))
1812 /* Now check that if there is a dependence. */
1813 ddr_p ddr
= get_data_dependence (rdg
, src_dr
, dst_dr
);
1815 /* Classify as memmove if no dependence between load and store. */
1816 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1818 partition
->builtin
= alloc_builtin (dst_dr
, src_dr
, base
, src_base
, size
);
1819 partition
->kind
= PKIND_MEMMOVE
;
1823 /* Can't do memmove in case of unknown dependence or dependence without
1824 classical distance vector. */
1825 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
1826 || DDR_NUM_DIST_VECTS (ddr
) == 0)
1830 lambda_vector dist_v
;
1831 int num_lev
= (DDR_LOOP_NEST (ddr
)).length ();
1832 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
1834 unsigned dep_lev
= dependence_level (dist_v
, num_lev
);
1835 /* Can't do memmove if load depends on store. */
1836 if (dep_lev
> 0 && dist_v
[dep_lev
- 1] > 0 && !DDR_REVERSED_P (ddr
))
1840 partition
->builtin
= alloc_builtin (dst_dr
, src_dr
, base
, src_base
, size
);
1841 partition
->kind
= PKIND_MEMMOVE
;
1846 loop_distribution::classify_partition (loop_p loop
,
1847 struct graph
*rdg
, partition
*partition
,
1848 bitmap stmt_in_all_partitions
)
1852 data_reference_p single_ld
= NULL
, single_st
= NULL
;
1853 bool volatiles_p
= false, has_reduction
= false;
1855 EXECUTE_IF_SET_IN_BITMAP (partition
->stmts
, 0, i
, bi
)
1857 gimple
*stmt
= RDG_STMT (rdg
, i
);
1859 if (gimple_has_volatile_ops (stmt
))
1862 /* If the stmt is not included by all partitions and there is uses
1863 outside of the loop, then mark the partition as reduction. */
1864 if (stmt_has_scalar_dependences_outside_loop (loop
, stmt
))
1866 /* Due to limitation in the transform phase we have to fuse all
1867 reduction partitions. As a result, this could cancel valid
1868 loop distribution especially for loop that induction variable
1869 is used outside of loop. To workaround this issue, we skip
1870 marking partition as reudction if the reduction stmt belongs
1871 to all partitions. In such case, reduction will be computed
1872 correctly no matter how partitions are fused/distributed. */
1873 if (!bitmap_bit_p (stmt_in_all_partitions
, i
))
1874 partition
->reduction_p
= true;
1876 has_reduction
= true;
1880 /* Simple workaround to prevent classifying the partition as builtin
1881 if it contains any use outside of loop. For the case where all
1882 partitions have the reduction this simple workaround is delayed
1883 to only affect the last partition. */
1884 if (partition
->reduction_p
)
1885 return has_reduction
;
1887 /* Perform general partition disqualification for builtins. */
1889 || !flag_tree_loop_distribute_patterns
)
1890 return has_reduction
;
1892 /* Find single load/store data references for builtin partition. */
1893 if (!find_single_drs (loop
, rdg
, partition
->stmts
, &single_st
, &single_ld
)
1895 return has_reduction
;
1897 if (single_ld
&& single_st
)
1899 gimple
*store
= DR_STMT (single_st
), *load
= DR_STMT (single_ld
);
1900 /* Direct aggregate copy or via an SSA name temporary. */
1902 && gimple_assign_lhs (load
) != gimple_assign_rhs1 (store
))
1903 return has_reduction
;
1906 partition
->loc
= gimple_location (DR_STMT (single_st
));
1908 /* Classify the builtin kind. */
1909 if (single_ld
== NULL
)
1910 classify_builtin_st (loop
, partition
, single_st
);
1912 classify_builtin_ldst (loop
, rdg
, partition
, single_st
, single_ld
);
1913 return has_reduction
;
1917 loop_distribution::share_memory_accesses (struct graph
*rdg
,
1918 partition
*partition1
, partition
*partition2
)
1921 bitmap_iterator bi
, bj
;
1922 data_reference_p dr1
, dr2
;
1924 /* First check whether in the intersection of the two partitions are
1925 any loads or stores. Common loads are the situation that happens
1927 EXECUTE_IF_AND_IN_BITMAP (partition1
->stmts
, partition2
->stmts
, 0, i
, bi
)
1928 if (RDG_MEM_WRITE_STMT (rdg
, i
)
1929 || RDG_MEM_READS_STMT (rdg
, i
))
1932 /* Then check whether the two partitions access the same memory object. */
1933 EXECUTE_IF_SET_IN_BITMAP (partition1
->datarefs
, 0, i
, bi
)
1935 dr1
= datarefs_vec
[i
];
1937 if (!DR_BASE_ADDRESS (dr1
)
1938 || !DR_OFFSET (dr1
) || !DR_INIT (dr1
) || !DR_STEP (dr1
))
1941 EXECUTE_IF_SET_IN_BITMAP (partition2
->datarefs
, 0, j
, bj
)
1943 dr2
= datarefs_vec
[j
];
1945 if (!DR_BASE_ADDRESS (dr2
)
1946 || !DR_OFFSET (dr2
) || !DR_INIT (dr2
) || !DR_STEP (dr2
))
1949 if (operand_equal_p (DR_BASE_ADDRESS (dr1
), DR_BASE_ADDRESS (dr2
), 0)
1950 && operand_equal_p (DR_OFFSET (dr1
), DR_OFFSET (dr2
), 0)
1951 && operand_equal_p (DR_INIT (dr1
), DR_INIT (dr2
), 0)
1952 && operand_equal_p (DR_STEP (dr1
), DR_STEP (dr2
), 0))
1960 /* For each seed statement in STARTING_STMTS, this function builds
1961 partition for it by adding depended statements according to RDG.
1962 All partitions are recorded in PARTITIONS. */
1965 loop_distribution::rdg_build_partitions (struct graph
*rdg
,
1966 vec
<gimple
*> starting_stmts
,
1967 vec
<partition
*> *partitions
)
1969 auto_bitmap processed
;
1973 FOR_EACH_VEC_ELT (starting_stmts
, i
, stmt
)
1975 int v
= rdg_vertex_for_stmt (rdg
, stmt
);
1977 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1979 "ldist asked to generate code for vertex %d\n", v
);
1981 /* If the vertex is already contained in another partition so
1982 is the partition rooted at it. */
1983 if (bitmap_bit_p (processed
, v
))
1986 partition
*partition
= build_rdg_partition_for_vertex (rdg
, v
);
1987 bitmap_ior_into (processed
, partition
->stmts
);
1989 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1991 fprintf (dump_file
, "ldist creates useful %s partition:\n",
1992 partition
->type
== PTYPE_PARALLEL
? "parallel" : "sequent");
1993 bitmap_print (dump_file
, partition
->stmts
, " ", "\n");
1996 partitions
->safe_push (partition
);
1999 /* All vertices should have been assigned to at least one partition now,
2000 other than vertices belonging to dead code. */
/* Dump to FILE the PARTITIONS.  */

static void
dump_rdg_partitions (FILE *file, const vec<partition *> &partitions)
{
  unsigned i;
  partition *partition;

  FOR_EACH_VEC_ELT (partitions, i, partition)
    debug_bitmap_file (file, partition->stmts);
}

/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (const vec<partition *> &);

DEBUG_FUNCTION void
debug_rdg_partitions (const vec<partition *> &partitions)
{
  dump_rdg_partitions (stderr, partitions);
}
2024 /* Returns the number of read and write operations in the RDG. */
2027 number_of_rw_in_rdg (struct graph
*rdg
)
2031 for (i
= 0; i
< rdg
->n_vertices
; i
++)
2033 if (RDG_MEM_WRITE_STMT (rdg
, i
))
2036 if (RDG_MEM_READS_STMT (rdg
, i
))
/* Returns the number of read and write operations in a PARTITION of
   RDG.  */
2047 number_of_rw_in_partition (struct graph
*rdg
, partition
*partition
)
2053 EXECUTE_IF_SET_IN_BITMAP (partition
->stmts
, 0, i
, ii
)
2055 if (RDG_MEM_WRITE_STMT (rdg
, i
))
2058 if (RDG_MEM_READS_STMT (rdg
, i
))
2065 /* Returns true when one of the PARTITIONS contains all the read or
2066 write operations of RDG. */
2069 partition_contains_all_rw (struct graph
*rdg
,
2070 const vec
<partition
*> &partitions
)
2073 partition
*partition
;
2074 int nrw
= number_of_rw_in_rdg (rdg
);
2076 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
2077 if (nrw
== number_of_rw_in_partition (rdg
, partition
))
2084 loop_distribution::pg_add_dependence_edges (struct graph
*rdg
, int dir
,
2085 bitmap drs1
, bitmap drs2
, vec
<ddr_p
> *alias_ddrs
)
2088 bitmap_iterator bi
, bj
;
2089 data_reference_p dr1
, dr2
, saved_dr1
;
2091 /* dependence direction - 0 is no dependence, -1 is back,
2092 1 is forth, 2 is both (we can stop then, merging will occur). */
2093 EXECUTE_IF_SET_IN_BITMAP (drs1
, 0, i
, bi
)
2095 dr1
= datarefs_vec
[i
];
2097 EXECUTE_IF_SET_IN_BITMAP (drs2
, 0, j
, bj
)
2099 int res
, this_dir
= 1;
2102 dr2
= datarefs_vec
[j
];
2104 /* Skip all <read, read> data dependence. */
2105 if (DR_IS_READ (dr1
) && DR_IS_READ (dr2
))
2109 /* Re-shuffle data-refs to be in topological order. */
2110 if (rdg_vertex_for_stmt (rdg
, DR_STMT (dr1
))
2111 > rdg_vertex_for_stmt (rdg
, DR_STMT (dr2
)))
2113 std::swap (dr1
, dr2
);
2114 this_dir
= -this_dir
;
2116 ddr
= get_data_dependence (rdg
, dr1
, dr2
);
2117 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
2120 res
= data_ref_compare_tree (DR_BASE_ADDRESS (dr1
),
2121 DR_BASE_ADDRESS (dr2
));
	  /* Be conservative.  If the data references are not well analyzed,
	     or the two data references have the same base address and
	     offset, add the dependence and consider them to alias each
	     other.  In other words, the dependence cannot be resolved by
	     a runtime alias check.  */
2127 if (!DR_BASE_ADDRESS (dr1
) || !DR_BASE_ADDRESS (dr2
)
2128 || !DR_OFFSET (dr1
) || !DR_OFFSET (dr2
)
2129 || !DR_INIT (dr1
) || !DR_INIT (dr2
)
2130 || !DR_STEP (dr1
) || !tree_fits_uhwi_p (DR_STEP (dr1
))
2131 || !DR_STEP (dr2
) || !tree_fits_uhwi_p (DR_STEP (dr2
))
2134 /* Data dependence could be resolved by runtime alias check,
2135 record it in ALIAS_DDRS. */
2136 else if (alias_ddrs
!= NULL
)
2137 alias_ddrs
->safe_push (ddr
);
2138 /* Or simply ignore it. */
2140 else if (DDR_ARE_DEPENDENT (ddr
) == NULL_TREE
)
2142 if (DDR_REVERSED_P (ddr
))
2143 this_dir
= -this_dir
;
	      /* Known dependences can still be unordered throughout the
		 iteration space, see gcc.dg/tree-ssa/ldist-16.c and
		 gcc.dg/tree-ssa/pr94969.c.  */
2148 if (DDR_NUM_DIST_VECTS (ddr
) != 1)
2150 /* If the overlap is exact preserve stmt order. */
2151 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr
, 0),
2152 DDR_NB_LOOPS (ddr
)))
2154 /* Else as the distance vector is lexicographic positive swap
2155 the dependence direction. */
2157 this_dir
= -this_dir
;
2165 else if (this_dir
!= 0 && dir
!= this_dir
)
2167 /* Shuffle "back" dr1. */
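/* Illustrative only, fenced off from compilation: how per-pair dependence
   directions are conventionally folded into one overall direction using the
   encoding documented above (0 none, -1 back, 1 forth, 2 both).  This is a
   hedged sketch of the folding rule, not the pass's exact control flow.  */
#if 0
static int
combine_dir_sketch (int dir, int this_dir)
{
  if (this_dir == 0)
    return dir;			/* No new dependence; keep what we have.  */
  if (dir == 0)
    return this_dir;		/* First dependence seen.  */
  if (dir != this_dir)
    return 2;			/* Conflicting directions: both.  */
  return dir;			/* Same direction as before.  */
}
#endif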
2174 /* Compare postorder number of the partition graph vertices V1 and V2. */
2177 pgcmp (const void *v1_
, const void *v2_
)
2179 const vertex
*v1
= (const vertex
*)v1_
;
2180 const vertex
*v2
= (const vertex
*)v2_
;
2181 return v2
->post
- v1
->post
;
2184 /* Data attached to vertices of partition dependence graph. */
2187 /* ID of the corresponding partition. */
2189 /* The partition. */
2190 struct partition
*partition
;
2193 /* Data attached to edges of partition dependence graph. */
2196 /* If the dependence edge can be resolved by runtime alias check,
2197 this vector contains data dependence relations for runtime alias
2198 check. On the other hand, if the dependence edge is introduced
2199 because of compilation time known data dependence, this vector
2200 contains nothing. */
2201 vec
<ddr_p
> alias_ddrs
;
/* Callback data for traversing edges in the graph.  */
struct pg_edge_callback_data
{
  /* Bitmap of strongly connected components that should be merged.  */
  bitmap sccs_to_merge;
  /* Array containing component information for all vertices.  */
  int *vertices_component;
  /* Vector to record all data dependence relations which are needed
     to break strongly connected components by runtime alias checks.  */
  vec<ddr_p> *alias_ddrs;
};
/* Initialize vertices' data for partition dependence graph PG with
   PARTITIONS.  */
2220 init_partition_graph_vertices (struct graph
*pg
,
2221 vec
<struct partition
*> *partitions
)
2224 partition
*partition
;
2225 struct pg_vdata
*data
;
2227 for (i
= 0; partitions
->iterate (i
, &partition
); ++i
)
2229 data
= new pg_vdata
;
2230 pg
->vertices
[i
].data
= data
;
2232 data
->partition
= partition
;
2236 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
2237 dependence relations to the EDGE if DDRS isn't NULL. */
2240 add_partition_graph_edge (struct graph
*pg
, int i
, int j
, vec
<ddr_p
> *ddrs
)
2242 struct graph_edge
*e
= add_edge (pg
, i
, j
);
2244 /* If the edge is attached with data dependence relations, it means this
2245 dependence edge can be resolved by runtime alias checks. */
2248 struct pg_edata
*data
= new pg_edata
;
2250 gcc_assert (ddrs
->length () > 0);
2252 data
->alias_ddrs
= vNULL
;
2253 data
->alias_ddrs
.safe_splice (*ddrs
);
/* Callback function for the graph traversal algorithm.  It returns true
   if edge E should be skipped when traversing the graph.  */
2261 pg_skip_alias_edge (struct graph_edge
*e
)
2263 struct pg_edata
*data
= (struct pg_edata
*)e
->data
;
2264 return (data
!= NULL
&& data
->alias_ddrs
.length () > 0);
2267 /* Callback function freeing data attached to edge E of graph. */
2270 free_partition_graph_edata_cb (struct graph
*, struct graph_edge
*e
, void *)
2272 if (e
->data
!= NULL
)
2274 struct pg_edata
*data
= (struct pg_edata
*)e
->data
;
2275 data
->alias_ddrs
.release ();
/* Free data attached to the vertices of partition dependence graph PG.  */
2283 free_partition_graph_vdata (struct graph
*pg
)
2286 struct pg_vdata
*data
;
2288 for (i
= 0; i
< pg
->n_vertices
; ++i
)
2290 data
= (struct pg_vdata
*)pg
->vertices
[i
].data
;
/* Build and return a partition dependence graph for PARTITIONS.  RDG is
   the reduced dependence graph for the loop to be distributed.  If
   IGNORE_ALIAS_P is true, data dependences caused by possible aliasing
   between references are ignored, as if they don't exist at all;
   otherwise all dependences are considered.  */
2302 loop_distribution::build_partition_graph (struct graph
*rdg
,
2303 vec
<struct partition
*> *partitions
,
2304 bool ignore_alias_p
)
2307 struct partition
*partition1
, *partition2
;
2308 graph
*pg
= new_graph (partitions
->length ());
2309 auto_vec
<ddr_p
> alias_ddrs
, *alias_ddrs_p
;
2311 alias_ddrs_p
= ignore_alias_p
? NULL
: &alias_ddrs
;
2313 init_partition_graph_vertices (pg
, partitions
);
2315 for (i
= 0; partitions
->iterate (i
, &partition1
); ++i
)
2317 for (j
= i
+ 1; partitions
->iterate (j
, &partition2
); ++j
)
2319 /* dependence direction - 0 is no dependence, -1 is back,
2320 1 is forth, 2 is both (we can stop then, merging will occur). */
2323 /* If the first partition has reduction, add back edge; if the
2324 second partition has reduction, add forth edge. This makes
2325 sure that reduction partition will be sorted as the last one. */
2326 if (partition_reduction_p (partition1
))
2328 else if (partition_reduction_p (partition2
))
2331 /* Cleanup the temporary vector. */
2332 alias_ddrs
.truncate (0);
2334 dir
= pg_add_dependence_edges (rdg
, dir
, partition1
->datarefs
,
2335 partition2
->datarefs
, alias_ddrs_p
);
	  /* Add an edge to the partition graph if there exists a dependence.
	     There are two types of edges.  One type of edge is caused by a
	     compilation time known dependence; this type cannot be resolved
	     by a runtime alias check.  The other type can be resolved by a
	     runtime alias check.  */
2342 if (dir
== 1 || dir
== 2
2343 || alias_ddrs
.length () > 0)
2345 /* Attach data dependence relations to edge that can be resolved
2346 by runtime alias check. */
2347 bool alias_edge_p
= (dir
!= 1 && dir
!= 2);
2348 add_partition_graph_edge (pg
, i
, j
,
2349 (alias_edge_p
) ? &alias_ddrs
: NULL
);
2351 if (dir
== -1 || dir
== 2
2352 || alias_ddrs
.length () > 0)
2354 /* Attach data dependence relations to edge that can be resolved
2355 by runtime alias check. */
2356 bool alias_edge_p
= (dir
!= -1 && dir
!= 2);
2357 add_partition_graph_edge (pg
, j
, i
,
2358 (alias_edge_p
) ? &alias_ddrs
: NULL
);
/* Sort partitions in PG in descending post order and store them in
   PARTITIONS.  */
2369 sort_partitions_by_post_order (struct graph
*pg
,
2370 vec
<struct partition
*> *partitions
)
2373 struct pg_vdata
*data
;
2375 /* Now order the remaining nodes in descending postorder. */
2376 qsort (pg
->vertices
, pg
->n_vertices
, sizeof (vertex
), pgcmp
);
2377 partitions
->truncate (0);
2378 for (i
= 0; i
< pg
->n_vertices
; ++i
)
2380 data
= (struct pg_vdata
*)pg
->vertices
[i
].data
;
2381 if (data
->partition
)
2382 partitions
->safe_push (data
->partition
);
2387 loop_distribution::merge_dep_scc_partitions (struct graph
*rdg
,
2388 vec
<struct partition
*> *partitions
,
2389 bool ignore_alias_p
)
2391 struct partition
*partition1
, *partition2
;
2392 struct pg_vdata
*data
;
2393 graph
*pg
= build_partition_graph (rdg
, partitions
, ignore_alias_p
);
2394 int i
, j
, num_sccs
= graphds_scc (pg
, NULL
);
  /* A strongly connected component means a dependence cycle, so we cannot
     distribute its partitions; fuse them together.  */
2398 if ((unsigned) num_sccs
< partitions
->length ())
2400 for (i
= 0; i
< num_sccs
; ++i
)
2402 for (j
= 0; partitions
->iterate (j
, &partition1
); ++j
)
2403 if (pg
->vertices
[j
].component
== i
)
2405 for (j
= j
+ 1; partitions
->iterate (j
, &partition2
); ++j
)
2406 if (pg
->vertices
[j
].component
== i
)
2408 partition_merge_into (NULL
, partition1
,
2409 partition2
, FUSE_SAME_SCC
);
2410 partition1
->type
= PTYPE_SEQUENTIAL
;
2411 (*partitions
)[j
] = NULL
;
2412 partition_free (partition2
);
2413 data
= (struct pg_vdata
*)pg
->vertices
[j
].data
;
2414 data
->partition
= NULL
;
2419 sort_partitions_by_post_order (pg
, partitions
);
2420 gcc_assert (partitions
->length () == (unsigned)num_sccs
);
2421 free_partition_graph_vdata (pg
);
2422 for_each_edge (pg
, free_partition_graph_edata_cb
, NULL
);
/* Callback function for traversing edge E in graph G.  DATA is private
   callback data.  */
2430 pg_collect_alias_ddrs (struct graph
*g
, struct graph_edge
*e
, void *data
)
2432 int i
, j
, component
;
2433 struct pg_edge_callback_data
*cbdata
;
2434 struct pg_edata
*edata
= (struct pg_edata
*) e
->data
;
  /* If the edge doesn't have attached data dependences, it represents
     compilation time known dependences.  This type of dependence cannot
     be resolved by a runtime alias check.  */
2439 if (edata
== NULL
|| edata
->alias_ddrs
.length () == 0)
2442 cbdata
= (struct pg_edge_callback_data
*) data
;
2445 component
= cbdata
->vertices_component
[i
];
  /* Vertices are topologically sorted according to compilation time
     known dependences, so we can break strongly connected components
     by removing edges of the opposite direction, i.e., edges pointing
     from a vertex with a smaller post number to a vertex with a bigger
     post number.  */
2451 if (g
->vertices
[i
].post
< g
->vertices
[j
].post
2452 /* We only need to remove edges connecting vertices in the same
2453 strong connected component to break it. */
2454 && component
== cbdata
->vertices_component
[j
]
2455 /* Check if we want to break the strong connected component or not. */
2456 && !bitmap_bit_p (cbdata
->sccs_to_merge
, component
))
2457 cbdata
->alias_ddrs
->safe_splice (edata
->alias_ddrs
);
/* Callback function for traversing edge E.  DATA is private
   callback data.  */
2464 pg_unmark_merged_alias_ddrs (struct graph
*, struct graph_edge
*e
, void *data
)
2466 int i
, j
, component
;
2467 struct pg_edge_callback_data
*cbdata
;
2468 struct pg_edata
*edata
= (struct pg_edata
*) e
->data
;
2470 if (edata
== NULL
|| edata
->alias_ddrs
.length () == 0)
2473 cbdata
= (struct pg_edge_callback_data
*) data
;
2476 component
= cbdata
->vertices_component
[i
];
2477 /* Make sure to not skip vertices inside SCCs we are going to merge. */
2478 if (component
== cbdata
->vertices_component
[j
]
2479 && bitmap_bit_p (cbdata
->sccs_to_merge
, component
))
2481 edata
->alias_ddrs
.release ();
/* This is the main function breaking strongly connected components in
   PARTITIONS, given the reduced dependence graph RDG.  Store data
   dependence relations for runtime alias checks in ALIAS_DDRS.  */
2491 loop_distribution::break_alias_scc_partitions (struct graph
*rdg
,
2492 vec
<struct partition
*> *partitions
,
2493 vec
<ddr_p
> *alias_ddrs
)
2495 int i
, j
, k
, num_sccs
, num_sccs_no_alias
= 0;
2496 /* Build partition dependence graph. */
2497 graph
*pg
= build_partition_graph (rdg
, partitions
, false);
2499 alias_ddrs
->truncate (0);
2500 /* Find strong connected components in the graph, with all dependence edges
2502 num_sccs
= graphds_scc (pg
, NULL
);
2503 /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2504 compilation time known dependences are merged before this function. */
2505 if ((unsigned) num_sccs
< partitions
->length ())
2507 struct pg_edge_callback_data cbdata
;
2508 auto_bitmap sccs_to_merge
;
2509 auto_vec
<enum partition_type
> scc_types
;
2510 struct partition
*partition
, *first
;
      /* If all partitions in an SCC have the same type, we can simply merge
	 the SCC.  This loop finds such SCCs and records them in the bitmap.  */
2514 bitmap_set_range (sccs_to_merge
, 0, (unsigned) num_sccs
);
2515 for (i
= 0; i
< num_sccs
; ++i
)
2517 for (j
= 0; partitions
->iterate (j
, &first
); ++j
)
2518 if (pg
->vertices
[j
].component
== i
)
2521 bool same_type
= true, all_builtins
= partition_builtin_p (first
);
2522 for (++j
; partitions
->iterate (j
, &partition
); ++j
)
2524 if (pg
->vertices
[j
].component
!= i
)
2527 if (first
->type
!= partition
->type
)
2532 all_builtins
&= partition_builtin_p (partition
);
	  /* Merge the SCC if all partitions in the SCC have the same type,
	     even though the resulting partition is sequential, because the
	     vectorizer can do a better runtime alias check.  One exception
	     is when all partitions in the SCC are builtins.  */
2538 if (!same_type
|| all_builtins
)
2539 bitmap_clear_bit (sccs_to_merge
, i
);
2542 /* Initialize callback data for traversing. */
2543 cbdata
.sccs_to_merge
= sccs_to_merge
;
2544 cbdata
.alias_ddrs
= alias_ddrs
;
2545 cbdata
.vertices_component
= XNEWVEC (int, pg
->n_vertices
);
2546 /* Record the component information which will be corrupted by next
2547 graph scc finding call. */
2548 for (i
= 0; i
< pg
->n_vertices
; ++i
)
2549 cbdata
.vertices_component
[i
] = pg
->vertices
[i
].component
;
2551 /* Collect data dependences for runtime alias checks to break SCCs. */
2552 if (bitmap_count_bits (sccs_to_merge
) != (unsigned) num_sccs
)
2554 /* For SCCs we want to merge clear all alias_ddrs for edges
2555 inside the component. */
2556 for_each_edge (pg
, pg_unmark_merged_alias_ddrs
, &cbdata
);
2558 /* Run SCC finding algorithm again, with alias dependence edges
2559 skipped. This is to topologically sort partitions according to
2560 compilation time known dependence. Note the topological order
2561 is stored in the form of pg's post order number. */
2562 num_sccs_no_alias
= graphds_scc (pg
, NULL
, pg_skip_alias_edge
);
2563 /* We cannot assert partitions->length () == num_sccs_no_alias
2564 since we are not ignoring alias edges in cycles we are
2565 going to merge. That's required to compute correct postorder. */
	  /* With the topological order, we can construct two subgraphs L
	     and R.  L contains edge <x, y> where x < y in terms of post
	     order, while R contains edge <x, y> where x > y.  Edges for
	     compilation time known dependences all fall in R, so we break
	     SCCs by removing all (alias) edges in subgraph L.  */
2571 for_each_edge (pg
, pg_collect_alias_ddrs
, &cbdata
);
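/* Illustrative only, fenced off from compilation: with a postorder
   numbering in hand, the alias edges that get removed (and turned into
   runtime checks) are exactly those pointing from a vertex with a smaller
   post number to one with a bigger post number inside the same component.
   A minimal sketch over hypothetical plain arrays:  */
#if 0
#include <vector>

struct sketch_edge { int src, dst; bool is_alias; };

static std::vector<sketch_edge>
alias_edges_to_break (const std::vector<sketch_edge> &edges,
		      const std::vector<int> &post,
		      const std::vector<int> &component)
{
  std::vector<sketch_edge> removed;
  for (const sketch_edge &e : edges)
    if (e.is_alias
	&& component[e.src] == component[e.dst]
	&& post[e.src] < post[e.dst])
      removed.push_back (e);
  return removed;
}
#endif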
2574 /* For SCC that doesn't need to be broken, merge it. */
2575 for (i
= 0; i
< num_sccs
; ++i
)
2577 if (!bitmap_bit_p (sccs_to_merge
, i
))
2580 for (j
= 0; partitions
->iterate (j
, &first
); ++j
)
2581 if (cbdata
.vertices_component
[j
] == i
)
2583 for (k
= j
+ 1; partitions
->iterate (k
, &partition
); ++k
)
2585 struct pg_vdata
*data
;
2587 if (cbdata
.vertices_component
[k
] != i
)
2590 partition_merge_into (NULL
, first
, partition
, FUSE_SAME_SCC
);
2591 (*partitions
)[k
] = NULL
;
2592 partition_free (partition
);
2593 data
= (struct pg_vdata
*)pg
->vertices
[k
].data
;
2594 gcc_assert (data
->id
== k
);
2595 data
->partition
= NULL
;
2596 /* The result partition of merged SCC must be sequential. */
2597 first
->type
= PTYPE_SEQUENTIAL
;
      /* If the reduction partition's SCC is broken by runtime alias checks,
	 we force a negative post order on it, making sure it will be
	 scheduled last.  */
2603 if (num_sccs_no_alias
> 0)
2606 for (i
= 0; i
< pg
->n_vertices
; ++i
)
2608 struct pg_vdata
*data
= (struct pg_vdata
*)pg
->vertices
[i
].data
;
2609 if (data
->partition
&& partition_reduction_p (data
->partition
))
2611 gcc_assert (j
== -1);
2616 pg
->vertices
[j
].post
= -1;
2619 free (cbdata
.vertices_component
);
2622 sort_partitions_by_post_order (pg
, partitions
);
2623 free_partition_graph_vdata (pg
);
2624 for_each_edge (pg
, free_partition_graph_edata_cb
, NULL
);
2627 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2629 fprintf (dump_file
, "Possible alias data dependence to break:\n");
2630 dump_data_dependence_relations (dump_file
, *alias_ddrs
);
2634 /* Compute and return an expression whose value is the segment length which
2635 will be accessed by DR in NITERS iterations. */
2638 data_ref_segment_size (struct data_reference
*dr
, tree niters
)
2640 niters
= size_binop (MINUS_EXPR
,
2641 fold_convert (sizetype
, niters
),
2643 return size_binop (MULT_EXPR
,
2644 fold_convert (sizetype
, DR_STEP (dr
)),
2645 fold_convert (sizetype
, niters
));
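/* Illustrative only, fenced off from compilation: the segment covered by a
   data reference with constant step over NITERS iterations spans
   step * (niters - 1) bytes from its first access, which is what the helper
   above builds as a tree expression.  Plain-integer sketch:  */
#if 0
static unsigned long
segment_size_sketch (unsigned long step, unsigned long niters)
{
  return step * (niters - 1);
}
#endif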
/* Return true if LOOP's latch is dominated by the statement for data
   reference DR.  */
2652 latch_dominated_by_data_ref (class loop
*loop
, data_reference
*dr
)
2654 return dominated_by_p (CDI_DOMINATORS
, single_exit (loop
)->src
,
2655 gimple_bb (DR_STMT (dr
)));
2658 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2659 data dependence relations ALIAS_DDRS. */
2662 compute_alias_check_pairs (class loop
*loop
, vec
<ddr_p
> *alias_ddrs
,
2663 vec
<dr_with_seg_len_pair_t
> *comp_alias_pairs
)
2666 unsigned HOST_WIDE_INT factor
= 1;
2667 tree niters_plus_one
, niters
= number_of_latch_executions (loop
);
2669 gcc_assert (niters
!= NULL_TREE
&& niters
!= chrec_dont_know
);
2670 niters
= fold_convert (sizetype
, niters
);
2671 niters_plus_one
= size_binop (PLUS_EXPR
, niters
, size_one_node
);
2673 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2674 fprintf (dump_file
, "Creating alias check pairs:\n");
2676 /* Iterate all data dependence relations and compute alias check pairs. */
2677 for (i
= 0; i
< alias_ddrs
->length (); i
++)
2679 ddr_p ddr
= (*alias_ddrs
)[i
];
2680 struct data_reference
*dr_a
= DDR_A (ddr
);
2681 struct data_reference
*dr_b
= DDR_B (ddr
);
2682 tree seg_length_a
, seg_length_b
;
2684 if (latch_dominated_by_data_ref (loop
, dr_a
))
2685 seg_length_a
= data_ref_segment_size (dr_a
, niters_plus_one
);
2687 seg_length_a
= data_ref_segment_size (dr_a
, niters
);
2689 if (latch_dominated_by_data_ref (loop
, dr_b
))
2690 seg_length_b
= data_ref_segment_size (dr_b
, niters_plus_one
);
2692 seg_length_b
= data_ref_segment_size (dr_b
, niters
);
2694 unsigned HOST_WIDE_INT access_size_a
2695 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a
))));
2696 unsigned HOST_WIDE_INT access_size_b
2697 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b
))));
2698 unsigned int align_a
= TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a
)));
2699 unsigned int align_b
= TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b
)));
2701 dr_with_seg_len_pair_t dr_with_seg_len_pair
2702 (dr_with_seg_len (dr_a
, seg_length_a
, access_size_a
, align_a
),
2703 dr_with_seg_len (dr_b
, seg_length_b
, access_size_b
, align_b
),
2704 /* ??? Would WELL_ORDERED be safe? */
2705 dr_with_seg_len_pair_t::REORDERED
);
2707 comp_alias_pairs
->safe_push (dr_with_seg_len_pair
);
2710 if (tree_fits_uhwi_p (niters
))
2711 factor
= tree_to_uhwi (niters
);
2713 /* Prune alias check pairs. */
2714 prune_runtime_alias_test_list (comp_alias_pairs
, factor
);
2715 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2717 "Improved number of alias checks from %d to %d\n",
2718 alias_ddrs
->length (), comp_alias_pairs
->length ());
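/* Illustrative only, fenced off from compilation: conceptually, a runtime
   alias check for one pair tests that the two accessed segments do not
   overlap.  This simplified sketch ignores segment direction, access size
   and alignment refinements handled by prune_runtime_alias_test_list and
   create_runtime_alias_checks:  */
#if 0
static bool
segments_disjoint_sketch (unsigned long addr_a, unsigned long seg_a,
			  unsigned long addr_b, unsigned long seg_b)
{
  return addr_a + seg_a <= addr_b || addr_b + seg_b <= addr_a;
}
#endif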
2721 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2722 checks and version LOOP under condition of these runtime alias checks. */
2725 version_loop_by_alias_check (vec
<struct partition
*> *partitions
,
2726 class loop
*loop
, vec
<ddr_p
> *alias_ddrs
)
2728 profile_probability prob
;
2729 basic_block cond_bb
;
2731 tree lhs
, arg0
, cond_expr
= NULL_TREE
;
2732 gimple_seq cond_stmts
= NULL
;
2733 gimple
*call_stmt
= NULL
;
2734 auto_vec
<dr_with_seg_len_pair_t
> comp_alias_pairs
;
2736 /* Generate code for runtime alias checks if necessary. */
2737 gcc_assert (alias_ddrs
->length () > 0);
2739 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2741 "Version loop <%d> with runtime alias check\n", loop
->num
);
2743 compute_alias_check_pairs (loop
, alias_ddrs
, &comp_alias_pairs
);
2744 create_runtime_alias_checks (loop
, &comp_alias_pairs
, &cond_expr
);
2745 cond_expr
= force_gimple_operand_1 (cond_expr
, &cond_stmts
,
2746 is_gimple_val
, NULL_TREE
);
2748 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2749 bool cancelable_p
= flag_tree_loop_vectorize
;
2753 struct partition
*partition
;
2754 for (; partitions
->iterate (i
, &partition
); ++i
)
2755 if (!partition_builtin_p (partition
))
2758 /* If all partitions are builtins, distributing it would be profitable and
2759 we don't want to cancel the runtime alias checks. */
2760 if (i
== partitions
->length ())
2761 cancelable_p
= false;
2764 /* Generate internal function call for loop distribution alias check if the
2765 runtime alias check should be cancelable. */
2768 call_stmt
= gimple_build_call_internal (IFN_LOOP_DIST_ALIAS
,
2769 2, NULL_TREE
, cond_expr
);
2770 lhs
= make_ssa_name (boolean_type_node
);
2771 gimple_call_set_lhs (call_stmt
, lhs
);
2776 prob
= profile_probability::guessed_always ().apply_scale (9, 10);
2777 initialize_original_copy_tables ();
2778 nloop
= loop_version (loop
, lhs
, &cond_bb
, prob
, prob
.invert (),
2779 prob
, prob
.invert (), true);
2780 free_original_copy_tables ();
  /* Record the original loop number in newly generated loops.  In case of
     distribution, the original loop will be distributed and the new loop
     is kept.  */
2784 loop
->orig_loop_num
= nloop
->num
;
2785 nloop
->orig_loop_num
= nloop
->num
;
2786 nloop
->dont_vectorize
= true;
2787 nloop
->force_vectorize
= false;
2791 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2792 loop could be destroyed. */
2793 arg0
= build_int_cst (integer_type_node
, loop
->orig_loop_num
);
2794 gimple_call_set_arg (call_stmt
, 0, arg0
);
2795 gimple_seq_add_stmt_without_update (&cond_stmts
, call_stmt
);
2800 gimple_stmt_iterator cond_gsi
= gsi_last_bb (cond_bb
);
2801 gsi_insert_seq_before (&cond_gsi
, cond_stmts
, GSI_SAME_STMT
);
2803 update_ssa (TODO_update_ssa_no_phi
);
/* Return true if loop versioning is needed to distribute PARTITIONS.
   ALIAS_DDRS are data dependence relations for the runtime alias check.  */
2810 version_for_distribution_p (vec
<struct partition
*> *partitions
,
2811 vec
<ddr_p
> *alias_ddrs
)
2813 /* No need to version loop if we have only one partition. */
2814 if (partitions
->length () == 1)
2817 /* Need to version loop if runtime alias check is necessary. */
2818 return (alias_ddrs
->length () > 0);
2821 /* Compare base offset of builtin mem* partitions P1 and P2. */
2824 offset_cmp (const void *vp1
, const void *vp2
)
2826 struct partition
*p1
= *(struct partition
*const *) vp1
;
2827 struct partition
*p2
= *(struct partition
*const *) vp2
;
2828 unsigned HOST_WIDE_INT o1
= p1
->builtin
->dst_base_offset
;
2829 unsigned HOST_WIDE_INT o2
= p2
->builtin
->dst_base_offset
;
2830 return (o2
< o1
) - (o1
< o2
);
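/* Illustrative only, fenced off from compilation: (o2 < o1) - (o1 < o2) is
   the usual overflow-safe three-way comparison idiom; subtracting the two
   unsigned offsets directly could wrap and confuse qsort.  Sketch:  */
#if 0
static int
three_way_cmp_sketch (unsigned long a, unsigned long b)
{
  return (b < a) - (a < b);	/* -1 if a < b, 0 if equal, 1 if a > b.  */
}
#endif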
/* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
   case optimization transforming below code:

     __builtin_memset (&obj, 0, 100);
     _1 = &obj + 100;
     __builtin_memset (_1, 0, 200);
     _2 = &obj + 300;
     __builtin_memset (_2, 0, 100);

   into:

     __builtin_memset (&obj, 0, 400);

   Note we don't have dependence information between different partitions
   at this point, as a result, we can't handle nonadjacent memset builtin
   partitions since dependence might be broken.  */
2851 fuse_memset_builtins (vec
<struct partition
*> *partitions
)
2854 struct partition
*part1
, *part2
;
2857 for (i
= 0; partitions
->iterate (i
, &part1
);)
2859 if (part1
->kind
!= PKIND_MEMSET
)
2865 /* Find sub-array of memset builtins of the same base. Index range
2866 of the sub-array is [i, j) with "j > i". */
2867 for (j
= i
+ 1; partitions
->iterate (j
, &part2
); ++j
)
2869 if (part2
->kind
!= PKIND_MEMSET
2870 || !operand_equal_p (part1
->builtin
->dst_base_base
,
2871 part2
->builtin
->dst_base_base
, 0))
2874 /* Memset calls setting different values can't be merged. */
2875 rhs1
= gimple_assign_rhs1 (DR_STMT (part1
->builtin
->dst_dr
));
2876 rhs2
= gimple_assign_rhs1 (DR_STMT (part2
->builtin
->dst_dr
));
2877 if (!operand_equal_p (rhs1
, rhs2
, 0))
2881 /* Stable sort is required in order to avoid breaking dependence. */
2882 gcc_stablesort (&(*partitions
)[i
], j
- i
, sizeof (*partitions
)[i
],
2884 /* Continue with next partition. */
2888 /* Merge all consecutive memset builtin partitions. */
2889 for (i
= 0; i
< partitions
->length () - 1;)
2891 part1
= (*partitions
)[i
];
2892 if (part1
->kind
!= PKIND_MEMSET
)
2898 part2
= (*partitions
)[i
+ 1];
      /* Only merge memset partitions of the same base and with constant
	 size.  */
2901 if (part2
->kind
!= PKIND_MEMSET
2902 || TREE_CODE (part1
->builtin
->size
) != INTEGER_CST
2903 || TREE_CODE (part2
->builtin
->size
) != INTEGER_CST
2904 || !operand_equal_p (part1
->builtin
->dst_base_base
,
2905 part2
->builtin
->dst_base_base
, 0))
2910 rhs1
= gimple_assign_rhs1 (DR_STMT (part1
->builtin
->dst_dr
));
2911 rhs2
= gimple_assign_rhs1 (DR_STMT (part2
->builtin
->dst_dr
));
2912 int bytev1
= const_with_all_bytes_same (rhs1
);
2913 int bytev2
= const_with_all_bytes_same (rhs2
);
2914 /* Only merge memset partitions of the same value. */
2915 if (bytev1
!= bytev2
|| bytev1
== -1)
2920 wide_int end1
= wi::add (part1
->builtin
->dst_base_offset
,
2921 wi::to_wide (part1
->builtin
->size
));
2922 /* Only merge adjacent memset partitions. */
2923 if (wi::ne_p (end1
, part2
->builtin
->dst_base_offset
))
2928 /* Merge partitions[i] and partitions[i+1]. */
2929 part1
->builtin
->size
= fold_build2 (PLUS_EXPR
, sizetype
,
2930 part1
->builtin
->size
,
2931 part2
->builtin
->size
);
2932 partition_free (part2
);
2933 partitions
->ordered_remove (i
+ 1);
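/* Illustrative only, fenced off from compilation: the merging step above is
   the classic "coalesce adjacent [offset, offset + size) ranges with equal
   fill value" transform.  Stand-alone sketch over a vector already sorted
   by offset:  */
#if 0
#include <cstddef>
#include <vector>

struct memset_range { unsigned long offset, size; int value; };

static void
merge_adjacent_ranges_sketch (std::vector<memset_range> &ranges)
{
  for (std::size_t i = 0; i + 1 < ranges.size ();)
    {
      memset_range &cur = ranges[i];
      const memset_range &next = ranges[i + 1];
      if (cur.value == next.value
	  && cur.offset + cur.size == next.offset)
	{
	  cur.size += next.size;	/* Extend the current range.  */
	  ranges.erase (ranges.begin () + i + 1);
	}
      else
	++i;
    }
}
#endif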
2938 loop_distribution::finalize_partitions (class loop
*loop
,
2939 vec
<struct partition
*> *partitions
,
2940 vec
<ddr_p
> *alias_ddrs
)
2943 struct partition
*partition
, *a
;
2945 if (partitions
->length () == 1
2946 || alias_ddrs
->length () > 0)
2949 unsigned num_builtin
= 0, num_normal
= 0, num_partial_memset
= 0;
2950 bool same_type_p
= true;
2951 enum partition_type type
= ((*partitions
)[0])->type
;
2952 for (i
= 0; partitions
->iterate (i
, &partition
); ++i
)
2954 same_type_p
&= (type
== partition
->type
);
2955 if (partition_builtin_p (partition
))
2961 if (partition
->kind
== PKIND_PARTIAL_MEMSET
)
2962 num_partial_memset
++;
2965 /* Don't distribute current loop into too many loops given we don't have
2966 memory stream cost model. Be even more conservative in case of loop
2967 nest distribution. */
2968 if ((same_type_p
&& num_builtin
== 0
2969 && (loop
->inner
== NULL
|| num_normal
!= 2 || num_partial_memset
!= 1))
2970 || (loop
->inner
!= NULL
2971 && i
>= NUM_PARTITION_THRESHOLD
&& num_normal
> 1)
2972 || (loop
->inner
== NULL
2973 && i
>= NUM_PARTITION_THRESHOLD
&& num_normal
> num_builtin
))
2975 a
= (*partitions
)[0];
2976 for (i
= 1; partitions
->iterate (i
, &partition
); ++i
)
2978 partition_merge_into (NULL
, a
, partition
, FUSE_FINALIZE
);
2979 partition_free (partition
);
2981 partitions
->truncate (1);
2984 /* Fuse memset builtins if possible. */
2985 if (partitions
->length () > 1)
2986 fuse_memset_builtins (partitions
);
2989 /* Distributes the code from LOOP in such a way that producer statements
2990 are placed before consumer statements. Tries to separate only the
2991 statements from STMTS into separate loops. Returns the number of
2992 distributed loops. Set NB_CALLS to number of generated builtin calls.
2993 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2996 loop_distribution::distribute_loop (class loop
*loop
,
2997 const vec
<gimple
*> &stmts
,
2998 control_dependences
*cd
, int *nb_calls
, bool *destroy_p
,
2999 bool only_patterns_p
)
3001 ddrs_table
= new hash_table
<ddr_hasher
> (389);
3003 partition
*partition
;
3008 loop_nest
.create (0);
3009 if (!find_loop_nest (loop
, &loop_nest
))
3011 loop_nest
.release ();
3016 datarefs_vec
.create (20);
3017 has_nonaddressable_dataref_p
= false;
3018 rdg
= build_rdg (loop
, cd
);
3021 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3023 "Loop %d not distributed: failed to build the RDG.\n",
3026 loop_nest
.release ();
3027 free_data_refs (datarefs_vec
);
3032 if (datarefs_vec
.length () > MAX_DATAREFS_NUM
)
3034 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3036 "Loop %d not distributed: too many memory references.\n",
3040 loop_nest
.release ();
3041 free_data_refs (datarefs_vec
);
3046 data_reference_p dref
;
3047 for (i
= 0; datarefs_vec
.iterate (i
, &dref
); ++i
)
3048 dref
->aux
= (void *) (uintptr_t) i
;
3050 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3051 dump_rdg (dump_file
, rdg
);
3053 auto_vec
<struct partition
*, 3> partitions
;
3054 rdg_build_partitions (rdg
, stmts
, &partitions
);
3056 auto_vec
<ddr_p
> alias_ddrs
;
3058 auto_bitmap stmt_in_all_partitions
;
3059 bitmap_copy (stmt_in_all_partitions
, partitions
[0]->stmts
);
3060 for (i
= 1; partitions
.iterate (i
, &partition
); ++i
)
3061 bitmap_and_into (stmt_in_all_partitions
, partitions
[i
]->stmts
);
3063 bool any_builtin
= false;
3064 bool reduction_in_all
= false;
3065 int reduction_partition_num
= -1;
3066 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
3069 |= classify_partition (loop
, rdg
, partition
, stmt_in_all_partitions
);
3070 any_builtin
|= partition_builtin_p (partition
);
  /* If we are only distributing patterns but did not detect any,
     simply stop.  */
3082 /* If we are only distributing patterns fuse all partitions that
3083 were not classified as builtins. This also avoids chopping
3084 a loop into pieces, separated by builtin calls. That is, we
3085 only want no or a single loop body remaining. */
3086 struct partition
*into
;
3087 if (only_patterns_p
)
3089 for (i
= 0; partitions
.iterate (i
, &into
); ++i
)
3090 if (!partition_builtin_p (into
))
3092 for (++i
; partitions
.iterate (i
, &partition
); ++i
)
3093 if (!partition_builtin_p (partition
))
3095 partition_merge_into (NULL
, into
, partition
, FUSE_NON_BUILTIN
);
3096 partitions
.unordered_remove (i
);
3097 partition_free (partition
);
3102 /* Due to limitations in the transform phase we have to fuse all
3103 reduction partitions into the last partition so the existing
3104 loop will contain all loop-closed PHI nodes. */
3105 for (i
= 0; partitions
.iterate (i
, &into
); ++i
)
3106 if (partition_reduction_p (into
))
3108 for (i
= i
+ 1; partitions
.iterate (i
, &partition
); ++i
)
3109 if (partition_reduction_p (partition
))
3111 partition_merge_into (rdg
, into
, partition
, FUSE_REDUCTION
);
3112 partitions
.unordered_remove (i
);
3113 partition_free (partition
);
  /* Apply our simple cost model - fuse partitions with similar
     memory accesses.  */
3119 for (i
= 0; partitions
.iterate (i
, &into
); ++i
)
3121 bool changed
= false;
3122 for (int j
= i
+ 1; partitions
.iterate (j
, &partition
); ++j
)
3124 if (share_memory_accesses (rdg
, into
, partition
))
3126 partition_merge_into (rdg
, into
, partition
, FUSE_SHARE_REF
);
3127 partitions
.unordered_remove (j
);
3128 partition_free (partition
);
3133 /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar
3134 accesses when 1 and 2 have similar accesses but not 0 and 1
3135 then in the next iteration we will fail to consider merging
3136 1 into 0,2. So try again if we did any merging into 0. */
3141 /* Put a non-builtin partition last if we need to preserve a reduction.
3142 In most cases this helps to keep a normal partition last avoiding to
3143 spill a reduction result across builtin calls.
3144 ??? The proper way would be to use dependences to see whether we
3145 can move builtin partitions earlier during merge_dep_scc_partitions
3146 and its sort_partitions_by_post_order. Especially when the
3147 dependence graph is composed of multiple independent subgraphs the
3148 heuristic does not work reliably. */
3149 if (reduction_in_all
3150 && partition_builtin_p (partitions
.last()))
3151 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
3152 if (!partition_builtin_p (partition
))
3154 partitions
.unordered_remove (i
);
3155 partitions
.quick_push (partition
);
  /* Build the partition dependence graph and fuse partitions in
     strongly connected components.  */
3161 if (partitions
.length () > 1)
      /* Don't support loop nest distribution under runtime alias checks,
	 since it's not likely to enable many vectorization opportunities.
	 Also bail out if the loop has any data reference which may not be
	 addressable, since the alias check needs to take and compare the
	 address of the object.  */
3167 if (loop
->inner
|| has_nonaddressable_dataref_p
)
3168 merge_dep_scc_partitions (rdg
, &partitions
, false);
3171 merge_dep_scc_partitions (rdg
, &partitions
, true);
3172 if (partitions
.length () > 1)
3173 break_alias_scc_partitions (rdg
, &partitions
, &alias_ddrs
);
3177 finalize_partitions (loop
, &partitions
, &alias_ddrs
);
3179 /* If there is a reduction in all partitions make sure the last
3180 non-builtin partition provides the LC PHI defs. */
3181 if (reduction_in_all
)
3183 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
3184 if (!partition_builtin_p (partition
))
3185 reduction_partition_num
= i
;
3186 if (reduction_partition_num
== -1)
      /* If all partitions are builtins, force the last one to
	 be code generated as a normal partition.  */
3190 partition
= partitions
.last ();
3191 partition
->kind
= PKIND_NORMAL
;
3195 nbp
= partitions
.length ();
3197 || (nbp
== 1 && !partition_builtin_p (partitions
[0]))
3198 || (nbp
> 1 && partition_contains_all_rw (rdg
, partitions
)))
3204 if (version_for_distribution_p (&partitions
, &alias_ddrs
))
3205 version_loop_by_alias_check (&partitions
, loop
, &alias_ddrs
);
3207 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3210 "distribute loop <%d> into partitions:\n", loop
->num
);
3211 dump_rdg_partitions (dump_file
, partitions
);
3214 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
3216 if (partition_builtin_p (partition
))
3218 *destroy_p
|= generate_code_for_partition (loop
, partition
, i
< nbp
- 1,
3219 i
== reduction_partition_num
);
3223 loop_nest
.release ();
3224 free_data_refs (datarefs_vec
);
3225 for (hash_table
<ddr_hasher
>::iterator iter
= ddrs_table
->begin ();
3226 iter
!= ddrs_table
->end (); ++iter
)
3228 free_dependence_relation (*iter
);
3233 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
3234 partition_free (partition
);
3237 return nbp
- *nb_calls
;
3241 void loop_distribution::bb_top_order_init (void)
3244 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (cfun
) - NUM_FIXED_BLOCKS
);
3245 edge entry
= single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun
));
3246 bitmap exit_bbs
= BITMAP_ALLOC (NULL
);
3248 bb_top_order_index
= XNEWVEC (int, last_basic_block_for_fn (cfun
));
3249 bb_top_order_index_size
= last_basic_block_for_fn (cfun
);
3251 entry
->flags
&= ~EDGE_DFS_BACK
;
3252 bitmap_set_bit (exit_bbs
, EXIT_BLOCK
);
3253 rpo_num
= rev_post_order_and_mark_dfs_back_seme (cfun
, entry
, exit_bbs
, true,
3255 BITMAP_FREE (exit_bbs
);
3257 for (int i
= 0; i
< rpo_num
; i
++)
3258 bb_top_order_index
[rpo
[i
]] = i
;
3263 void loop_distribution::bb_top_order_destroy ()
3265 free (bb_top_order_index
);
3266 bb_top_order_index
= NULL
;
3267 bb_top_order_index_size
= 0;
3271 /* Given LOOP, this function records seed statements for distribution in
3272 WORK_LIST. Return false if there is nothing for distribution. */
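/* Illustrative only, fenced off from compilation: a source-level view of
   what typically ends up in the seed work list - stores to memory and
   statements whose result is used after the loop.  Hypothetical example:  */
#if 0
int
seed_example (int *a, const int *b, int n)
{
  int last = 0;
  for (int i = 0; i < n; ++i)
    {
      a[i] = b[i] * 2;	/* Store to memory: a seed statement.  */
      last = b[i];	/* Used after the loop: also a seed statement.  */
    }
  return last;
}
#endif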
3275 find_seed_stmts_for_distribution (class loop
*loop
, vec
<gimple
*> *work_list
)
3277 basic_block
*bbs
= get_loop_body_in_dom_order (loop
);
3279 /* Initialize the worklist with stmts we seed the partitions with. */
3280 for (unsigned i
= 0; i
< loop
->num_nodes
; ++i
)
3282 /* In irreducible sub-regions we don't know how to redirect
3283 conditions, so fail. See PR100492. */
3284 if (bbs
[i
]->flags
& BB_IRREDUCIBLE_LOOP
)
3286 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3287 fprintf (dump_file
, "loop %d contains an irreducible region.\n",
3289 work_list
->truncate (0);
3292 for (gphi_iterator gsi
= gsi_start_phis (bbs
[i
]);
3293 !gsi_end_p (gsi
); gsi_next (&gsi
))
3295 gphi
*phi
= gsi
.phi ();
3296 if (virtual_operand_p (gimple_phi_result (phi
)))
	  /* Distribute stmts which have defs that are used outside of
	     the loop.  */
3300 if (!stmt_has_scalar_dependences_outside_loop (loop
, phi
))
3302 work_list
->safe_push (phi
);
3304 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3305 !gsi_end_p (gsi
); gsi_next (&gsi
))
3307 gimple
*stmt
= gsi_stmt (gsi
);
3309 /* Ignore clobbers, they do not have true side effects. */
3310 if (gimple_clobber_p (stmt
))
3313 /* If there is a stmt with side-effects bail out - we
3314 cannot and should not distribute this loop. */
3315 if (gimple_has_side_effects (stmt
))
	  /* Distribute stmts which have defs that are used outside of
	     the loop.  */
3323 if (stmt_has_scalar_dependences_outside_loop (loop
, stmt
))
3325 /* Otherwise only distribute stores for now. */
3326 else if (!gimple_vdef (stmt
))
3329 work_list
->safe_push (stmt
);
3332 bool res
= work_list
->length () > 0;
3333 if (res
&& !can_copy_bbs_p (bbs
, loop
->num_nodes
))
3335 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3336 fprintf (dump_file
, "cannot copy loop %d.\n", loop
->num
);
3343 /* A helper function for generate_{rawmemchr,strlen}_builtin functions in order
3344 to place new statements SEQ before LOOP and replace the old reduction
3345 variable with the new one. */
3348 generate_reduction_builtin_1 (loop_p loop
, gimple_seq
&seq
,
3349 tree reduction_var_old
, tree reduction_var_new
,
3350 const char *info
, machine_mode load_mode
)
3352 gcc_assert (flag_tree_loop_distribute_patterns
);
3354 /* Place new statements before LOOP. */
3355 gimple_stmt_iterator gsi
= gsi_last_bb (loop_preheader_edge (loop
)->src
);
3356 gsi_insert_seq_after (&gsi
, seq
, GSI_CONTINUE_LINKING
);
3358 /* Replace old reduction variable with new one. */
3359 imm_use_iterator iter
;
3361 use_operand_p use_p
;
3362 FOR_EACH_IMM_USE_STMT (stmt
, iter
, reduction_var_old
)
3364 FOR_EACH_IMM_USE_ON_STMT (use_p
, iter
)
3365 SET_USE (use_p
, reduction_var_new
);
3370 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3371 fprintf (dump_file
, info
, GET_MODE_NAME (load_mode
));
3374 /* Generate a call to rawmemchr and place it before LOOP. REDUCTION_VAR is
3375 replaced with a fresh SSA name representing the result of the call. */
3378 generate_rawmemchr_builtin (loop_p loop
, tree reduction_var
,
3379 data_reference_p store_dr
, tree base
, tree pattern
,
3382 gimple_seq seq
= NULL
;
3384 tree mem
= force_gimple_operand (base
, &seq
, true, NULL_TREE
);
3385 gimple
*fn_call
= gimple_build_call_internal (IFN_RAWMEMCHR
, 2, mem
, pattern
);
3386 tree reduction_var_new
= copy_ssa_name (reduction_var
);
3387 gimple_call_set_lhs (fn_call
, reduction_var_new
);
3388 gimple_set_location (fn_call
, loc
);
3389 gimple_seq_add_stmt (&seq
, fn_call
);
3393 gassign
*g
= gimple_build_assign (DR_REF (store_dr
), reduction_var_new
);
3394 gimple_seq_add_stmt (&seq
, g
);
3397 generate_reduction_builtin_1 (loop
, seq
, reduction_var
, reduction_var_new
,
3398 "generated rawmemchr%s\n",
3399 TYPE_MODE (TREE_TYPE (TREE_TYPE (base
))));
3402 /* Helper function for generate_strlen_builtin(,_using_rawmemchr) */
3405 generate_strlen_builtin_1 (loop_p loop
, gimple_seq
&seq
,
3406 tree reduction_var_old
, tree reduction_var_new
,
3407 machine_mode mode
, tree start_len
)
3409 /* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be
3410 converted if types of old and new reduction variable are not compatible. */
3411 reduction_var_new
= gimple_convert (&seq
, TREE_TYPE (reduction_var_old
),
  /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start
     value which must be added to the computed length.  */
3416 if (!integer_zerop (start_len
))
3418 tree lhs
= make_ssa_name (TREE_TYPE (reduction_var_new
));
3419 gimple
*g
= gimple_build_assign (lhs
, PLUS_EXPR
, reduction_var_new
,
3421 gimple_seq_add_stmt (&seq
, g
);
3422 reduction_var_new
= lhs
;
3425 generate_reduction_builtin_1 (loop
, seq
, reduction_var_old
, reduction_var_new
,
3426 "generated strlen%s\n", mode
);
3429 /* Generate a call to strlen and place it before LOOP. REDUCTION_VAR is
3430 replaced with a fresh SSA name representing the result of the call. */
3433 generate_strlen_builtin (loop_p loop
, tree reduction_var
, tree base
,
3434 tree start_len
, location_t loc
)
3436 gimple_seq seq
= NULL
;
3438 tree reduction_var_new
= make_ssa_name (size_type_node
);
3440 tree mem
= force_gimple_operand (base
, &seq
, true, NULL_TREE
);
3441 tree fn
= build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN
));
3442 gimple
*fn_call
= gimple_build_call (fn
, 1, mem
);
3443 gimple_call_set_lhs (fn_call
, reduction_var_new
);
3444 gimple_set_location (fn_call
, loc
);
3445 gimple_seq_add_stmt (&seq
, fn_call
);
3447 generate_strlen_builtin_1 (loop
, seq
, reduction_var
, reduction_var_new
,
3451 /* Generate code in order to mimic the behaviour of strlen but this time over
3452 an array of elements with mode different than QI. REDUCTION_VAR is replaced
3453 with a fresh SSA name representing the result, i.e., the length. */
3456 generate_strlen_builtin_using_rawmemchr (loop_p loop
, tree reduction_var
,
3457 tree base
, tree load_type
,
3458 tree start_len
, location_t loc
)
3460 gimple_seq seq
= NULL
;
3462 tree start
= force_gimple_operand (base
, &seq
, true, NULL_TREE
);
3463 tree zero
= build_zero_cst (load_type
);
3464 gimple
*fn_call
= gimple_build_call_internal (IFN_RAWMEMCHR
, 2, start
, zero
);
3465 tree end
= make_ssa_name (TREE_TYPE (base
));
3466 gimple_call_set_lhs (fn_call
, end
);
3467 gimple_set_location (fn_call
, loc
);
3468 gimple_seq_add_stmt (&seq
, fn_call
);
3470 /* Determine the number of elements between START and END by
3471 evaluating (END - START) / sizeof (*START). */
3472 tree diff
= make_ssa_name (ptrdiff_type_node
);
3473 gimple
*diff_stmt
= gimple_build_assign (diff
, POINTER_DIFF_EXPR
, end
, start
);
3474 gimple_seq_add_stmt (&seq
, diff_stmt
);
3475 /* Let SIZE be the size of each character. */
3476 tree size
= gimple_convert (&seq
, ptrdiff_type_node
,
3477 TYPE_SIZE_UNIT (load_type
));
3478 tree count
= make_ssa_name (ptrdiff_type_node
);
3479 gimple
*count_stmt
= gimple_build_assign (count
, TRUNC_DIV_EXPR
, diff
, size
);
3480 gimple_seq_add_stmt (&seq
, count_stmt
);
3482 generate_strlen_builtin_1 (loop
, seq
, reduction_var
, count
,
3483 TYPE_MODE (load_type
),
3487 /* Return true if we can count at least as many characters by taking pointer
3488 difference as we can count via reduction_var without an overflow. Thus
3489 compute 2^n < (2^(m-1) / s) where n = TYPE_PRECISION (reduction_var_type),
3490 m = TYPE_PRECISION (ptrdiff_type_node), and s = size of each character. */
3492 reduction_var_overflows_first (tree reduction_var_type
, tree load_type
)
3494 widest_int n2
= wi::lshift (1, TYPE_PRECISION (reduction_var_type
));;
3495 widest_int m2
= wi::lshift (1, TYPE_PRECISION (ptrdiff_type_node
) - 1);
3496 widest_int s
= wi::to_widest (TYPE_SIZE_UNIT (load_type
));
3497 return wi::ltu_p (n2
, wi::udiv_trunc (m2
, s
));
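/* Illustrative only, fenced off from compilation: the predicate above,
   2^n < 2^(m-1) / s, written with plain 128-bit arithmetic (a GCC
   extension) for small n and m; GCC itself uses widest_int so arbitrary
   precisions work.  */
#if 0
static bool
reduction_var_overflows_first_sketch (unsigned n, unsigned m, unsigned long s)
{
  unsigned __int128 n2 = (unsigned __int128) 1 << n;	/* Assumes n < 128.  */
  unsigned __int128 m2 = (unsigned __int128) 1 << (m - 1);
  return n2 < m2 / s;
}
#endif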
3501 determine_reduction_stmt_1 (const loop_p loop
, const basic_block
*bbs
)
3503 gimple
*reduction_stmt
= NULL
;
3505 for (unsigned i
= 0, ninsns
= 0; i
< loop
->num_nodes
; ++i
)
3507 basic_block bb
= bbs
[i
];
3509 for (gphi_iterator bsi
= gsi_start_phis (bb
); !gsi_end_p (bsi
);
3510 gsi_next_nondebug (&bsi
))
3512 gphi
*phi
= bsi
.phi ();
3513 if (virtual_operand_p (gimple_phi_result (phi
)))
3515 if (stmt_has_scalar_dependences_outside_loop (loop
, phi
))
3519 reduction_stmt
= phi
;
3523 for (gimple_stmt_iterator bsi
= gsi_start_bb (bb
); !gsi_end_p (bsi
);
3524 gsi_next_nondebug (&bsi
), ++ninsns
)
3526 /* Bail out early for loops which are unlikely to match. */
3529 gimple
*stmt
= gsi_stmt (bsi
);
3530 if (gimple_clobber_p (stmt
))
3532 if (gimple_code (stmt
) == GIMPLE_LABEL
)
3534 if (gimple_has_volatile_ops (stmt
))
3536 if (stmt_has_scalar_dependences_outside_loop (loop
, stmt
))
3540 reduction_stmt
= stmt
;
3545 return reduction_stmt
;
3548 /* If LOOP has a single non-volatile reduction statement, then return a pointer
3549 to it. Otherwise return NULL. */
3551 determine_reduction_stmt (const loop_p loop
)
3553 basic_block
*bbs
= get_loop_body (loop
);
3554 gimple
*reduction_stmt
= determine_reduction_stmt_1 (loop
, bbs
);
3556 return reduction_stmt
;
3559 /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
3560 replace them accordingly. For example, a loop of the form
3562 for (; *p != 42; ++p);
3566 p = rawmemchr<MODE> (p, 42);
3568 under the assumption that rawmemchr is available for a particular MODE.
3572 for (i = 42; s[i]; ++i);
3574 which is replaced by
3576 i = (int)strlen (&s[42]) + 42;
3578 for some character array S. In case array S is not of type character array
3581 i = (int)(rawmemchr<MODE> (&s[42], 0) - &s[42]) + 42;
3583 assuming that rawmemchr is available for a particular MODE. */
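/* Illustrative only, fenced off from compilation: reference semantics of
   the rawmemchr operation the replacement relies on - scan from P for the
   first element equal to PATTERN, with no upper bound on the length (the
   original loop guarantees the pattern is eventually found).  */
#if 0
template <typename T>
static T *
rawmemchr_sketch (T *p, T pattern)
{
  while (*p != pattern)
    ++p;
  return p;
}
#endif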
3586 loop_distribution::transform_reduction_loop (loop_p loop
)
3588 gimple
*reduction_stmt
;
3589 data_reference_p load_dr
= NULL
, store_dr
= NULL
;
3591 edge e
= single_exit (loop
);
3592 gcond
*cond
= safe_dyn_cast
<gcond
*> (*gsi_last_bb (e
->src
));
3595 /* Ensure loop condition is an (in)equality test and loop is exited either if
3596 the inequality test fails or the equality test succeeds. */
3597 if (!(e
->flags
& EDGE_FALSE_VALUE
&& gimple_cond_code (cond
) == NE_EXPR
)
3598 && !(e
->flags
& EDGE_TRUE_VALUE
&& gimple_cond_code (cond
) == EQ_EXPR
))
3600 /* A limitation of the current implementation is that we only support
3601 constant patterns in (in)equality tests. */
3602 tree pattern
= gimple_cond_rhs (cond
);
3603 if (TREE_CODE (pattern
) != INTEGER_CST
)
3606 reduction_stmt
= determine_reduction_stmt (loop
);
3608 /* A limitation of the current implementation is that we require a reduction
3609 statement. Therefore, loops without a reduction statement as in the
3610 following are not recognized:
3612 void foo (void) { for (; *p; ++p); } */
3613 if (reduction_stmt
== NULL
)
3616 /* Reduction variables are guaranteed to be SSA names. */
3618 switch (gimple_code (reduction_stmt
))
3622 reduction_var
= gimple_get_lhs (reduction_stmt
);
3625 /* Bail out e.g. for GIMPLE_CALL. */
3629 struct graph
*rdg
= build_rdg (loop
, NULL
);
3632 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3634 "Loop %d not transformed: failed to build the RDG.\n",
3639 auto_bitmap partition_stmts
;
3640 bitmap_set_range (partition_stmts
, 0, rdg
->n_vertices
);
3641 find_single_drs (loop
, rdg
, partition_stmts
, &store_dr
, &load_dr
);
3644 /* Bail out if there is no single load. */
3645 if (load_dr
== NULL
)
3648 /* Reaching this point we have a loop with a single reduction variable,
3649 a single load, and an optional single store. */
3651 tree load_ref
= DR_REF (load_dr
);
3652 tree load_type
= TREE_TYPE (load_ref
);
3653 tree load_access_base
= build_fold_addr_expr (load_ref
);
3654 tree load_access_size
= TYPE_SIZE_UNIT (load_type
);
3655 affine_iv load_iv
, reduction_iv
;
3657 if (!INTEGRAL_TYPE_P (load_type
)
3658 || !type_has_mode_precision_p (load_type
))
  /* We already ensured that the loop condition tests for (in)equality where
     the rhs is a constant pattern.  Now ensure that the lhs is the result
     of the load.  */
3664 if (gimple_cond_lhs (cond
) != gimple_assign_lhs (DR_STMT (load_dr
)))
  /* Bail out if no affine induction variable with constant step can be
     determined.  */
3669 if (!simple_iv (loop
, loop
, load_access_base
, &load_iv
, false))
3672 /* Bail out if memory accesses are not consecutive or not growing. */
3673 if (!operand_equal_p (load_iv
.step
, load_access_size
, 0))
3676 if (!simple_iv (loop
, loop
, reduction_var
, &reduction_iv
, false))
3679 /* Handle rawmemchr like loops. */
3680 if (operand_equal_p (load_iv
.base
, reduction_iv
.base
)
3681 && operand_equal_p (load_iv
.step
, reduction_iv
.step
))
3685 /* Ensure that we store to X and load from X+I where I>0. */
3686 if (TREE_CODE (load_iv
.base
) != POINTER_PLUS_EXPR
3687 || !integer_onep (TREE_OPERAND (load_iv
.base
, 1)))
3689 tree ptr_base
= TREE_OPERAND (load_iv
.base
, 0);
3690 if (TREE_CODE (ptr_base
) != SSA_NAME
)
3692 gimple
*def
= SSA_NAME_DEF_STMT (ptr_base
);
3693 if (!gimple_assign_single_p (def
)
3694 || gimple_assign_rhs1 (def
) != DR_REF (store_dr
))
3696 /* Ensure that the reduction value is stored. */
3697 if (gimple_assign_rhs1 (DR_STMT (store_dr
)) != reduction_var
)
3700 /* Bail out if target does not provide rawmemchr for a certain mode. */
3701 machine_mode mode
= TYPE_MODE (load_type
);
3702 if (direct_optab_handler (rawmemchr_optab
, mode
) == CODE_FOR_nothing
)
3704 location_t loc
= gimple_location (DR_STMT (load_dr
));
3705 generate_rawmemchr_builtin (loop
, reduction_var
, store_dr
, load_iv
.base
,
3710 /* Handle strlen like loops. */
3711 if (store_dr
== NULL
3712 && integer_zerop (pattern
)
3713 && INTEGRAL_TYPE_P (TREE_TYPE (reduction_var
))
3714 && TREE_CODE (reduction_iv
.base
) == INTEGER_CST
3715 && TREE_CODE (reduction_iv
.step
) == INTEGER_CST
3716 && integer_onep (reduction_iv
.step
))
3718 location_t loc
= gimple_location (DR_STMT (load_dr
));
3719 tree reduction_var_type
= TREE_TYPE (reduction_var
);
3720 /* While determining the length of a string an overflow might occur.
3721 If an overflow only occurs in the loop implementation and not in the
3722 strlen implementation, then either the overflow is undefined or the
3723 truncated result of strlen equals the one of the loop. Otherwise if
3724 an overflow may also occur in the strlen implementation, then
3725 replacing a loop by a call to strlen is sound whenever we ensure that
3726 if an overflow occurs in the strlen implementation, then also an
3727 overflow occurs in the loop implementation which is undefined. It
3728 seems reasonable to relax this and assume that the strlen
3729 implementation cannot overflow in case sizetype is big enough in the
3730 sense that an overflow can only happen for string objects which are
3731 bigger than half of the address space; at least for 32-bit targets and
3734 For strlen which makes use of rawmemchr the maximal length of a string
3735 which can be determined without an overflow is PTRDIFF_MAX / S where
3736 each character has size S. Since an overflow for ptrdiff type is
3737 undefined we have to make sure that if an overflow occurs, then an
3738 overflow occurs in the loop implementation, too, and this is
3739 undefined, too. Similar as before we relax this and assume that no
3740 string object is larger than half of the address space; at least for
3741 32-bit targets and up. */
3742 if (TYPE_MODE (load_type
) == TYPE_MODE (char_type_node
)
3743 && TYPE_PRECISION (load_type
) == TYPE_PRECISION (char_type_node
)
3744 && ((TYPE_PRECISION (sizetype
) >= TYPE_PRECISION (ptr_type_node
) - 1
3745 && TYPE_PRECISION (ptr_type_node
) >= 32)
3746 || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type
)
3747 && TYPE_PRECISION (reduction_var_type
) <= TYPE_PRECISION (sizetype
)))
3748 && builtin_decl_implicit (BUILT_IN_STRLEN
))
3749 generate_strlen_builtin (loop
, reduction_var
, load_iv
.base
,
3750 reduction_iv
.base
, loc
);
3751 else if (direct_optab_handler (rawmemchr_optab
, TYPE_MODE (load_type
))
3753 && ((TYPE_PRECISION (ptrdiff_type_node
) == TYPE_PRECISION (ptr_type_node
)
3754 && TYPE_PRECISION (ptrdiff_type_node
) >= 32)
3755 || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type
)
3756 && reduction_var_overflows_first (reduction_var_type
, load_type
))))
3757 generate_strlen_builtin_using_rawmemchr (loop
, reduction_var
,
3760 reduction_iv
.base
, loc
);
3769 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3770 perfect loop nest. */
3773 prepare_perfect_loop_nest (class loop
*loop
)
3775 class loop
*outer
= loop_outer (loop
);
3776 tree niters
= number_of_latch_executions (loop
);
3778 /* TODO: We only support the innermost 3-level loop nest distribution
3779 because of compilation time issue for now. This should be relaxed
3780 in the future. Note we only allow 3-level loop nest distribution
3781 when parallelizing loops. */
3782 while ((loop
->inner
== NULL
3783 || (loop
->inner
->inner
== NULL
&& flag_tree_parallelize_loops
> 1))
3784 && loop_outer (outer
)
3785 && outer
->inner
== loop
&& loop
->next
== NULL
3786 && single_exit (outer
)
3787 && !chrec_contains_symbols_defined_in_loop (niters
, outer
->num
)
3788 && (niters
= number_of_latch_executions (outer
)) != NULL_TREE
3789 && niters
!= chrec_dont_know
)
3792 outer
= loop_outer (loop
);
3800 loop_distribution::execute (function
*fun
)
3802 bool changed
= false;
3804 control_dependences
*cd
= NULL
;
3805 auto_vec
<loop_p
> loops_to_be_destroyed
;
3807 if (number_of_loops (fun
) <= 1)
3810 bb_top_order_init ();
3812 FOR_ALL_BB_FN (bb
, fun
)
3814 gimple_stmt_iterator gsi
;
3815 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
3816 gimple_set_uid (gsi_stmt (gsi
), -1);
3817 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
3818 gimple_set_uid (gsi_stmt (gsi
), -1);
3821 /* We can at the moment only distribute non-nested loops, thus restrict
3822 walking to innermost loops. */
3823 for (auto loop
: loops_list (cfun
, LI_ONLY_INNERMOST
))
      /* Don't distribute loops with multiple exit edges, or cold loops when
	 not doing pattern detection.  */
3827 if (!single_exit (loop
)
3828 || (!flag_tree_loop_distribute_patterns
3829 && !optimize_loop_for_speed_p (loop
)))
3832 /* If niters is unknown don't distribute loop but rather try to transform
3833 it to a call to a builtin. */
3834 tree niters
= number_of_latch_executions (loop
);
3835 if (niters
== NULL_TREE
|| niters
== chrec_dont_know
)
3837 datarefs_vec
.create (20);
3838 if (flag_tree_loop_distribute_patterns
3839 && transform_reduction_loop (loop
))
3842 loops_to_be_destroyed
.safe_push (loop
);
3843 if (dump_enabled_p ())
3845 dump_user_location_t loc
= find_loop_location (loop
);
3846 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
,
3847 loc
, "Loop %d transformed into a builtin.\n",
3851 free_data_refs (datarefs_vec
);
3855 /* Get the perfect loop nest for distribution. */
3856 loop
= prepare_perfect_loop_nest (loop
);
3857 for (; loop
; loop
= loop
->inner
)
3859 auto_vec
<gimple
*> work_list
;
3860 if (!find_seed_stmts_for_distribution (loop
, &work_list
))
3863 const char *str
= loop
->inner
? " nest" : "";
3864 dump_user_location_t loc
= find_loop_location (loop
);
3867 calculate_dominance_info (CDI_DOMINATORS
);
3868 calculate_dominance_info (CDI_POST_DOMINATORS
);
3869 cd
= new control_dependences ();
3870 free_dominance_info (CDI_POST_DOMINATORS
);
3874 int nb_generated_loops
, nb_generated_calls
;
3875 bool only_patterns
= !optimize_loop_for_speed_p (loop
)
3876 || !flag_tree_loop_distribution
;
3877 /* do not try to distribute loops that are not expected to iterate. */
3880 HOST_WIDE_INT iterations
= estimated_loop_iterations_int (loop
);
3882 iterations
= likely_max_loop_iterations_int (loop
);
3884 only_patterns
= true;
3887 = distribute_loop (loop
, work_list
, cd
, &nb_generated_calls
,
3888 &destroy_p
, only_patterns
);
3890 loops_to_be_destroyed
.safe_push (loop
);
3892 if (nb_generated_loops
+ nb_generated_calls
> 0)
3895 if (dump_enabled_p ())
3896 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
,
3897 loc
, "Loop%s %d distributed: split to %d loops "
3898 "and %d library calls.\n", str
, loop
->num
,
3899 nb_generated_loops
, nb_generated_calls
);
3904 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3905 fprintf (dump_file
, "Loop%s %d not distributed.\n", str
, loop
->num
);
3912 if (bb_top_order_index
!= NULL
)
3913 bb_top_order_destroy ();
  /* Destroy loop bodies that could not be reused.  Do this late as we
     otherwise can end up referring to stale data in control dependences.  */
3921 FOR_EACH_VEC_ELT (loops_to_be_destroyed
, i
, loop
)
3922 destroy_loop (loop
);
  /* Cached scalar evolutions now may refer to wrong or non-existing
     loops.  */
3927 mark_virtual_operands_for_renaming (fun
);
3928 rewrite_into_loop_closed_ssa (NULL
, TODO_update_ssa
);
3931 checking_verify_loop_structure ();
3933 return changed
? TODO_cleanup_cfg
: 0;
3937 /* Distribute all loops in the current function. */
3941 const pass_data pass_data_loop_distribution
=
3943 GIMPLE_PASS
, /* type */
3945 OPTGROUP_LOOP
, /* optinfo_flags */
3946 TV_TREE_LOOP_DISTRIBUTION
, /* tv_id */
3947 ( PROP_cfg
| PROP_ssa
), /* properties_required */
3948 0, /* properties_provided */
3949 0, /* properties_destroyed */
3950 0, /* todo_flags_start */
3951 0, /* todo_flags_finish */
3954 class pass_loop_distribution
: public gimple_opt_pass
3957 pass_loop_distribution (gcc::context
*ctxt
)
3958 : gimple_opt_pass (pass_data_loop_distribution
, ctxt
)
3961 /* opt_pass methods: */
3962 bool gate (function
*) final override
3964 return flag_tree_loop_distribution
3965 || flag_tree_loop_distribute_patterns
;
3968 unsigned int execute (function
*) final override
;
3970 }; // class pass_loop_distribution
3973 pass_loop_distribution::execute (function
*fun
)
3975 return loop_distribution ().execute (fun
);
3981 make_pass_loop_distribution (gcc::context
*ctxt
)
3983 return new pass_loop_distribution (ctxt
);