gcc/tree-loop-distribution.c
1 /* Loop distribution.
2 Copyright (C) 2006-2017 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 /* This pass performs loop distribution: for example, the loop
24 |DO I = 2, N
25 | A(I) = B(I) + C
26 | D(I) = A(I-1)*E
27 |ENDDO
29 is transformed to
31 |DOALL I = 2, N
32 | A(I) = B(I) + C
33 |ENDDO
35 |DOALL I = 2, N
36 | D(I) = A(I-1)*E
37 |ENDDO
39 Loop distribution is the dual of loop fusion. It separates statements
40 of a loop (or loop nest) into multiple loops (or loop nests) with the
41 same loop header. The major goal is to separate statements that can
42 be vectorized from those that can't. This pass implements distribution
43 in the following steps:
45 1) Seed partitions with statements of specific types. For now we support
46 two types of seed statements: statements defining a variable used
47 outside of the loop, and statements storing to memory.
48 2) Build reduced dependence graph (RDG) for loop to be distributed.
49 The vertices (RDG:V) model all statements in the loop and the edges
50 (RDG:E) model flow and control dependencies between statements.
51 3) Apart from RDG, compute data dependencies between memory references.
52 4) Starting from a seed statement, build up a partition by adding
53 dependent statements according to the RDG's dependence information.
54 A partition is classified as parallel type if it can be executed in
55 parallel, or as sequential type if it can't. A parallel type partition
56 is further classified into different builtin kinds if it can be
57 implemented as builtin function calls.
58 5) Build partition dependence graph (PG) based on data dependencies.
59 The vertices (PG:V) model all partitions and the edges (PG:E) model
60 all data dependencies between every pair of partitions. In general,
61 a data dependence is either known or unknown at compilation time. In
62 C family languages, there are quite a few compilation time unknown
63 dependencies because of possible aliasing between data references.
64 We categorize PG's edges into two types: "true" edges, which represent
65 compilation time known data dependencies, and "alias" edges for all
66 other data dependencies.
67 6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
68 Merge the partitions in each strongly connected component (SCC)
69 correspondingly. Build a new PG for the merged partitions.
70 7) Traverse PG again, this time with both "true" and "alias" edges
71 included. We try to break SCCs by removing some edges. Because
72 SCCs formed by "true" edges were all fused in step 6), we can break
73 SCCs by removing only "alias" edges. It's NP-hard to choose an
74 optimal edge set; fortunately a simple approximation is good enough
75 for us given the small problem scale.
76 8) Collect all data dependencies of the removed "alias" edges. Create
77 runtime alias checks for collected data dependencies.
78 9) Version the loop under the condition of the runtime alias checks.
79 Since loop distribution generally introduces additional overhead, it
80 is only useful if vectorization is achieved in a distributed loop.
81 We version the loop with the internal function call
82 IFN_LOOP_DIST_ALIAS. If no distributed loop can be vectorized, we
83 simply remove the distributed loops and fall back to the original one.
85 TODO:
86 1) We only distribute innermost two-level loop nests now. We should
87 extend this to arbitrary loop nests in the future.
88 2) We only fuse partitions in SCCs now. A better fusion algorithm is
89 desired to minimize loop overhead, maximize parallelism and maximize
90 data reuse. */
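/* Illustrative sketch added by an editor (not part of the original
   sources): conceptually, step 9) versions the example above as

     if (IFN_LOOP_DIST_ALIAS (<id>, <runtime alias checks>))
       {
         DOALL I = 2, N          ! the distributed loops
           A(I) = B(I) + C
         DOALL I = 2, N
           D(I) = A(I-1)*E
       }
     else
       <the original DO loop>

   so that the distributed copies can be removed later if none of them
   turns out to be vectorizable.  */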
92 #include "config.h"
93 #include "system.h"
94 #include "coretypes.h"
95 #include "backend.h"
96 #include "tree.h"
97 #include "gimple.h"
98 #include "cfghooks.h"
99 #include "tree-pass.h"
100 #include "ssa.h"
101 #include "gimple-pretty-print.h"
102 #include "fold-const.h"
103 #include "cfganal.h"
104 #include "gimple-iterator.h"
105 #include "gimplify-me.h"
106 #include "stor-layout.h"
107 #include "tree-cfg.h"
108 #include "tree-ssa-loop-manip.h"
109 #include "tree-ssa-loop.h"
110 #include "tree-into-ssa.h"
111 #include "tree-ssa.h"
112 #include "cfgloop.h"
113 #include "tree-scalar-evolution.h"
114 #include "params.h"
115 #include "tree-vectorizer.h"
118 #define MAX_DATAREFS_NUM \
119 ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
121 /* Threshold controlling the number of distributed partitions. Since it
122 may become unnecessary if a memory stream cost model is invented in the
123 future, we define it as a temporary macro rather than a parameter. */
124 #define NUM_PARTITION_THRESHOLD (4)
126 /* Hashtable helpers. */
128 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
130 static inline hashval_t hash (const data_dependence_relation *);
131 static inline bool equal (const data_dependence_relation *,
132 const data_dependence_relation *);
135 /* Hash function for data dependence. */
137 inline hashval_t
138 ddr_hasher::hash (const data_dependence_relation *ddr)
140 inchash::hash h;
141 h.add_ptr (DDR_A (ddr));
142 h.add_ptr (DDR_B (ddr));
143 return h.end ();
146 /* Hash table equality function for data dependence. */
148 inline bool
149 ddr_hasher::equal (const data_dependence_relation *ddr1,
150 const data_dependence_relation *ddr2)
152 return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
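/* Editor's note (not from the original sources): since the hash and
   equality functions above only look at the DR_A/DR_B pointers, a
   lookup needs nothing more than a stack-allocated key with those two
   fields set:

     struct data_dependence_relation ent;
     ent.a = dr_a;
     ent.b = dr_b;
     slot = ddrs_table->find_slot (&ent, INSERT);

   This is exactly the pattern used by get_data_dependence below.  */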
155 /* The loop (nest) to be distributed. */
156 static vec<loop_p> loop_nest;
158 /* Vector of data references in the loop to be distributed. */
159 static vec<data_reference_p> datarefs_vec;
161 /* Store index of data reference in aux field. */
162 #define DR_INDEX(dr) ((uintptr_t) (dr)->aux)
164 /* Hash table for data dependence relation in the loop to be distributed. */
165 static hash_table<ddr_hasher> *ddrs_table;
167 /* A Reduced Dependence Graph (RDG) vertex representing a statement. */
168 struct rdg_vertex
170 /* The statement represented by this vertex. */
171 gimple *stmt;
173 /* Vector of data-references in this statement. */
174 vec<data_reference_p> datarefs;
176 /* True when the statement contains a write to memory. */
177 bool has_mem_write;
179 /* True when the statement contains a read from memory. */
180 bool has_mem_reads;
183 #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
184 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
185 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
186 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
187 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
188 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
189 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
190 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
192 /* Data dependence type. */
194 enum rdg_dep_type
196 /* Read After Write (RAW). */
197 flow_dd = 'f',
199 /* Control dependence (execute conditional on). */
200 control_dd = 'c'
203 /* Dependence information attached to an edge of the RDG. */
205 struct rdg_edge
207 /* Type of the dependence. */
208 enum rdg_dep_type type;
211 #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type
213 /* Dump vertex I in RDG to FILE. */
215 static void
216 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
218 struct vertex *v = &(rdg->vertices[i]);
219 struct graph_edge *e;
221 fprintf (file, "(vertex %d: (%s%s) (in:", i,
222 RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
223 RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
225 if (v->pred)
226 for (e = v->pred; e; e = e->pred_next)
227 fprintf (file, " %d", e->src);
229 fprintf (file, ") (out:");
231 if (v->succ)
232 for (e = v->succ; e; e = e->succ_next)
233 fprintf (file, " %d", e->dest);
235 fprintf (file, ")\n");
236 print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
237 fprintf (file, ")\n");
240 /* Call dump_rdg_vertex on stderr. */
242 DEBUG_FUNCTION void
243 debug_rdg_vertex (struct graph *rdg, int i)
245 dump_rdg_vertex (stderr, rdg, i);
248 /* Dump the reduced dependence graph RDG to FILE. */
250 static void
251 dump_rdg (FILE *file, struct graph *rdg)
253 fprintf (file, "(rdg\n");
254 for (int i = 0; i < rdg->n_vertices; i++)
255 dump_rdg_vertex (file, rdg, i);
256 fprintf (file, ")\n");
259 /* Call dump_rdg on stderr. */
261 DEBUG_FUNCTION void
262 debug_rdg (struct graph *rdg)
264 dump_rdg (stderr, rdg);
267 static void
268 dot_rdg_1 (FILE *file, struct graph *rdg)
270 int i;
271 pretty_printer buffer;
272 pp_needs_newline (&buffer) = false;
273 buffer.buffer->stream = file;
275 fprintf (file, "digraph RDG {\n");
277 for (i = 0; i < rdg->n_vertices; i++)
279 struct vertex *v = &(rdg->vertices[i]);
280 struct graph_edge *e;
282 fprintf (file, "%d [label=\"[%d] ", i, i);
283 pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
284 pp_flush (&buffer);
285 fprintf (file, "\"]\n");
287 /* Highlight reads from memory. */
288 if (RDG_MEM_READS_STMT (rdg, i))
289 fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
291 /* Highlight stores to memory. */
292 if (RDG_MEM_WRITE_STMT (rdg, i))
293 fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
295 if (v->succ)
296 for (e = v->succ; e; e = e->succ_next)
297 switch (RDGE_TYPE (e))
299 case flow_dd:
300 /* These are the most common dependences: don't print these. */
301 fprintf (file, "%d -> %d \n", i, e->dest);
302 break;
304 case control_dd:
305 fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
306 break;
308 default:
309 gcc_unreachable ();
313 fprintf (file, "}\n\n");
316 /* Display the Reduced Dependence Graph using dotty. */
318 DEBUG_FUNCTION void
319 dot_rdg (struct graph *rdg)
321 /* When debugging, you may want to enable the following code. */
322 #ifdef HAVE_POPEN
323 FILE *file = popen ("dot -Tx11", "w");
324 if (!file)
325 return;
326 dot_rdg_1 (file, rdg);
327 fflush (file);
328 close (fileno (file));
329 pclose (file);
330 #else
331 dot_rdg_1 (stderr, rdg);
332 #endif
335 /* Returns the index of STMT in RDG. */
337 static int
338 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
340 int index = gimple_uid (stmt);
341 gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
342 return index;
345 /* Creates dependence edges in RDG for all the uses of DEF. IDEF is
346 the index of DEF in RDG. */
348 static void
349 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
351 use_operand_p imm_use_p;
352 imm_use_iterator iterator;
354 FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
356 struct graph_edge *e;
357 int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
359 if (use < 0)
360 continue;
362 e = add_edge (rdg, idef, use);
363 e->data = XNEW (struct rdg_edge);
364 RDGE_TYPE (e) = flow_dd;
368 /* Creates an edge for the control dependences of BB to the vertex V. */
370 static void
371 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
372 int v, control_dependences *cd)
374 bitmap_iterator bi;
375 unsigned edge_n;
376 EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
377 0, edge_n, bi)
379 basic_block cond_bb = cd->get_edge_src (edge_n);
380 gimple *stmt = last_stmt (cond_bb);
381 if (stmt && is_ctrl_stmt (stmt))
383 struct graph_edge *e;
384 int c = rdg_vertex_for_stmt (rdg, stmt);
385 if (c < 0)
386 continue;
388 e = add_edge (rdg, c, v);
389 e->data = XNEW (struct rdg_edge);
390 RDGE_TYPE (e) = control_dd;
395 /* Creates the edges of the reduced dependence graph RDG. */
397 static void
398 create_rdg_flow_edges (struct graph *rdg)
400 int i;
401 def_operand_p def_p;
402 ssa_op_iter iter;
404 for (i = 0; i < rdg->n_vertices; i++)
405 FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
406 iter, SSA_OP_DEF)
407 create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
410 /* Creates the control dependence edges of the reduced dependence graph RDG. */
412 static void
413 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
415 int i;
417 for (i = 0; i < rdg->n_vertices; i++)
419 gimple *stmt = RDG_STMT (rdg, i);
420 if (gimple_code (stmt) == GIMPLE_PHI)
422 edge_iterator ei;
423 edge e;
424 FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
425 if (flow_bb_inside_loop_p (loop, e->src))
426 create_edge_for_control_dependence (rdg, e->src, i, cd);
428 else
429 create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
433 /* Build the vertices of the reduced dependence graph RDG. Return false
434 if that failed. */
436 static bool
437 create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
439 int i;
440 gimple *stmt;
442 FOR_EACH_VEC_ELT (stmts, i, stmt)
444 struct vertex *v = &(rdg->vertices[i]);
446 /* Record statement to vertex mapping. */
447 gimple_set_uid (stmt, i);
449 v->data = XNEW (struct rdg_vertex);
450 RDGV_STMT (v) = stmt;
451 RDGV_DATAREFS (v).create (0);
452 RDGV_HAS_MEM_WRITE (v) = false;
453 RDGV_HAS_MEM_READS (v) = false;
454 if (gimple_code (stmt) == GIMPLE_PHI)
455 continue;
457 unsigned drp = datarefs_vec.length ();
458 if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
459 return false;
460 for (unsigned j = drp; j < datarefs_vec.length (); ++j)
462 data_reference_p dr = datarefs_vec[j];
463 if (DR_IS_READ (dr))
464 RDGV_HAS_MEM_READS (v) = true;
465 else
466 RDGV_HAS_MEM_WRITE (v) = true;
467 RDGV_DATAREFS (v).safe_push (dr);
470 return true;
473 /* Array mapping basic block's index to its topological order. */
474 static int *bb_top_order_index;
475 /* And size of the array. */
476 static int bb_top_order_index_size;
478 /* If X has a smaller topological sort number than Y, returns -1;
479 if greater, returns 1. */
481 static int
482 bb_top_order_cmp (const void *x, const void *y)
484 basic_block bb1 = *(const basic_block *) x;
485 basic_block bb2 = *(const basic_block *) y;
487 gcc_assert (bb1->index < bb_top_order_index_size
488 && bb2->index < bb_top_order_index_size);
489 gcc_assert (bb1 == bb2
490 || bb_top_order_index[bb1->index]
491 != bb_top_order_index[bb2->index]);
493 return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
496 /* Initialize STMTS with all the statements of LOOP. We use topological
497 order to discover all statements. The order is important because
498 generate_loops_for_partition is using the same traversal for identifying
499 statements in loop copies. */
501 static void
502 stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
504 unsigned int i;
505 basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);
507 for (i = 0; i < loop->num_nodes; i++)
509 basic_block bb = bbs[i];
511 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
512 gsi_next (&bsi))
513 if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
514 stmts->safe_push (bsi.phi ());
516 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
517 gsi_next (&bsi))
519 gimple *stmt = gsi_stmt (bsi);
520 if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
521 stmts->safe_push (stmt);
525 free (bbs);
528 /* Free the reduced dependence graph RDG. */
530 static void
531 free_rdg (struct graph *rdg)
533 int i;
535 for (i = 0; i < rdg->n_vertices; i++)
537 struct vertex *v = &(rdg->vertices[i]);
538 struct graph_edge *e;
540 for (e = v->succ; e; e = e->succ_next)
541 free (e->data);
543 if (v->data)
545 gimple_set_uid (RDGV_STMT (v), -1);
546 (RDGV_DATAREFS (v)).release ();
547 free (v->data);
551 free_graph (rdg);
554 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
555 LOOP, and one edge per flow dependence or control dependence from control
556 dependence CD. While visiting each statement, data references are also
557 collected and recorded in global data DATAREFS_VEC. */
559 static struct graph *
560 build_rdg (struct loop *loop, control_dependences *cd)
562 struct graph *rdg;
564 /* Create the RDG vertices from the stmts of the loop nest. */
565 auto_vec<gimple *, 10> stmts;
566 stmts_from_loop (loop, &stmts);
567 rdg = new_graph (stmts.length ());
568 if (!create_rdg_vertices (rdg, stmts, loop))
570 free_rdg (rdg);
571 return NULL;
573 stmts.release ();
575 create_rdg_flow_edges (rdg);
576 if (cd)
577 create_rdg_cd_edges (rdg, cd, loop);
579 return rdg;
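/* Editor's usage sketch (an assumption about the caller, not from the
   original sources): a typical caller pairs build_rdg with free_rdg and
   releases the global state the function fills in as a side effect,
   roughly

     datarefs_vec.create (20);
     struct graph *rdg = build_rdg (loop, cd);
     if (rdg != NULL)
       {
         ... seed statements, build and fuse partitions ...
         free_rdg (rdg);
       }
     free_data_refs (datarefs_vec);

   since build_rdg records all data references of the loop in the
   global DATAREFS_VEC.  */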
583 /* Kind of distributed loop. */
584 enum partition_kind {
585 PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
588 /* Type of distributed loop. */
589 enum partition_type {
590 /* The distributed loop can be executed in parallel. */
591 PTYPE_PARALLEL = 0,
592 /* The distributed loop has to be executed sequentially. */
593 PTYPE_SEQUENTIAL
596 /* Builtin info for loop distribution. */
597 struct builtin_info
599 /* Data references that a kind != PKIND_NORMAL partition is about. */
600 data_reference_p dst_dr;
601 data_reference_p src_dr;
602 /* Base address and size of the memory objects operated on by the builtin.
603 Note both destination and source memory objects must have the same size. */
604 tree dst_base;
605 tree src_base;
606 tree size;
609 /* Partition for loop distribution. */
610 struct partition
612 /* Statements of the partition. */
613 bitmap stmts;
614 /* True if the partition defines a variable which is used outside of the loop. */
615 bool reduction_p;
616 enum partition_kind kind;
617 enum partition_type type;
618 /* Data references in the partition. */
619 bitmap datarefs;
620 /* Information for a builtin partition. */
621 struct builtin_info *builtin;
625 /* Allocate and initialize a partition. */
627 static partition *
628 partition_alloc (void)
630 partition *partition = XCNEW (struct partition);
631 partition->stmts = BITMAP_ALLOC (NULL);
632 partition->reduction_p = false;
633 partition->kind = PKIND_NORMAL;
634 partition->datarefs = BITMAP_ALLOC (NULL);
635 return partition;
638 /* Free PARTITION. */
640 static void
641 partition_free (partition *partition)
643 BITMAP_FREE (partition->stmts);
644 BITMAP_FREE (partition->datarefs);
645 if (partition->builtin)
646 free (partition->builtin);
648 free (partition);
651 /* Returns true if the partition can be generated as a builtin. */
653 static bool
654 partition_builtin_p (partition *partition)
656 return partition->kind != PKIND_NORMAL;
659 /* Returns true if the partition contains a reduction. */
661 static bool
662 partition_reduction_p (partition *partition)
664 return partition->reduction_p;
667 /* Partitions are fused for different reasons. */
668 enum fuse_type
670 FUSE_NON_BUILTIN = 0,
671 FUSE_REDUCTION = 1,
672 FUSE_SHARE_REF = 2,
673 FUSE_SAME_SCC = 3,
674 FUSE_FINALIZE = 4
677 /* Descriptions of the different fusing reasons. */
678 static const char *fuse_message[] = {
679 "they are non-builtins",
680 "they have reductions",
681 "they have shared memory refs",
682 "they are in the same dependence scc",
683 "there is no point to distribute loop"};
685 static void
686 update_type_for_merge (struct graph *, partition *, partition *);
688 /* Merge PARTITION into the partition DEST. RDG is the reduced dependence
689 graph; if it is non-NULL we update the type of the result partition. */
691 static void
692 partition_merge_into (struct graph *rdg, partition *dest,
693 partition *partition, enum fuse_type ft)
695 if (dump_file && (dump_flags & TDF_DETAILS))
697 fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
698 fprintf (dump_file, " Part 1: ");
699 dump_bitmap (dump_file, dest->stmts);
700 fprintf (dump_file, " Part 2: ");
701 dump_bitmap (dump_file, partition->stmts);
704 dest->kind = PKIND_NORMAL;
705 if (dest->type == PTYPE_PARALLEL)
706 dest->type = partition->type;
708 bitmap_ior_into (dest->stmts, partition->stmts);
709 if (partition_reduction_p (partition))
710 dest->reduction_p = true;
712 /* Further check if any data dependence prevents us from executing the
713 new partition in parallel. */
714 if (dest->type == PTYPE_PARALLEL && rdg != NULL)
715 update_type_for_merge (rdg, dest, partition);
717 bitmap_ior_into (dest->datarefs, partition->datarefs);
721 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
722 the LOOP. */
724 static bool
725 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
727 imm_use_iterator imm_iter;
728 use_operand_p use_p;
730 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
732 if (is_gimple_debug (USE_STMT (use_p)))
733 continue;
735 basic_block use_bb = gimple_bb (USE_STMT (use_p));
736 if (!flow_bb_inside_loop_p (loop, use_bb))
737 return true;
740 return false;
743 /* Returns true when STMT defines a scalar variable used after the
744 loop LOOP. */
746 static bool
747 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
749 def_operand_p def_p;
750 ssa_op_iter op_iter;
752 if (gimple_code (stmt) == GIMPLE_PHI)
753 return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
755 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
756 if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
757 return true;
759 return false;
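/* Editor's example (not from the original sources): in

     sum = 0;
     for (i = 0; i < n; i++)
       sum = sum + a[i];
     use (sum);

   the definition of SUM inside the loop has a scalar use after the
   loop, so the function above returns true for that statement and its
   partition will later be marked as a reduction partition.  */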
762 /* Return a copy of LOOP placed before the original LOOP. */
764 static struct loop *
765 copy_loop_before (struct loop *loop)
767 struct loop *res;
768 edge preheader = loop_preheader_edge (loop);
770 initialize_original_copy_tables ();
771 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
772 gcc_assert (res != NULL);
773 free_original_copy_tables ();
774 delete_update_ssa ();
776 return res;
779 /* Creates an empty basic block after LOOP. */
781 static void
782 create_bb_after_loop (struct loop *loop)
784 edge exit = single_exit (loop);
786 if (!exit)
787 return;
789 split_edge (exit);
792 /* Generate code for PARTITION from the code in LOOP. The loop is
793 copied when COPY_P is true. All the statements not flagged in the
794 PARTITION bitmap are removed from the loop or from its copy. The
795 statements are indexed in sequence inside a basic block, and the
796 basic blocks of a loop are taken in dom order. */
798 static void
799 generate_loops_for_partition (struct loop *loop, partition *partition,
800 bool copy_p)
802 unsigned i;
803 basic_block *bbs;
805 if (copy_p)
807 int orig_loop_num = loop->orig_loop_num;
808 loop = copy_loop_before (loop);
809 gcc_assert (loop != NULL);
810 loop->orig_loop_num = orig_loop_num;
811 create_preheader (loop, CP_SIMPLE_PREHEADERS);
812 create_bb_after_loop (loop);
814 else
816 /* Origin number is set to the new versioned loop's num. */
817 gcc_assert (loop->orig_loop_num != loop->num);
820 /* Remove stmts not in the PARTITION bitmap. */
821 bbs = get_loop_body_in_dom_order (loop);
823 if (MAY_HAVE_DEBUG_STMTS)
824 for (i = 0; i < loop->num_nodes; i++)
826 basic_block bb = bbs[i];
828 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
829 gsi_next (&bsi))
831 gphi *phi = bsi.phi ();
832 if (!virtual_operand_p (gimple_phi_result (phi))
833 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
834 reset_debug_uses (phi);
837 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
839 gimple *stmt = gsi_stmt (bsi);
840 if (gimple_code (stmt) != GIMPLE_LABEL
841 && !is_gimple_debug (stmt)
842 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
843 reset_debug_uses (stmt);
847 for (i = 0; i < loop->num_nodes; i++)
849 basic_block bb = bbs[i];
850 edge inner_exit = NULL;
852 if (loop != bb->loop_father)
853 inner_exit = single_exit (bb->loop_father);
855 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
857 gphi *phi = bsi.phi ();
858 if (!virtual_operand_p (gimple_phi_result (phi))
859 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
860 remove_phi_node (&bsi, true);
861 else
862 gsi_next (&bsi);
865 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
867 gimple *stmt = gsi_stmt (bsi);
868 if (gimple_code (stmt) != GIMPLE_LABEL
869 && !is_gimple_debug (stmt)
870 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
872 /* When distributing a loop nest, if bb is the inner loop's exit_bb,
873 we choose its exit edge/path in order to avoid generating an
874 infinite loop. In all other cases, we choose an arbitrary
875 path through the empty CFG part that this unnecessary
876 control stmt controls. */
877 if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
879 if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
880 gimple_cond_make_true (cond_stmt);
881 else
882 gimple_cond_make_false (cond_stmt);
883 update_stmt (stmt);
885 else if (gimple_code (stmt) == GIMPLE_SWITCH)
887 gswitch *switch_stmt = as_a <gswitch *> (stmt);
888 gimple_switch_set_index
889 (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
890 update_stmt (stmt);
892 else
894 unlink_stmt_vdef (stmt);
895 gsi_remove (&bsi, true);
896 release_defs (stmt);
897 continue;
900 gsi_next (&bsi);
904 free (bbs);
907 /* If VAL's memory representation contains the same value in all bytes,
908 return that value, otherwise return -1.
909 E.g. for 0x24242424 return 0x24, for the IEEE double
910 747708026454360457216.0 return 0x44, etc. */
912 static int
913 const_with_all_bytes_same (tree val)
915 unsigned char buf[64];
916 int i, len;
918 if (integer_zerop (val)
919 || (TREE_CODE (val) == CONSTRUCTOR
920 && !TREE_CLOBBER_P (val)
921 && CONSTRUCTOR_NELTS (val) == 0))
922 return 0;
924 if (real_zerop (val))
926 /* Only return 0 for +0.0, not for -0.0, which doesn't have
927 an all bytes same memory representation. Don't transform
928 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */
929 switch (TREE_CODE (val))
931 case REAL_CST:
932 if (!real_isneg (TREE_REAL_CST_PTR (val)))
933 return 0;
934 break;
935 case COMPLEX_CST:
936 if (!const_with_all_bytes_same (TREE_REALPART (val))
937 && !const_with_all_bytes_same (TREE_IMAGPART (val)))
938 return 0;
939 break;
940 case VECTOR_CST:
941 unsigned int j;
942 for (j = 0; j < VECTOR_CST_NELTS (val); ++j)
943 if (const_with_all_bytes_same (VECTOR_CST_ELT (val, j)))
944 break;
945 if (j == VECTOR_CST_NELTS (val))
946 return 0;
947 break;
948 default:
949 break;
953 if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
954 return -1;
956 len = native_encode_expr (val, buf, sizeof (buf));
957 if (len == 0)
958 return -1;
959 for (i = 1; i < len; i++)
960 if (buf[i] != buf[0])
961 return -1;
962 return buf[0];
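/* Editor's worked example: for a 32-bit integer VAL = 0x24242424 the
   native encoding is the four bytes { 0x24, 0x24, 0x24, 0x24 }, so the
   loop above sees buf[i] == buf[0] everywhere and returns 0x24. For
   VAL = 0x12345678 the bytes differ and the result is -1, as it is for
   -0.0, whose sign bit makes the encoding non-uniform.  */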
965 /* Generate a call to memset for PARTITION in LOOP. */
967 static void
968 generate_memset_builtin (struct loop *loop, partition *partition)
970 gimple_stmt_iterator gsi;
971 tree mem, fn, nb_bytes;
972 tree val;
973 struct builtin_info *builtin = partition->builtin;
974 gimple *fn_call;
976 /* The new statements will be placed before LOOP. */
977 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
979 nb_bytes = builtin->size;
980 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
981 false, GSI_CONTINUE_LINKING);
982 mem = builtin->dst_base;
983 mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
984 false, GSI_CONTINUE_LINKING);
986 /* This exactly matches the pattern recognition in classify_partition. */
987 val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
988 /* Handle constants like 0x15151515 and similarly
989 floating point constants etc. where all bytes are the same. */
990 int bytev = const_with_all_bytes_same (val);
991 if (bytev != -1)
992 val = build_int_cst (integer_type_node, bytev);
993 else if (TREE_CODE (val) == INTEGER_CST)
994 val = fold_convert (integer_type_node, val);
995 else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
997 tree tem = make_ssa_name (integer_type_node);
998 gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
999 gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1000 val = tem;
1003 fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1004 fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1005 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1007 if (dump_file && (dump_flags & TDF_DETAILS))
1009 fprintf (dump_file, "generated memset");
1010 if (bytev == 0)
1011 fprintf (dump_file, " zero\n");
1012 else
1013 fprintf (dump_file, "\n");
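/* Editor's sketch (not from the original sources): for a partition
   reduced to

     for (i = 0; i < n; i++)
       a[i] = 0;

   with A an array of 4-byte integers, BUILTIN->dst_base is &a[0] and
   BUILTIN->size is n * 4, so the code above materializes those values
   in the preheader and emits

     __builtin_memset (&a[0], 0, n * 4);

   while the now-redundant loop is destroyed later (see destroy_loop).  */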
1017 /* Generate a call to memcpy for PARTITION in LOOP. */
1019 static void
1020 generate_memcpy_builtin (struct loop *loop, partition *partition)
1022 gimple_stmt_iterator gsi;
1023 gimple *fn_call;
1024 tree dest, src, fn, nb_bytes;
1025 enum built_in_function kind;
1026 struct builtin_info *builtin = partition->builtin;
1028 /* The new statements will be placed before LOOP. */
1029 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1031 nb_bytes = builtin->size;
1032 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1033 false, GSI_CONTINUE_LINKING);
1034 dest = builtin->dst_base;
1035 src = builtin->src_base;
1036 if (partition->kind == PKIND_MEMCPY
1037 || ! ptr_derefs_may_alias_p (dest, src))
1038 kind = BUILT_IN_MEMCPY;
1039 else
1040 kind = BUILT_IN_MEMMOVE;
1042 dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1043 false, GSI_CONTINUE_LINKING);
1044 src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1045 false, GSI_CONTINUE_LINKING);
1046 fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1047 fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1048 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1050 if (dump_file && (dump_flags & TDF_DETAILS))
1052 if (kind == BUILT_IN_MEMCPY)
1053 fprintf (dump_file, "generated memcpy\n");
1054 else
1055 fprintf (dump_file, "generated memmove\n");
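/* Editor's sketch (not from the original sources): a partition reduced
   to

     for (i = 0; i < n; i++)
       a[i] = b[i];

   becomes __builtin_memcpy (a, b, n * elt_size) when the partition was
   classified PKIND_MEMCPY or the two base pointers provably cannot
   alias; otherwise the safer __builtin_memmove is used, as the test on
   ptr_derefs_may_alias_p above shows.  */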
1059 /* Remove and destroy the loop LOOP. */
1061 static void
1062 destroy_loop (struct loop *loop)
1064 unsigned nbbs = loop->num_nodes;
1065 edge exit = single_exit (loop);
1066 basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1067 basic_block *bbs;
1068 unsigned i;
1070 bbs = get_loop_body_in_dom_order (loop);
1072 redirect_edge_pred (exit, src);
1073 exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1074 exit->flags |= EDGE_FALLTHRU;
1075 cancel_loop_tree (loop);
1076 rescan_loop_exit (exit, false, true);
1078 i = nbbs;
1081 /* We have made sure not to leave any dangling uses of SSA
1082 names defined in the loop, with the exception of virtuals.
1083 Make sure we replace all uses of virtual defs that will remain
1084 outside of the loop with the bare symbol, as delete_basic_block
1085 will release them. */
1086 --i;
1087 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1088 gsi_next (&gsi))
1090 gphi *phi = gsi.phi ();
1091 if (virtual_operand_p (gimple_phi_result (phi)))
1092 mark_virtual_phi_result_for_renaming (phi);
1094 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
1095 gsi_next (&gsi))
1097 gimple *stmt = gsi_stmt (gsi);
1098 tree vdef = gimple_vdef (stmt);
1099 if (vdef && TREE_CODE (vdef) == SSA_NAME)
1100 mark_virtual_operand_for_renaming (vdef);
1102 delete_basic_block (bbs[i]);
1104 while (i != 0);
1106 free (bbs);
1108 set_immediate_dominator (CDI_DOMINATORS, dest,
1109 recompute_dominator (CDI_DOMINATORS, dest));
1112 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
1114 static bool
1115 generate_code_for_partition (struct loop *loop,
1116 partition *partition, bool copy_p)
1118 switch (partition->kind)
1120 case PKIND_NORMAL:
1121 /* Reductions all have to be in the last partition. */
1122 gcc_assert (!partition_reduction_p (partition)
1123 || !copy_p);
1124 generate_loops_for_partition (loop, partition, copy_p);
1125 return false;
1127 case PKIND_MEMSET:
1128 generate_memset_builtin (loop, partition);
1129 break;
1131 case PKIND_MEMCPY:
1132 case PKIND_MEMMOVE:
1133 generate_memcpy_builtin (loop, partition);
1134 break;
1136 default:
1137 gcc_unreachable ();
1140 /* Common tail for partitions we turn into a call. If this was the last
1141 partition for which we generate code, we have to destroy the loop. */
1142 if (!copy_p)
1143 return true;
1144 return false;
1147 /* Return the data dependence relation for data references A and B. The
1148 two data references must be in lexicographic order with respect to the
1149 reduced dependence graph RDG. We first try to find the ddr in the global
1150 ddr hash table. If it doesn't exist, compute the ddr and cache it. */
1152 static data_dependence_relation *
1153 get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
1155 struct data_dependence_relation ent, **slot;
1156 struct data_dependence_relation *ddr;
1158 gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1159 gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1160 <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1161 ent.a = a;
1162 ent.b = b;
1163 slot = ddrs_table->find_slot (&ent, INSERT);
1164 if (*slot == NULL)
1166 ddr = initialize_data_dependence_relation (a, b, loop_nest);
1167 compute_affine_dependence (ddr, loop_nest[0]);
1168 *slot = ddr;
1171 return *slot;
1174 /* In the reduced dependence graph RDG for loop distribution, return true
1175 if the dependence between references DR1 and DR2 leads to a dependence
1176 cycle and such a cycle can't be resolved by a runtime alias check. */
1178 static bool
1179 data_dep_in_cycle_p (struct graph *rdg,
1180 data_reference_p dr1, data_reference_p dr2)
1182 struct data_dependence_relation *ddr;
1184 /* Re-shuffle data-refs to be in topological order. */
1185 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1186 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1187 std::swap (dr1, dr2);
1189 ddr = get_data_dependence (rdg, dr1, dr2);
1191 /* In case of no data dependence. */
1192 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1193 return false;
1194 /* For an unknown data dependence or a known data dependence which can't
1195 be expressed as a classic distance vector, we check if it can be resolved
1196 by a runtime alias check. If yes, we consider the data dependence as
1197 one that won't introduce a data dependence cycle. */
1198 else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1199 || DDR_NUM_DIST_VECTS (ddr) == 0)
1200 return !runtime_alias_check_p (ddr, NULL, true);
1201 else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1202 return true;
1203 else if (DDR_REVERSED_P (ddr)
1204 || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1205 return false;
1207 return true;
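/* Editor's worked examples (not from the original sources): for
   "a[i] = a[i]" the load and store overlap exactly, the single distance
   vector is zero, and the function returns false: statement order
   within one iteration resolves the dependence. For "a[i] = a[i-1] + 1"
   the distance vector is non-zero and the dependence is carried across
   iterations, so the function returns true and the partition must stay
   sequential.  */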
1210 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
1211 PARTITION1's type after merging PARTITION2 into PARTITION1. */
1213 static void
1214 update_type_for_merge (struct graph *rdg,
1215 partition *partition1, partition *partition2)
1217 unsigned i, j;
1218 bitmap_iterator bi, bj;
1219 data_reference_p dr1, dr2;
1221 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1223 unsigned start = (partition1 == partition2) ? i + 1 : 0;
1225 dr1 = datarefs_vec[i];
1226 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1228 dr2 = datarefs_vec[j];
1229 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1230 continue;
1232 /* Partition can only be executed sequentially if there is any
1233 data dependence cycle. */
1234 if (data_dep_in_cycle_p (rdg, dr1, dr2))
1236 partition1->type = PTYPE_SEQUENTIAL;
1237 return;
1243 /* Returns a partition with all the statements needed for computing
1244 the vertex V of the RDG, also including the loop exit conditions. */
1246 static partition *
1247 build_rdg_partition_for_vertex (struct graph *rdg, int v)
1249 partition *partition = partition_alloc ();
1250 auto_vec<int, 3> nodes;
1251 unsigned i, j;
1252 int x;
1253 data_reference_p dr;
1255 graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1257 FOR_EACH_VEC_ELT (nodes, i, x)
1259 bitmap_set_bit (partition->stmts, x);
1261 for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1263 unsigned idx = (unsigned) DR_INDEX (dr);
1264 gcc_assert (idx < datarefs_vec.length ());
1266 /* Partition can only be executed sequentially if there is any
1267 unknown data reference. */
1268 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1269 || !DR_INIT (dr) || !DR_STEP (dr))
1270 partition->type = PTYPE_SEQUENTIAL;
1272 bitmap_set_bit (partition->datarefs, idx);
1276 if (partition->type == PTYPE_SEQUENTIAL)
1277 return partition;
1279 /* Further check if any data dependence prevents us from executing the
1280 partition in parallel. */
1281 update_type_for_merge (rdg, partition, partition);
1283 return partition;
1286 /* Given PARTITION of RDG, record the single load/store data references
1287 for a builtin partition in SRC_DR/DST_DR; return false if there are no
1288 such data references. */
1290 static bool
1291 find_single_drs (struct graph *rdg, partition *partition,
1292 data_reference_p *dst_dr, data_reference_p *src_dr)
1294 unsigned i;
1295 data_reference_p single_ld = NULL, single_st = NULL;
1296 bitmap_iterator bi;
1298 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1300 gimple *stmt = RDG_STMT (rdg, i);
1301 data_reference_p dr;
1303 if (gimple_code (stmt) == GIMPLE_PHI)
1304 continue;
1306 /* Any scalar stmts are ok. */
1307 if (!gimple_vuse (stmt))
1308 continue;
1310 /* Otherwise just regular loads/stores. */
1311 if (!gimple_assign_single_p (stmt))
1312 return false;
1314 /* But exactly one store and/or load. */
1315 for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1317 tree type = TREE_TYPE (DR_REF (dr));
1319 /* The memset, memcpy and memmove library calls are only
1320 able to deal with generic address space. */
1321 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1322 return false;
1324 if (DR_IS_READ (dr))
1326 if (single_ld != NULL)
1327 return false;
1328 single_ld = dr;
1330 else
1332 if (single_st != NULL)
1333 return false;
1334 single_st = dr;
1339 if (!single_st)
1340 return false;
1342 /* Bail out if this is a bitfield memory reference. */
1343 if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1344 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1345 return false;
1347 /* Data reference must be executed exactly once per iteration. */
1348 basic_block bb_st = gimple_bb (DR_STMT (single_st));
1349 struct loop *inner = bb_st->loop_father;
1350 if (!dominated_by_p (CDI_DOMINATORS, inner->latch, bb_st))
1351 return false;
1353 if (single_ld)
1355 gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1356 /* Direct aggregate copy or via an SSA name temporary. */
1357 if (load != store
1358 && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1359 return false;
1361 /* Bail out if this is a bitfield memory reference. */
1362 if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1363 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1364 return false;
1366 /* Load and store must be in the same loop nest. */
1367 basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1368 if (inner != bb_ld->loop_father)
1369 return false;
1371 /* Data reference must be executed exactly once per iteration. */
1372 if (!dominated_by_p (CDI_DOMINATORS, inner->latch, bb_ld))
1373 return false;
1375 edge e = single_exit (inner);
1376 bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1377 bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1378 if (dom_ld != dom_st)
1379 return false;
1382 *src_dr = single_ld;
1383 *dst_dr = single_st;
1384 return true;
1387 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1388 loops from inner to outer to see if the loop's step equals the access size
1389 at each level of the nest. Return true if so; record the access base and
1390 size in BASE and SIZE; save the loop's step at each level of the nest in
1391 STEPS if it is not null. For example:
1393 int arr[100][100][100];
1394 for (i = 0; i < 100; i++) ;steps[2] = 40000
1395 for (j = 100; j > 0; j--) ;steps[1] = -400
1396 for (k = 0; k < 100; k++) ;steps[0] = 4
1397 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000. */
1399 static bool
1400 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1401 tree *size, vec<tree> *steps = NULL)
1403 location_t loc = gimple_location (DR_STMT (dr));
1404 basic_block bb = gimple_bb (DR_STMT (dr));
1405 struct loop *loop = bb->loop_father;
1406 tree ref = DR_REF (dr);
1407 tree access_base = build_fold_addr_expr (ref);
1408 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1410 do {
1411 tree scev_fn = analyze_scalar_evolution (loop, access_base);
1412 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1413 return false;
1415 access_base = CHREC_LEFT (scev_fn);
1416 if (tree_contains_chrecs (access_base, NULL))
1417 return false;
1419 tree scev_step = CHREC_RIGHT (scev_fn);
1420 /* Only support constant steps. */
1421 if (TREE_CODE (scev_step) != INTEGER_CST)
1422 return false;
1424 enum ev_direction access_dir = scev_direction (scev_fn);
1425 if (access_dir == EV_DIR_UNKNOWN)
1426 return false;
1428 if (steps != NULL)
1429 steps->safe_push (scev_step);
1431 scev_step = fold_convert_loc (loc, sizetype, scev_step);
1432 /* Compute absolute value of scev step. */
1433 if (access_dir == EV_DIR_DECREASES)
1434 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1436 /* At each level of loop, scev step must equal to access size. In other
1437 words, DR must access consecutive memory between loop iterations. */
1438 if (!operand_equal_p (scev_step, access_size, 0))
1439 return false;
1441 /* Compute DR's execution times in loop. */
1442 tree niters = number_of_latch_executions (loop);
1443 niters = fold_convert_loc (loc, sizetype, niters);
1444 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1445 niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1447 /* Compute DR's overall access size in loop. */
1448 access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1449 niters, scev_step);
1450 /* Adjust base address in case of negative step. */
1451 if (access_dir == EV_DIR_DECREASES)
1453 tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1454 scev_step, access_size);
1455 access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1457 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1459 *base = access_base;
1460 *size = access_size;
1461 return true;
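/* Editor's worked numbers for the ARR example in the comment above: at
   the innermost level the step is 4, equal to the 4-byte access size;
   after 100 iterations access_size becomes 400. At the middle level
   the step is -400, whose absolute value matches 400; access_size
   grows to 40000 and access_base is adjusted for the decreasing
   direction. At the outermost level the step is 40000 and access_size
   ends up 100 * 40000 = 4000000, giving base = &arr and
   size = 4000000 as stated.  */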
1464 /* Allocate and return builtin struct. Record information like DST_DR,
1465 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1467 static struct builtin_info *
1468 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1469 tree dst_base, tree src_base, tree size)
1471 struct builtin_info *builtin = XNEW (struct builtin_info);
1472 builtin->dst_dr = dst_dr;
1473 builtin->src_dr = src_dr;
1474 builtin->dst_base = dst_base;
1475 builtin->src_base = src_base;
1476 builtin->size = size;
1477 return builtin;
1480 /* Given data reference DR in loop nest LOOP, classify whether it forms a
1481 builtin memset call. */
1483 static void
1484 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1486 gimple *stmt = DR_STMT (dr);
1487 tree base, size, rhs = gimple_assign_rhs1 (stmt);
1489 if (const_with_all_bytes_same (rhs) == -1
1490 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1491 || (TYPE_MODE (TREE_TYPE (rhs))
1492 != TYPE_MODE (unsigned_char_type_node))))
1493 return;
1495 if (TREE_CODE (rhs) == SSA_NAME
1496 && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1497 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1498 return;
1500 if (!compute_access_range (loop, dr, &base, &size))
1501 return;
1503 partition->builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1504 partition->kind = PKIND_MEMSET;
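/* Editor's examples (not from the original sources): "a[i] = 0" and
   "c[i] = 'x'" (C a char array) qualify: the stored value has the same
   byte in all positions or is an invariant char-mode value, and the
   access range over the loop nest is one contiguous block. By contrast
   "a[i] = i" is rejected because the rhs is defined inside the loop,
   and "a[2*i] = 0" on an int array is rejected by compute_access_range
   since the step (8) differs from the access size (4).  */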
1507 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
1508 classify whether they form a builtin memcpy or memmove call. */
1510 static void
1511 classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
1512 data_reference_p dst_dr, data_reference_p src_dr)
1514 tree base, size, src_base, src_size;
1515 auto_vec<tree> dst_steps, src_steps;
1517 /* Compute the access range of both load and store. They must have the
1518 same access size. */
1519 if (!compute_access_range (loop, dst_dr, &base, &size, &dst_steps)
1520 || !compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps)
1521 || !operand_equal_p (size, src_size, 0))
1522 return;
1524 /* Load and store in the loop nest must access memory in the same way,
1525 i.e., they must have the same steps in each loop of the nest. */
1526 if (dst_steps.length () != src_steps.length ())
1527 return;
1528 for (unsigned i = 0; i < dst_steps.length (); ++i)
1529 if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1530 return;
1532 /* Now check whether there is a dependence. */
1533 ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1535 /* Classify as memcpy if no dependence between load and store. */
1536 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1538 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1539 partition->kind = PKIND_MEMCPY;
1540 return;
1543 /* Can't do memmove in case of an unknown dependence or a dependence
1544 without a classical distance vector. */
1545 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1546 || DDR_NUM_DIST_VECTS (ddr) == 0)
1547 return;
1549 unsigned i;
1550 lambda_vector dist_v;
1551 int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1552 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1554 unsigned dep_lev = dependence_level (dist_v, num_lev);
1555 /* Can't do memmove if load depends on store. */
1556 if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1557 return;
1560 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1561 partition->kind = PKIND_MEMMOVE;
1562 return;
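/* Editor's examples (not from the original sources): "a[i] = b[i]" on
   provably distinct arrays yields chrec_known above and is classified
   PKIND_MEMCPY. A self-copy like "a[i] = a[i-1]" makes the load depend
   on the previous iteration's store (a positive distance at some
   dependence level), so it is rejected, while overlaps in which the
   load never reads bytes already written by the loop can still be
   classified PKIND_MEMMOVE.  */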
1565 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1566 For the moment we detect memset, memcpy and memmove patterns. Bitmap
1567 STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */
1569 static void
1570 classify_partition (loop_p loop, struct graph *rdg, partition *partition,
1571 bitmap stmt_in_all_partitions)
1573 bitmap_iterator bi;
1574 unsigned i;
1575 data_reference_p single_ld = NULL, single_st = NULL;
1576 bool volatiles_p = false, has_reduction = false;
1578 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1580 gimple *stmt = RDG_STMT (rdg, i);
1582 if (gimple_has_volatile_ops (stmt))
1583 volatiles_p = true;
1585 /* If the stmt is not included in all partitions and has uses outside
1586 of the loop, then mark the partition as a reduction. */
1587 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1589 /* Due to a limitation in the transform phase we have to fuse all
1590 reduction partitions. As a result, this could cancel valid
1591 loop distribution, especially for loops whose induction variable
1592 is used outside of the loop. To work around this issue, we skip
1593 marking the partition as a reduction if the reduction stmt belongs
1594 to all partitions. In that case, the reduction will be computed
1595 correctly no matter how partitions are fused/distributed. */
1596 if (!bitmap_bit_p (stmt_in_all_partitions, i))
1598 partition->reduction_p = true;
1599 return;
1601 has_reduction = true;
1605 /* Perform general partition disqualification for builtins. */
1606 if (volatiles_p
1607 /* Simple workaround to prevent classifying the partition as builtin
1608 if it contains any use outside of loop. */
1609 || has_reduction
1610 || !flag_tree_loop_distribute_patterns)
1611 return;
1613 /* Find single load/store data references for builtin partition. */
1614 if (!find_single_drs (rdg, partition, &single_st, &single_ld))
1615 return;
1617 /* Classify the builtin kind. */
1618 if (single_ld == NULL)
1619 classify_builtin_st (loop, partition, single_st);
1620 else
1621 classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1624 /* Returns true when PARTITION1 and PARTITION2 access the same memory
1625 object in RDG. */
1627 static bool
1628 share_memory_accesses (struct graph *rdg,
1629 partition *partition1, partition *partition2)
1631 unsigned i, j;
1632 bitmap_iterator bi, bj;
1633 data_reference_p dr1, dr2;
1635 /* First check whether the intersection of the two partitions contains
1636 any loads or stores. Common loads are the situation that happens
1637 most often. */
1638 EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1639 if (RDG_MEM_WRITE_STMT (rdg, i)
1640 || RDG_MEM_READS_STMT (rdg, i))
1641 return true;
1643 /* Then check whether the two partitions access the same memory object. */
1644 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1646 dr1 = datarefs_vec[i];
1648 if (!DR_BASE_ADDRESS (dr1)
1649 || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1650 continue;
1652 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1654 dr2 = datarefs_vec[j];
1656 if (!DR_BASE_ADDRESS (dr2)
1657 || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1658 continue;
1660 if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1661 && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1662 && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1663 && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1664 return true;
1668 return false;
1671 /* For each seed statement in STARTING_STMTS, this function builds a
1672 partition for it by adding dependent statements according to RDG.
1673 All partitions are recorded in PARTITIONS. */
1675 static void
1676 rdg_build_partitions (struct graph *rdg,
1677 vec<gimple *> starting_stmts,
1678 vec<partition *> *partitions)
1680 auto_bitmap processed;
1681 int i;
1682 gimple *stmt;
1684 FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1686 int v = rdg_vertex_for_stmt (rdg, stmt);
1688 if (dump_file && (dump_flags & TDF_DETAILS))
1689 fprintf (dump_file,
1690 "ldist asked to generate code for vertex %d\n", v);
1692 /* If the vertex is already contained in another partition so
1693 is the partition rooted at it. */
1694 if (bitmap_bit_p (processed, v))
1695 continue;
1697 partition *partition = build_rdg_partition_for_vertex (rdg, v);
1698 bitmap_ior_into (processed, partition->stmts);
1700 if (dump_file && (dump_flags & TDF_DETAILS))
1702 fprintf (dump_file, "ldist creates useful %s partition:\n",
1703 partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1704 bitmap_print (dump_file, partition->stmts, " ", "\n");
1707 partitions->safe_push (partition);
1710 /* All vertices should have been assigned to at least one partition now,
1711 other than vertices belonging to dead code. */
1714 /* Dump to FILE the PARTITIONS. */
1716 static void
1717 dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1719 int i;
1720 partition *partition;
1722 FOR_EACH_VEC_ELT (partitions, i, partition)
1723 debug_bitmap_file (file, partition->stmts);
1726 /* Debug PARTITIONS. */
1727 extern void debug_rdg_partitions (vec<partition *> );
1729 DEBUG_FUNCTION void
1730 debug_rdg_partitions (vec<partition *> partitions)
1732 dump_rdg_partitions (stderr, partitions);
1735 /* Returns the number of read and write operations in the RDG. */
1737 static int
1738 number_of_rw_in_rdg (struct graph *rdg)
1740 int i, res = 0;
1742 for (i = 0; i < rdg->n_vertices; i++)
1744 if (RDG_MEM_WRITE_STMT (rdg, i))
1745 ++res;
1747 if (RDG_MEM_READS_STMT (rdg, i))
1748 ++res;
1751 return res;
1754 /* Returns the number of read and write operations in a PARTITION of
1755 the RDG. */
1757 static int
1758 number_of_rw_in_partition (struct graph *rdg, partition *partition)
1760 int res = 0;
1761 unsigned i;
1762 bitmap_iterator ii;
1764 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
1766 if (RDG_MEM_WRITE_STMT (rdg, i))
1767 ++res;
1769 if (RDG_MEM_READS_STMT (rdg, i))
1770 ++res;
1773 return res;
1776 /* Returns true when one of the PARTITIONS contains all the read or
1777 write operations of RDG. */
1779 static bool
1780 partition_contains_all_rw (struct graph *rdg,
1781 vec<partition *> partitions)
1783 int i;
1784 partition *partition;
1785 int nrw = number_of_rw_in_rdg (rdg);
1787 FOR_EACH_VEC_ELT (partitions, i, partition)
1788 if (nrw == number_of_rw_in_partition (rdg, partition))
1789 return true;
1791 return false;
1794 /* Compute the partition dependence created by the data references in
1795 DRS1 and DRS2, and modify and return DIR according to that. If
1796 ALIAS_DDRS is not NULL, we record dependences introduced by possible
1797 aliasing between two data references in ALIAS_DDRS; otherwise, we
1798 simply ignore such dependences as if they didn't exist at all. */
1800 static int
1801 pg_add_dependence_edges (struct graph *rdg, int dir,
1802 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
1804 unsigned i, j;
1805 bitmap_iterator bi, bj;
1806 data_reference_p dr1, dr2, saved_dr1;
1808 /* dependence direction - 0 is no dependence, -1 is back,
1809 1 is forth, 2 is both (we can stop then, merging will occur). */
1810 EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
1812 dr1 = datarefs_vec[i];
1814 EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
1816 int res, this_dir = 1;
1817 ddr_p ddr;
1819 dr2 = datarefs_vec[j];
1821 /* Skip all <read, read> data dependence. */
1822 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1823 continue;
1825 saved_dr1 = dr1;
1826 /* Re-shuffle data-refs to be in topological order. */
1827 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1828 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1830 std::swap (dr1, dr2);
1831 this_dir = -this_dir;
1833 ddr = get_data_dependence (rdg, dr1, dr2);
1834 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1836 this_dir = 0;
1837 res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
1838 DR_BASE_ADDRESS (dr2));
1839 /* Be conservative. If the data references are not well analyzed,
1840 or the two data references have the same base address and
1841 offset, add the dependence and consider the references as
1842 aliasing each other. In other words, the dependence cannot be
1843 resolved by a runtime alias check. */
1844 if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
1845 || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
1846 || !DR_INIT (dr1) || !DR_INIT (dr2)
1847 || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
1848 || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
1849 || res == 0)
1850 this_dir = 2;
1851 /* Data dependence could be resolved by runtime alias check,
1852 record it in ALIAS_DDRS. */
1853 else if (alias_ddrs != NULL)
1854 alias_ddrs->safe_push (ddr);
1855 /* Or simply ignore it. */
1857 else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
1859 if (DDR_REVERSED_P (ddr))
1860 this_dir = -this_dir;
1862 /* Known dependences can still be unordered throughout the
1863 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */
1864 if (DDR_NUM_DIST_VECTS (ddr) != 1)
1865 this_dir = 2;
1866 /* If the overlap is exact preserve stmt order. */
1867 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1869 /* Else, as the distance vector is lexicographically positive, swap
1870 the dependence direction. */
1871 else
1872 this_dir = -this_dir;
1874 else
1875 this_dir = 0;
1876 if (this_dir == 2)
1877 return 2;
1878 else if (dir == 0)
1879 dir = this_dir;
1880 else if (this_dir != 0 && dir != this_dir)
1881 return 2;
1882 /* Shuffle "back" dr1. */
1883 dr1 = saved_dr1;
1886 return dir;
1889 /* Compare postorder number of the partition graph vertices V1 and V2. */
1891 static int
1892 pgcmp (const void *v1_, const void *v2_)
1894 const vertex *v1 = (const vertex *)v1_;
1895 const vertex *v2 = (const vertex *)v2_;
1896 return v2->post - v1->post;
1899 /* Data attached to vertices of partition dependence graph. */
1900 struct pg_vdata
1902 /* ID of the corresponding partition. */
1903 int id;
1904 /* The partition. */
1905 struct partition *partition;
1908 /* Data attached to edges of partition dependence graph. */
1909 struct pg_edata
1911 /* If the dependence edge can be resolved by runtime alias check,
1912 this vector contains data dependence relations for runtime alias
1913 check. On the other hand, if the dependence edge is introduced
1914 because of compilation time known data dependence, this vector
1915 contains nothing. */
1916 vec<ddr_p> alias_ddrs;
1919 /* Callback data for traversing edges in graph. */
1920 struct pg_edge_callback_data
1922 /* Bitmap of the strongly connected components that should be merged. */
1923 bitmap sccs_to_merge;
1924 /* Array containing component information for all vertices. */
1925 int *vertices_component;
1926 /* Vector to record all data dependence relations which are needed
1927 to break strongly connected components by runtime alias checks. */
1928 vec<ddr_p> *alias_ddrs;
1931 /* Initialize vertices' data for partition dependence graph PG with
1932 PARTITIONS. */
1934 static void
1935 init_partition_graph_vertices (struct graph *pg,
1936 vec<struct partition *> *partitions)
1938 int i;
1939 partition *partition;
1940 struct pg_vdata *data;
1942 for (i = 0; partitions->iterate (i, &partition); ++i)
1944 data = new pg_vdata;
1945 pg->vertices[i].data = data;
1946 data->id = i;
1947 data->partition = partition;
1951 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
1952 dependence relations to the EDGE if DDRS isn't NULL. */
1954 static void
1955 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
1957 struct graph_edge *e = add_edge (pg, i, j);
1959 /* If the edge has data dependence relations attached, it means this
1960 dependence edge can be resolved by runtime alias checks. */
1961 if (ddrs != NULL)
1963 struct pg_edata *data = new pg_edata;
1965 gcc_assert (ddrs->length () > 0);
1966 e->data = data;
1967 data->alias_ddrs = vNULL;
1968 data->alias_ddrs.safe_splice (*ddrs);
1972 /* Callback function for the graph traversal algorithm. It returns
1973 true if edge E should be skipped when traversing the graph. */
1975 static bool
1976 pg_skip_alias_edge (struct graph_edge *e)
1978 struct pg_edata *data = (struct pg_edata *)e->data;
1979 return (data != NULL && data->alias_ddrs.length () > 0);
1982 /* Callback function freeing data attached to edge E of graph. */
1984 static void
1985 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
1987 if (e->data != NULL)
1989 struct pg_edata *data = (struct pg_edata *)e->data;
1990 data->alias_ddrs.release ();
1991 delete data;
1995 /* Free data attached to vertices of partition dependence graph PG. */
1997 static void
1998 free_partition_graph_vdata (struct graph *pg)
2000 int i;
2001 struct pg_vdata *data;
2003 for (i = 0; i < pg->n_vertices; ++i)
2005 data = (struct pg_vdata *)pg->vertices[i].data;
2006 delete data;
2010 /* Build and return the partition dependence graph for PARTITIONS. RDG
2011 is the reduced dependence graph for the loop to be distributed. If
2012 IGNORE_ALIAS_P is true, data dependence caused by possible alias
2013 between references is ignored, as if it doesn't exist at all;
2014 otherwise all dependences are considered. */
2016 static struct graph *
2017 build_partition_graph (struct graph *rdg,
2018 vec<struct partition *> *partitions,
2019 bool ignore_alias_p)
2021 int i, j;
2022 struct partition *partition1, *partition2;
2023 graph *pg = new_graph (partitions->length ());
2024 auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2026 alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2028 init_partition_graph_vertices (pg, partitions);
2030 for (i = 0; partitions->iterate (i, &partition1); ++i)
2032 for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2034 /* Dependence direction: 0 is no dependence, -1 is backward,
2035 1 is forward, 2 is both (we can stop then, merging will occur). */
2036 int dir = 0;
2038 /* If the first partition has a reduction, add a backward edge; if
2039 the second partition has a reduction, add a forward edge. This
2040 makes sure that the reduction partition is sorted as the last one. */
2041 if (partition_reduction_p (partition1))
2042 dir = -1;
2043 else if (partition_reduction_p (partition2))
2044 dir = 1;
2046 /* Clean up the temporary vector. */
2047 alias_ddrs.truncate (0);
2049 dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2050 partition2->datarefs, alias_ddrs_p);
2052 /* Add an edge to the partition graph if a dependence exists. There
2053 are two types of edges. One type is caused by a compilation
2054 time known dependence and cannot be resolved by a runtime
2055 alias check. The other type can be resolved by a runtime alias
2056 check. */
2057 if (dir == 1 || dir == 2
2058 || alias_ddrs.length () > 0)
2060 /* Attach data dependence relations to the edge if the dependence
2061 can be resolved by a runtime alias check. */
2062 bool alias_edge_p = (dir != 1 && dir != 2);
2063 add_partition_graph_edge (pg, i, j,
2064 (alias_edge_p) ? &alias_ddrs : NULL);
2066 if (dir == -1 || dir == 2
2067 || alias_ddrs.length () > 0)
2069 /* Attach data dependence relations to the edge if the dependence
2070 can be resolved by a runtime alias check. */
2071 bool alias_edge_p = (dir != -1 && dir != 2);
2072 add_partition_graph_edge (pg, j, i,
2073 (alias_edge_p) ? &alias_ddrs : NULL);
2077 return pg;
2080 /* Sort partitions in PG in descending post order and store them in
2081 PARTITIONS. */
2083 static void
2084 sort_partitions_by_post_order (struct graph *pg,
2085 vec<struct partition *> *partitions)
2087 int i;
2088 struct pg_vdata *data;
2090 /* Now order the remaining nodes in descending postorder. */
2091 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2092 partitions->truncate (0);
2093 for (i = 0; i < pg->n_vertices; ++i)
2095 data = (struct pg_vdata *)pg->vertices[i].data;
2096 if (data->partition)
2097 partitions->safe_push (data->partition);
2101 /* Given the reduced dependence graph RDG, merge strongly connected
2102 components of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence
2103 caused by possible alias between references is ignored, as if it
2104 doesn't exist at all; otherwise all dependences are considered. */
2106 static void
2107 merge_dep_scc_partitions (struct graph *rdg,
2108 vec<struct partition *> *partitions,
2109 bool ignore_alias_p)
2111 struct partition *partition1, *partition2;
2112 struct pg_vdata *data;
2113 graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2114 int i, j, num_sccs = graphds_scc (pg, NULL);
2116 /* A strongly connected component indicates a dependence cycle, which
2117 we cannot distribute. So fuse its partitions together. */
2118 if ((unsigned) num_sccs < partitions->length ())
2120 for (i = 0; i < num_sccs; ++i)
2122 for (j = 0; partitions->iterate (j, &partition1); ++j)
2123 if (pg->vertices[j].component == i)
2124 break;
2125 for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2126 if (pg->vertices[j].component == i)
2128 partition_merge_into (NULL, partition1,
2129 partition2, FUSE_SAME_SCC);
2130 partition1->type = PTYPE_SEQUENTIAL;
2131 (*partitions)[j] = NULL;
2132 partition_free (partition2);
2133 data = (struct pg_vdata *)pg->vertices[j].data;
2134 data->partition = NULL;
2139 sort_partitions_by_post_order (pg, partitions);
2140 gcc_assert (partitions->length () == (unsigned)num_sccs);
2141 free_partition_graph_vdata (pg);
2142 free_graph (pg);
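/* A hypothetical example of a dependence cycle, assuming the two
   statements were seeded into different partitions:

   |for (i = 1; i < n; ++i)
   |  {
   |    x[i] = y[i - 1];   <- partition 1
   |    y[i] = x[i - 1];   <- partition 2
   |  }

   Each partition consumes a value the other produced in the previous
   iteration, so the two vertices form one SCC in the partition graph
   and the partitions are fused into a single sequential one.  */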
2145 /* Callback function for traversing edge E in graph G. DATA is private
2146 callback data. */
2148 static void
2149 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2151 int i, j, component;
2152 struct pg_edge_callback_data *cbdata;
2153 struct pg_edata *edata = (struct pg_edata *) e->data;
2155 /* If the edge has no data dependence attached, it represents
2156 compilation time known dependences. This type of dependence
2157 cannot be resolved by runtime alias checks. */
2158 if (edata == NULL || edata->alias_ddrs.length () == 0)
2159 return;
2161 cbdata = (struct pg_edge_callback_data *) data;
2162 i = e->src;
2163 j = e->dest;
2164 component = cbdata->vertices_component[i];
2165 /* Vertices are topologically sorted according to compilation time
2166 known dependences, so we can break strongly connected components
2167 by removing edges of the opposite direction, i.e., edges pointing
2168 from a vertex with a smaller post number to a vertex with a bigger
2169 post number. */
2170 if (g->vertices[i].post < g->vertices[j].post
2171 /* We only need to remove edges connecting vertices in the same
2172 strongly connected component to break it. */
2173 && component == cbdata->vertices_component[j]
2174 /* Check if we want to break the strong connected component or not. */
2175 && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2176 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2179 /* This is the main function breaking strongly connected components
2180 in PARTITIONS, given the reduced dependence graph RDG. Store data
2181 dependence relations for runtime alias checks in ALIAS_DDRS. */
2183 static void
2184 break_alias_scc_partitions (struct graph *rdg,
2185 vec<struct partition *> *partitions,
2186 vec<ddr_p> *alias_ddrs)
2188 int i, j, k, num_sccs, num_sccs_no_alias;
2189 /* Build partition dependence graph. */
2190 graph *pg = build_partition_graph (rdg, partitions, false);
2192 alias_ddrs->truncate (0);
2193 /* Find strongly connected components in the graph, with all dependence
2194 edges considered. */
2195 num_sccs = graphds_scc (pg, NULL);
2196 /* All SCCs can now be broken by runtime alias checks because SCCs caused
2197 by compilation time known dependences were merged before this function. */
2198 if ((unsigned) num_sccs < partitions->length ())
2200 struct pg_edge_callback_data cbdata;
2201 auto_bitmap sccs_to_merge;
2202 auto_vec<enum partition_type> scc_types;
2203 struct partition *partition, *first;
2205 /* If all partitions in an SCC have the same type, we can simply merge
2206 the SCC. This loop finds such SCCs and records them in the bitmap. */
2207 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2208 for (i = 0; i < num_sccs; ++i)
2210 for (j = 0; partitions->iterate (j, &first); ++j)
2211 if (pg->vertices[j].component == i)
2212 break;
2213 for (++j; partitions->iterate (j, &partition); ++j)
2215 if (pg->vertices[j].component != i)
2216 continue;
2218 /* Note we merge partitions of parallel type on purpose, even though
2219 the resulting partition is sequential. The reason is that the
2220 vectorizer can do a more accurate runtime alias check in this case.
2221 It also results in a more conservative distribution. */
2222 if (first->type != partition->type)
2224 bitmap_clear_bit (sccs_to_merge, i);
2225 break;
2230 /* Initialize callback data for traversing. */
2231 cbdata.sccs_to_merge = sccs_to_merge;
2232 cbdata.alias_ddrs = alias_ddrs;
2233 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2234 /* Record the component information, which will be corrupted by the
2235 next SCC finding call on this graph. */
2236 for (i = 0; i < pg->n_vertices; ++i)
2237 cbdata.vertices_component[i] = pg->vertices[i].component;
2239 /* Collect data dependences for runtime alias checks to break SCCs. */
2240 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2242 /* Run SCC finding algorithm again, with alias dependence edges
2243 skipped. This is to topologically sort partitions according to
2244 compilation time known dependence. Note the topological order
2245 is stored in the form of pg's post order number. */
2246 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2247 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2248 /* With the topological order, we can construct two subgraphs L and R.
2249 L contains edge <x, y> where x < y in terms of post order, while
2250 R contains edge <x, y> where x > y. Edges for compilation time
2251 known dependences all fall in R, so we break SCCs by removing all
2252 (alias) edges in subgraph L. */
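/* For instance (hypothetical post numbers): if vertices v0 and v1
   end up with post numbers 1 and 2, a compilation time known edge
   must point v1 -> v0 and falls in R, while an alias edge v0 -> v1
   falls in L; collecting the latter's ddrs for a runtime check is
   enough to break the SCC.  */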
2253 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2256 /* For each SCC that doesn't need to be broken, merge it. */
2257 for (i = 0; i < num_sccs; ++i)
2259 if (!bitmap_bit_p (sccs_to_merge, i))
2260 continue;
2262 for (j = 0; partitions->iterate (j, &first); ++j)
2263 if (cbdata.vertices_component[j] == i)
2264 break;
2265 for (k = j + 1; partitions->iterate (k, &partition); ++k)
2267 struct pg_vdata *data;
2269 if (cbdata.vertices_component[k] != i)
2270 continue;
2272 /* Update postorder number so that merged reduction partition is
2273 sorted after other partitions. */
2274 if (!partition_reduction_p (first)
2275 && partition_reduction_p (partition))
2277 gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
2278 pg->vertices[j].post = pg->vertices[k].post;
2280 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2281 (*partitions)[k] = NULL;
2282 partition_free (partition);
2283 data = (struct pg_vdata *)pg->vertices[k].data;
2284 gcc_assert (data->id == k);
2285 data->partition = NULL;
2286 /* The resulting partition of the merged SCC must be sequential. */
2287 first->type = PTYPE_SEQUENTIAL;
2292 sort_partitions_by_post_order (pg, partitions);
2293 free_partition_graph_vdata (pg);
2294 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2295 free_graph (pg);
2297 if (dump_file && (dump_flags & TDF_DETAILS))
2299 fprintf (dump_file, "Possible alias data dependence to break:\n");
2300 dump_data_dependence_relations (dump_file, *alias_ddrs);
2304 /* Compute and return an expression whose value is the segment length which
2305 will be accessed by DR in NITERS iterations. */
2307 static tree
2308 data_ref_segment_size (struct data_reference *dr, tree niters)
2310 tree segment_length;
2312 if (integer_zerop (DR_STEP (dr)))
2313 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2314 else
2315 segment_length = size_binop (MULT_EXPR,
2316 fold_convert (sizetype, DR_STEP (dr)),
2317 fold_convert (sizetype, niters));
2319 return segment_length;
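/* For example (hypothetical reference): an int access like x[i] has
   a DR_STEP of 4 bytes, so with NITERS of 100 the segment length is
   4 * 100 = 400 bytes; a loop-invariant reference (zero step) only
   spans the size of its own type.  */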
2322 /* Return true if LOOP's latch is dominated by the statement for data
2323 reference DR. */
2325 static inline bool
2326 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2328 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2329 gimple_bb (DR_STMT (dr)));
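/* E.g. (hypothetical counts): with 99 latch executions, a reference
   executed on every path to the exit test runs 100 times, so its
   segment in compute_alias_check_pairs below must cover NITERS + 1
   iterations, while a reference placed after the exit test only runs
   99 times and needs just NITERS.  */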
2332 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2333 data dependence relations ALIAS_DDRS. */
2335 static void
2336 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2337 vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2339 unsigned int i;
2340 unsigned HOST_WIDE_INT factor = 1;
2341 tree niters_plus_one, niters = number_of_latch_executions (loop);
2343 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2344 niters = fold_convert (sizetype, niters);
2345 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2347 if (dump_file && (dump_flags & TDF_DETAILS))
2348 fprintf (dump_file, "Creating alias check pairs:\n");
2350 /* Iterate all data dependence relations and compute alias check pairs. */
2351 for (i = 0; i < alias_ddrs->length (); i++)
2353 ddr_p ddr = (*alias_ddrs)[i];
2354 struct data_reference *dr_a = DDR_A (ddr);
2355 struct data_reference *dr_b = DDR_B (ddr);
2356 tree seg_length_a, seg_length_b;
2357 int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2358 DR_BASE_ADDRESS (dr_b));
2360 if (comp_res == 0)
2361 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2362 gcc_assert (comp_res != 0);
2364 if (latch_dominated_by_data_ref (loop, dr_a))
2365 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2366 else
2367 seg_length_a = data_ref_segment_size (dr_a, niters);
2369 if (latch_dominated_by_data_ref (loop, dr_b))
2370 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2371 else
2372 seg_length_b = data_ref_segment_size (dr_b, niters);
2374 dr_with_seg_len_pair_t dr_with_seg_len_pair
2375 (dr_with_seg_len (dr_a, seg_length_a),
2376 dr_with_seg_len (dr_b, seg_length_b));
2378 /* Canonicalize pairs by sorting the two DR members. */
2379 if (comp_res > 0)
2380 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2382 comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2385 if (tree_fits_uhwi_p (niters))
2386 factor = tree_to_uhwi (niters);
2388 /* Prune alias check pairs. */
2389 prune_runtime_alias_test_list (comp_alias_pairs, factor);
2390 if (dump_file && (dump_flags & TDF_DETAILS))
2391 fprintf (dump_file,
2392 "Improved number of alias checks from %d to %d\n",
2393 alias_ddrs->length (), comp_alias_pairs->length ());
2396 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2397 checks and version LOOP under the condition of these checks. */
2399 static void
2400 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2402 profile_probability prob;
2403 basic_block cond_bb;
2404 struct loop *nloop;
2405 tree lhs, arg0, cond_expr = NULL_TREE;
2406 gimple_seq cond_stmts = NULL;
2407 gimple *call_stmt = NULL;
2408 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2410 /* Generate code for runtime alias checks if necessary. */
2411 gcc_assert (alias_ddrs->length () > 0);
2413 if (dump_file && (dump_flags & TDF_DETAILS))
2414 fprintf (dump_file,
2415 "Version loop <%d> with runtime alias check\n", loop->num);
2417 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2418 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2419 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2420 is_gimple_val, NULL_TREE);
2422 /* Rely on the vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2423 if (flag_tree_loop_vectorize)
2425 /* Generate internal function call for loop distribution alias check. */
2426 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2427 2, NULL_TREE, cond_expr);
2428 lhs = make_ssa_name (boolean_type_node);
2429 gimple_call_set_lhs (call_stmt, lhs);
2431 else
2432 lhs = cond_expr;
2434 prob = profile_probability::guessed_always ().apply_scale (9, 10);
2435 initialize_original_copy_tables ();
2436 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2437 prob, prob.invert (), true);
2438 free_original_copy_tables ();
2439 /* Record the original loop number in newly generated loops. In case of
2440 distribution, the original loop will be distributed and the new loop
2441 is kept. */
2442 loop->orig_loop_num = nloop->num;
2443 nloop->orig_loop_num = nloop->num;
2444 nloop->dont_vectorize = true;
2445 nloop->force_vectorize = false;
2447 if (call_stmt)
2449 /* Record the new loop's num in IFN_LOOP_DIST_ALIAS because the original
2450 loop could be destroyed. */
2451 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2452 gimple_call_set_arg (call_stmt, 0, arg0);
2453 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2456 if (cond_stmts)
2458 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2459 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2461 update_ssa (TODO_update_ssa);
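/* Roughly, the versioned result looks like this (a sketch, assuming
   flag_tree_loop_vectorize is set):

   |  _t = .LOOP_DIST_ALIAS (<new loop num>, <alias check cond>);
   |  if (_t)
   |    <original loop>   <- distributed afterwards
   |  else
   |    <copied loop>     <- kept as-is, never vectorized

   The vectorizer later folds the internal function call to a
   constant, selecting one version and discarding the other.  */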
2464 /* Return true if loop versioning is needed to distribute PARTITIONS.
2465 ALIAS_DDRS are data dependence relations for the runtime alias check. */
2467 static inline bool
2468 version_for_distribution_p (vec<struct partition *> *partitions,
2469 vec<ddr_p> *alias_ddrs)
2471 /* No need to version loop if we have only one partition. */
2472 if (partitions->length () == 1)
2473 return false;
2475 /* Need to version loop if runtime alias check is necessary. */
2476 return (alias_ddrs->length () > 0);
2479 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
2480 ALIAS_DDRS contains ddrs which need runtime alias check. */
2482 static void
2483 finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
2484 vec<ddr_p> *alias_ddrs)
2486 unsigned i;
2487 struct partition *partition, *a;
2489 if (partitions->length () == 1
2490 || alias_ddrs->length () > 0)
2491 return;
2493 unsigned num_builtin = 0, num_normal = 0;
2494 bool same_type_p = true;
2495 enum partition_type type = ((*partitions)[0])->type;
2496 for (i = 0; partitions->iterate (i, &partition); ++i)
2498 same_type_p &= (type == partition->type);
2499 if (partition->kind != PKIND_NORMAL)
2500 num_builtin++;
2501 else
2502 num_normal++;
2505 /* Don't distribute the current loop into too many loops, given that we
2506 don't have a memory stream cost model. Be even more conservative in
2507 the case of loop nest distribution. */
2508 if ((same_type_p && num_builtin == 0)
2509 || (loop->inner != NULL
2510 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2511 || (loop->inner == NULL
2512 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2514 a = (*partitions)[0];
2515 for (i = 1; partitions->iterate (i, &partition); ++i)
2517 partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2518 partition_free (partition);
2520 partitions->truncate (1);
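/* E.g. (hypothetical outcome): three PKIND_NORMAL partitions of the
   same type are fused back into one because splitting them promises
   no gain without a memory stream cost model, while a memset-like
   builtin partition next to a single normal one may survive.  */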
2524 /* Distributes the code from LOOP in such a way that producer statements
2525 are placed before consumer statements. Tries to separate only the
2526 statements from STMTS into separate loops. Returns the number of
2527 distributed loops. Set NB_CALLS to number of generated builtin calls.
2528 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2530 static int
2531 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2532 control_dependences *cd, int *nb_calls, bool *destroy_p)
2534 ddrs_table = new hash_table<ddr_hasher> (389);
2535 struct graph *rdg;
2536 partition *partition;
2537 bool any_builtin;
2538 int i, nbp;
2540 *destroy_p = false;
2541 *nb_calls = 0;
2542 loop_nest.create (0);
2543 if (!find_loop_nest (loop, &loop_nest))
2545 loop_nest.release ();
2546 delete ddrs_table;
2547 return 0;
2550 datarefs_vec.create (20);
2551 rdg = build_rdg (loop, cd);
2552 if (!rdg)
2554 if (dump_file && (dump_flags & TDF_DETAILS))
2555 fprintf (dump_file,
2556 "Loop %d not distributed: failed to build the RDG.\n",
2557 loop->num);
2559 loop_nest.release ();
2560 free_data_refs (datarefs_vec);
2561 delete ddrs_table;
2562 return 0;
2565 if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2567 if (dump_file && (dump_flags & TDF_DETAILS))
2568 fprintf (dump_file,
2569 "Loop %d not distributed: too many memory references.\n",
2570 loop->num);
2572 free_rdg (rdg);
2573 loop_nest.release ();
2574 free_data_refs (datarefs_vec);
2575 delete ddrs_table;
2576 return 0;
2579 data_reference_p dref;
2580 for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2581 dref->aux = (void *) (uintptr_t) i;
2583 if (dump_file && (dump_flags & TDF_DETAILS))
2584 dump_rdg (dump_file, rdg);
2586 auto_vec<struct partition *, 3> partitions;
2587 rdg_build_partitions (rdg, stmts, &partitions);
2589 auto_vec<ddr_p> alias_ddrs;
2591 auto_bitmap stmt_in_all_partitions;
2592 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2593 for (i = 1; partitions.iterate (i, &partition); ++i)
2594 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2596 any_builtin = false;
2597 FOR_EACH_VEC_ELT (partitions, i, partition)
2599 classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2600 any_builtin |= partition_builtin_p (partition);
2603 /* If we are only distributing patterns but did not detect any,
2604 simply bail out. */
2605 if (!flag_tree_loop_distribution
2606 && !any_builtin)
2608 nbp = 0;
2609 goto ldist_done;
2612 /* If we are only distributing patterns, fuse all partitions that
2613 were not classified as builtins. This also avoids chopping
2614 a loop into pieces separated by builtin calls. That is, we
2615 want either no loop body or a single one remaining. */
2616 struct partition *into;
2617 if (!flag_tree_loop_distribution)
2619 for (i = 0; partitions.iterate (i, &into); ++i)
2620 if (!partition_builtin_p (into))
2621 break;
2622 for (++i; partitions.iterate (i, &partition); ++i)
2623 if (!partition_builtin_p (partition))
2625 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2626 partitions.unordered_remove (i);
2627 partition_free (partition);
2628 i--;
2632 /* Due to limitations in the transform phase we have to fuse all
2633 reduction partitions into the last partition so the existing
2634 loop will contain all loop-closed PHI nodes. */
2635 for (i = 0; partitions.iterate (i, &into); ++i)
2636 if (partition_reduction_p (into))
2637 break;
2638 for (i = i + 1; partitions.iterate (i, &partition); ++i)
2639 if (partition_reduction_p (partition))
2641 partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2642 partitions.unordered_remove (i);
2643 partition_free (partition);
2644 i--;
2647 /* Apply our simple cost model - fuse partitions with similar
2648 memory accesses. */
2649 for (i = 0; partitions.iterate (i, &into); ++i)
2651 bool changed = false;
2652 if (partition_builtin_p (into))
2653 continue;
2654 for (int j = i + 1;
2655 partitions.iterate (j, &partition); ++j)
2657 if (share_memory_accesses (rdg, into, partition))
2659 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2660 partitions.unordered_remove (j);
2661 partition_free (partition);
2662 j--;
2663 changed = true;
2666 /* If we fused 0 1 2 in step 1 to 0,2 1 because 0 and 2 have similar
2667 accesses, and 1 and 2 also have similar accesses while 0 and 1 do
2668 not, then in the next iteration we would fail to consider merging
2669 1 into 0,2. So try again if we did any merging into 0. */
2670 if (changed)
2671 i--;
2674 /* Build the partition dependence graph and fuse partitions in
2675 strongly connected components. */
2676 if (partitions.length () > 1)
2678 /* We don't support loop nest distribution under a runtime alias check
2679 since it's not likely to enable many vectorization opportunities. */
2680 if (loop->inner)
2681 merge_dep_scc_partitions (rdg, &partitions, false);
2682 else
2684 merge_dep_scc_partitions (rdg, &partitions, true);
2685 if (partitions.length () > 1)
2686 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2690 finalize_partitions (loop, &partitions, &alias_ddrs);
2692 nbp = partitions.length ();
2693 if (nbp == 0
2694 || (nbp == 1 && !partition_builtin_p (partitions[0]))
2695 || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2697 nbp = 0;
2698 goto ldist_done;
2701 if (version_for_distribution_p (&partitions, &alias_ddrs))
2702 version_loop_by_alias_check (loop, &alias_ddrs);
2704 if (dump_file && (dump_flags & TDF_DETAILS))
2706 fprintf (dump_file,
2707 "distribute loop <%d> into partitions:\n", loop->num);
2708 dump_rdg_partitions (dump_file, partitions);
2711 FOR_EACH_VEC_ELT (partitions, i, partition)
2713 if (partition_builtin_p (partition))
2714 (*nb_calls)++;
2715 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2718 ldist_done:
2719 loop_nest.release ();
2720 free_data_refs (datarefs_vec);
2721 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2722 iter != ddrs_table->end (); ++iter)
2724 free_dependence_relation (*iter);
2725 *iter = NULL;
2727 delete ddrs_table;
2729 FOR_EACH_VEC_ELT (partitions, i, partition)
2730 partition_free (partition);
2732 free_rdg (rdg);
2733 return nbp - *nb_calls;
2736 /* Distribute all loops in the current function. */
2738 namespace {
2740 const pass_data pass_data_loop_distribution =
2742 GIMPLE_PASS, /* type */
2743 "ldist", /* name */
2744 OPTGROUP_LOOP, /* optinfo_flags */
2745 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2746 ( PROP_cfg | PROP_ssa ), /* properties_required */
2747 0, /* properties_provided */
2748 0, /* properties_destroyed */
2749 0, /* todo_flags_start */
2750 0, /* todo_flags_finish */
2753 class pass_loop_distribution : public gimple_opt_pass
2755 public:
2756 pass_loop_distribution (gcc::context *ctxt)
2757 : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2760 /* opt_pass methods: */
2761 virtual bool gate (function *)
2763 return flag_tree_loop_distribution
2764 || flag_tree_loop_distribute_patterns;
2767 virtual unsigned int execute (function *);
2769 }; // class pass_loop_distribution
2772 /* Given LOOP, this function records seed statements for distribution in
2773 WORK_LIST. Return false if there is nothing for distribution. */
2775 static bool
2776 find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
2778 basic_block *bbs = get_loop_body_in_dom_order (loop);
2780 /* Initialize the worklist with stmts we seed the partitions with. */
2781 for (unsigned i = 0; i < loop->num_nodes; ++i)
2783 for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2784 !gsi_end_p (gsi); gsi_next (&gsi))
2786 gphi *phi = gsi.phi ();
2787 if (virtual_operand_p (gimple_phi_result (phi)))
2788 continue;
2789 /* Distribute stmts which have defs that are used outside of
2790 the loop. */
2791 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
2792 continue;
2793 work_list->safe_push (phi);
2795 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2796 !gsi_end_p (gsi); gsi_next (&gsi))
2798 gimple *stmt = gsi_stmt (gsi);
2800 /* If there is a stmt with side-effects bail out - we
2801 cannot and should not distribute this loop. */
2802 if (gimple_has_side_effects (stmt))
2804 free (bbs);
2805 return false;
2808 /* Distribute stmts which have defs that are used outside of
2809 the loop. */
2810 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
2812 /* Otherwise only distribute stores for now. */
2813 else if (!gimple_vdef (stmt))
2814 continue;
2816 work_list->safe_push (stmt);
2819 free (bbs);
2820 return work_list->length () > 0;
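/* E.g. in a hypothetical loop like:

   |s = 0;
   |for (i = 0; i < n; ++i)
   |  {
   |    t = y[i] * 2;     <- feeds the seeds, not a seed itself
   |    x[i] = t + 1;     <- store, seeds a partition
   |    s = s + t;        <- def used after the loop, seeds another
   |  }
   |... = s;

   both the store and the reduction statement are pushed into
   WORK_LIST, while the computation of t is only pulled into a
   partition later via RDG dependences.  */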
2823 /* Given innermost LOOP, return the outermost enclosing loop that forms a
2824 perfect loop nest. */
2826 static struct loop *
2827 prepare_perfect_loop_nest (struct loop *loop)
2829 struct loop *outer = loop_outer (loop);
2830 tree niters = number_of_latch_executions (loop);
2832 /* TODO: We only support innermost 2-level loop nest distribution
2833 because of compilation time issues for now. This should be relaxed
2834 in the future. */
2835 while (loop->inner == NULL
2836 && loop_outer (outer)
2837 && outer->inner == loop && loop->next == NULL
2838 && single_exit (outer)
2839 && optimize_loop_for_speed_p (outer)
2840 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
2841 && (niters = number_of_latch_executions (outer)) != NULL_TREE
2842 && niters != chrec_dont_know)
2844 loop = outer;
2845 outer = loop_outer (loop);
2848 return loop;
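/* E.g. for a hypothetical nest like:

   |for (i = 0; i < n; ++i)    <- single exit, known niters
   |  for (j = 0; j < m; ++j)
   |    x[i][j] = 0;

   the inner loop is passed in and the outer loop is returned, so the
   caller first tries to distribute the whole nest and then falls
   back to inner loops if that fails.  */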
2851 unsigned int
2852 pass_loop_distribution::execute (function *fun)
2854 struct loop *loop;
2855 bool changed = false;
2856 basic_block bb;
2857 control_dependences *cd = NULL;
2858 auto_vec<loop_p> loops_to_be_destroyed;
2860 if (number_of_loops (fun) <= 1)
2861 return 0;
2863 /* Compute topological order for basic blocks. Topological order is
2864 needed because data dependence is computed for data references in
2865 lexicographical order. */
2866 if (bb_top_order_index == NULL)
2868 int rpo_num;
2869 int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
2871 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
2872 bb_top_order_index_size = last_basic_block_for_fn (cfun);
2873 rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
2874 for (int i = 0; i < rpo_num; i++)
2875 bb_top_order_index[rpo[i]] = i;
2877 free (rpo);
2880 FOR_ALL_BB_FN (bb, fun)
2882 gimple_stmt_iterator gsi;
2883 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2884 gimple_set_uid (gsi_stmt (gsi), -1);
2885 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2886 gimple_set_uid (gsi_stmt (gsi), -1);
2889 /* We can at the moment only distribute non-nested loops, thus restrict
2890 walking to innermost loops. */
2891 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
2893 /* Don't distribute loops with multiple exit edges, or cold loops. */
2894 if (!single_exit (loop)
2895 || !optimize_loop_for_speed_p (loop))
2896 continue;
2898 /* Don't distribute loop if niters is unknown. */
2899 tree niters = number_of_latch_executions (loop);
2900 if (niters == NULL_TREE || niters == chrec_dont_know)
2901 continue;
2903 /* Get the perfect loop nest for distribution. */
2904 loop = prepare_perfect_loop_nest (loop);
2905 for (; loop; loop = loop->inner)
2907 auto_vec<gimple *> work_list;
2908 if (!find_seed_stmts_for_distribution (loop, &work_list))
2909 break;
2911 const char *str = loop->inner ? " nest" : "";
2912 location_t loc = find_loop_location (loop);
2913 if (!cd)
2915 calculate_dominance_info (CDI_DOMINATORS);
2916 calculate_dominance_info (CDI_POST_DOMINATORS);
2917 cd = new control_dependences ();
2918 free_dominance_info (CDI_POST_DOMINATORS);
2921 bool destroy_p;
2922 int nb_generated_loops, nb_generated_calls;
2923 nb_generated_loops = distribute_loop (loop, work_list, cd,
2924 &nb_generated_calls,
2925 &destroy_p);
2926 if (destroy_p)
2927 loops_to_be_destroyed.safe_push (loop);
2929 if (nb_generated_loops + nb_generated_calls > 0)
2931 changed = true;
2932 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
2933 loc, "Loop%s %d distributed: split to %d loops "
2934 "and %d library calls.\n", str, loop->num,
2935 nb_generated_loops, nb_generated_calls);
2937 break;
2940 if (dump_file && (dump_flags & TDF_DETAILS))
2941 fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
2945 if (cd)
2946 delete cd;
2948 if (bb_top_order_index != NULL)
2950 free (bb_top_order_index);
2951 bb_top_order_index = NULL;
2952 bb_top_order_index_size = 0;
2955 if (changed)
2957 /* Destroy loop bodies that could not be reused. Do this late as we
2958 otherwise can end up referring to stale data in control dependences. */
2959 unsigned i;
2960 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
2961 destroy_loop (loop);
2963 /* Cached scalar evolutions now may refer to wrong or non-existing
2964 loops. */
2965 scev_reset_htab ();
2966 mark_virtual_operands_for_renaming (fun);
2967 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
2970 checking_verify_loop_structure ();
2972 return 0;
2975 } // anon namespace
2977 gimple_opt_pass *
2978 make_pass_loop_distribution (gcc::context *ctxt)
2980 return new pass_loop_distribution (ctxt);