/* Loop distribution.
   Copyright (C) 2006-2017 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |    A(I) = B(I) + C
   |    D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |   A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |   D(I) = A(I-1)*E
   |ENDDO

   Loop distribution is the dual of loop fusion.  It separates statements
   of a loop (or loop nest) into multiple loops (or loop nests) with the
   same loop header.  The major goal is to separate statements which may
   be vectorized from those that can't.  This pass implements distribution
   in the following steps:

     1) Seed partitions with specific types of statements.  For now we
	support two types of seed statements: statements defining variables
	used outside of the loop; statements storing to memory.
     2) Build a reduced dependence graph (RDG) for the loop to be
	distributed.  The vertices (RDG:V) model all statements in the loop
	and the edges (RDG:E) model flow and control dependencies between
	statements.
     3) Apart from the RDG, compute data dependencies between memory
	references.
     4) Starting from a seed statement, build up a partition by adding
	dependent statements according to the RDG's dependence information.
	A partition is classified as parallel type if it can be executed in
	parallel; or as sequential type if it can't.  A parallel type
	partition is further classified as a builtin kind if it can be
	implemented as builtin function calls.
     5) Build a partition dependence graph (PG) based on data dependencies.
	The vertices (PG:V) model all partitions and the edges (PG:E) model
	all data dependencies between every pair of partitions.  In general,
	a data dependence is either known at compilation time or unknown.
	In C family languages, there are quite a few dependencies unknown at
	compilation time because of possible aliasing between data
	references.  We categorize PG's edges into two types: "true" edges,
	which represent data dependencies known at compilation time; and
	"alias" edges for all other data dependencies.
     6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
	Merge the partitions in each strongly connected component (SCC)
	correspondingly.  Build a new PG for the merged partitions.
     7) Traverse PG again, this time with both "true" and "alias" edges
	included.  We try to break SCCs by removing some edges.  Because
	SCCs by "true" edges are all fused in step 6), we can break SCCs
	by removing some "alias" edges.  It's NP-hard to choose an optimal
	edge set; fortunately a simple approximation is good enough for us
	given the small problem scale.
     8) Collect all data dependencies of the removed "alias" edges.  Create
	runtime alias checks for the collected data dependencies.
     9) Version the loop under the condition of the runtime alias checks.
	Given that loop distribution generally introduces additional
	overhead, it is only useful if vectorization is achieved in the
	distributed loop.  We version the loop with the internal function
	call IFN_LOOP_DIST_ALIAS.  If no distributed loop can be vectorized,
	we simply remove the distributed loops and recover the original one.

   TODO:
     1) We only distribute innermost loops now.  This pass should handle
	loop nests in the future.
     2) We only fuse partitions in an SCC now.  A better fusion algorithm
	is desired to minimize loop overhead, maximize parallelism and
	maximize data reuse.  */
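
/* As a concrete C-level sketch of steps 1)-4) and the builtin
   classification below, the loop

     for (i = 0; i < n; ++i)
       {
	 a[i] = 0;
	 sum += b[i];
       }

   where sum is used after the loop, seeds two partitions: one rooted at
   the store to a[i] and one rooted at the statements computing sum.  The
   store partition is parallel and matches the memset pattern, so it can
   be emitted as (roughly) memset (a, 0, n * sizeof (*a)); the reduction
   partition keeps its loop and, being a reduction, is ordered last.  */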

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "gimple-pretty-print.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "params.h"
#include "tree-vectorizer.h"


#define MAX_DATAREFS_NUM \
  ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))

/* Hashtable helpers.  */

struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
{
  static inline hashval_t hash (const data_dependence_relation *);
  static inline bool equal (const data_dependence_relation *,
			    const data_dependence_relation *);
};

/* Hash function for data dependence.  */

inline hashval_t
ddr_hasher::hash (const data_dependence_relation *ddr)
{
  inchash::hash h;
  h.add_ptr (DDR_A (ddr));
  h.add_ptr (DDR_B (ddr));
  return h.end ();
}

/* Hash table equality function for data dependence.  */

inline bool
ddr_hasher::equal (const data_dependence_relation *ddr1,
		   const data_dependence_relation *ddr2)
{
  return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
}

/* The loop (nest) to be distributed.  */
static vec<loop_p> loop_nest;

/* Vector of data references in the loop to be distributed.  */
static vec<data_reference_p> datarefs_vec;

/* Store index of data reference in aux field.  */
#define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)

/* Hash table for data dependence relation in the loop to be distributed.  */
static hash_table<ddr_hasher> *ddrs_table;

/* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
struct rdg_vertex
{
  /* The statement represented by this vertex.  */
  gimple *stmt;

  /* Vector of data-references in this statement.  */
  vec<data_reference_p> datarefs;

  /* True when the statement contains a write to memory.  */
  bool has_mem_write;

  /* True when the statement contains a read from memory.  */
  bool has_mem_reads;
};

#define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
#define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
#define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))

/* Data dependence type.  */

enum rdg_dep_type
{
  /* Read After Write (RAW).  */
  flow_dd = 'f',

  /* Control dependence (execute conditional on).  */
  control_dd = 'c'
};

/* Dependence information attached to an edge of the RDG.  */

struct rdg_edge
{
  /* Type of the dependence.  */
  enum rdg_dep_type type;
};

#define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type

/* Dump vertex I in RDG to FILE.  */

static void
dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
{
  struct vertex *v = &(rdg->vertices[i]);
  struct graph_edge *e;

  fprintf (file, "(vertex %d: (%s%s) (in:", i,
	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      fprintf (file, " %d", e->src);

  fprintf (file, ") (out:");

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      fprintf (file, " %d", e->dest);

  fprintf (file, ")\n");
  print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
  fprintf (file, ")\n");
}

/* Call dump_rdg_vertex on stderr.  */

DEBUG_FUNCTION void
debug_rdg_vertex (struct graph *rdg, int i)
{
  dump_rdg_vertex (stderr, rdg, i);
}

/* Dump the reduced dependence graph RDG to FILE.  */

static void
dump_rdg (FILE *file, struct graph *rdg)
{
  fprintf (file, "(rdg\n");
  for (int i = 0; i < rdg->n_vertices; i++)
    dump_rdg_vertex (file, rdg, i);
  fprintf (file, ")\n");
}

/* Call dump_rdg on stderr.  */

DEBUG_FUNCTION void
debug_rdg (struct graph *rdg)
{
  dump_rdg (stderr, rdg);
}

static void
dot_rdg_1 (FILE *file, struct graph *rdg)
{
  int i;
  pretty_printer buffer;
  pp_needs_newline (&buffer) = false;
  buffer.buffer->stream = file;

  fprintf (file, "digraph RDG {\n");

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      fprintf (file, "%d [label=\"[%d] ", i, i);
      pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
      pp_flush (&buffer);
      fprintf (file, "\"]\n");

      /* Highlight reads from memory.  */
      if (RDG_MEM_READS_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=green]\n", i);

      /* Highlight stores to memory.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=red]\n", i);

      if (v->succ)
	for (e = v->succ; e; e = e->succ_next)
	  switch (RDGE_TYPE (e))
	    {
	    case flow_dd:
	      /* These are the most common dependences: don't print these.  */
	      fprintf (file, "%d -> %d \n", i, e->dest);
	      break;

	    case control_dd:
	      fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
	      break;

	    default:
	      gcc_unreachable ();
	    }
    }

  fprintf (file, "}\n\n");
}

/* Display the Reduced Dependence Graph using dotty.  */

DEBUG_FUNCTION void
dot_rdg (struct graph *rdg)
{
  /* When debugging, you may want to enable the following code.  */
#ifdef HAVE_POPEN
  FILE *file = popen ("dot -Tx11", "w");
  if (!file)
    return;
  dot_rdg_1 (file, rdg);
  fflush (file);
  close (fileno (file));
  pclose (file);
#else
  dot_rdg_1 (stderr, rdg);
#endif
}

/* Returns the index of STMT in RDG.  */

static int
rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
{
  int index = gimple_uid (stmt);
  gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
  return index;
}

/* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
   the index of DEF in RDG.  */

static void
create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
{
  use_operand_p imm_use_p;
  imm_use_iterator iterator;

  FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
    {
      struct graph_edge *e;
      int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));

      if (use < 0)
	continue;

      e = add_edge (rdg, idef, use);
      e->data = XNEW (struct rdg_edge);
      RDGE_TYPE (e) = flow_dd;
    }
}

/* Creates an edge for the control dependences of BB to the vertex V.  */

static void
create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
				    int v, control_dependences *cd)
{
  bitmap_iterator bi;
  unsigned edge_n;
  EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
			    0, edge_n, bi)
    {
      basic_block cond_bb = cd->get_edge_src (edge_n);
      gimple *stmt = last_stmt (cond_bb);
      if (stmt && is_ctrl_stmt (stmt))
	{
	  struct graph_edge *e;
	  int c = rdg_vertex_for_stmt (rdg, stmt);
	  if (c < 0)
	    continue;

	  e = add_edge (rdg, c, v);
	  e->data = XNEW (struct rdg_edge);
	  RDGE_TYPE (e) = control_dd;
	}
    }
}

/* Creates the flow-dependence edges of the reduced dependence graph RDG.  */

static void
create_rdg_flow_edges (struct graph *rdg)
{
  int i;
  def_operand_p def_p;
  ssa_op_iter iter;

  for (i = 0; i < rdg->n_vertices; i++)
    FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
			      iter, SSA_OP_DEF)
      create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
}

/* Creates the control-dependence edges of the reduced dependence graph RDG,
   using the control dependences CD of LOOP.  */

static void
create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  edge_iterator ei;
	  edge e;
	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
	    if (flow_bb_inside_loop_p (loop, e->src))
	      create_edge_for_control_dependence (rdg, e->src, i, cd);
	}
      else
	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
    }
}

/* Build the vertices of the reduced dependence graph RDG.  Return false
   if that failed.  */

static bool
create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
{
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      struct vertex *v = &(rdg->vertices[i]);

      /* Record statement to vertex mapping.  */
      gimple_set_uid (stmt, i);

      v->data = XNEW (struct rdg_vertex);
      RDGV_STMT (v) = stmt;
      RDGV_DATAREFS (v).create (0);
      RDGV_HAS_MEM_WRITE (v) = false;
      RDGV_HAS_MEM_READS (v) = false;
      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      unsigned drp = datarefs_vec.length ();
      if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
	return false;
      for (unsigned j = drp; j < datarefs_vec.length (); ++j)
	{
	  data_reference_p dr = datarefs_vec[j];
	  if (DR_IS_READ (dr))
	    RDGV_HAS_MEM_READS (v) = true;
	  else
	    RDGV_HAS_MEM_WRITE (v) = true;
	  RDGV_DATAREFS (v).safe_push (dr);
	}
    }
  return true;
}

/* Array mapping basic block's index to its topological order.  */
static int *bb_top_order_index;
/* And size of the array.  */
static int bb_top_order_index_size;

/* If X has a smaller topological sort number than Y, returns -1;
   if greater, returns 1.  */

static int
bb_top_order_cmp (const void *x, const void *y)
{
  basic_block bb1 = *(const basic_block *) x;
  basic_block bb2 = *(const basic_block *) y;

  gcc_assert (bb1->index < bb_top_order_index_size
	      && bb2->index < bb_top_order_index_size);
  gcc_assert (bb1 == bb2
	      || bb_top_order_index[bb1->index]
		 != bb_top_order_index[bb2->index]);

  return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
}

/* Initialize STMTS with all the statements of LOOP.  We use topological
   order to discover all statements.  The order is important because
   generate_loops_for_partition is using the same traversal for identifying
   statements in loop copies.  */

static void
stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
{
  unsigned int i;
  basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
	  stmts->safe_push (bsi.phi ());

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
	    stmts->safe_push (stmt);
	}
    }

  free (bbs);
}

/* Free the reduced dependence graph RDG.  */

static void
free_rdg (struct graph *rdg)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      for (e = v->succ; e; e = e->succ_next)
	free (e->data);

      if (v->data)
	{
	  gimple_set_uid (RDGV_STMT (v), -1);
	  (RDGV_DATAREFS (v)).release ();
	  free (v->data);
	}
    }

  free_graph (rdg);
}

/* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
   LOOP, and one edge per flow dependence or control dependence from the
   control dependences CD.  While visiting each statement, data references
   are also collected and recorded in the global data DATAREFS_VEC.  */

static struct graph *
build_rdg (struct loop *loop, control_dependences *cd)
{
  struct graph *rdg;

  /* Create the RDG vertices from the stmts of the loop nest.  */
  auto_vec<gimple *, 10> stmts;
  stmts_from_loop (loop, &stmts);
  rdg = new_graph (stmts.length ());
  if (!create_rdg_vertices (rdg, stmts, loop))
    {
      free_rdg (rdg);
      return NULL;
    }
  stmts.release ();

  create_rdg_flow_edges (rdg);
  if (cd)
    create_rdg_cd_edges (rdg, cd, loop);

  return rdg;
}

/* Kind of distributed loop.  */
enum partition_kind {
    PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
};

/* Type of distributed loop.  */
enum partition_type {
    /* The distributed loop can be executed in parallel.  */
    PTYPE_PARALLEL = 0,
    /* The distributed loop has to be executed sequentially.  */
    PTYPE_SEQUENTIAL
};

/* Partition for loop distribution.  */
struct partition
{
  /* Statements of the partition.  */
  bitmap stmts;
  /* True if the partition defines a variable which is used outside of
     the loop.  */
  bool reduction_p;
  /* For a builtin partition, true if it executes one iteration more than
     the number of loop (latch) iterations.  */
  bool plus_one;
  enum partition_kind kind;
  enum partition_type type;
  /* Data-references a kind != PKIND_NORMAL partition is about.  */
  data_reference_p main_dr;
  data_reference_p secondary_dr;
  /* Number of loop (latch) iterations.  */
  tree niter;
  /* Data references in the partition.  */
  bitmap datarefs;
};


/* Allocate and initialize a partition.  */

static partition *
partition_alloc (void)
{
  partition *partition = XCNEW (struct partition);
  partition->stmts = BITMAP_ALLOC (NULL);
  partition->reduction_p = false;
  partition->kind = PKIND_NORMAL;
  partition->datarefs = BITMAP_ALLOC (NULL);
  return partition;
}

/* Free PARTITION.  */

static void
partition_free (partition *partition)
{
  BITMAP_FREE (partition->stmts);
  BITMAP_FREE (partition->datarefs);
  free (partition);
}

/* Returns true if the partition can be generated as a builtin.  */

static bool
partition_builtin_p (partition *partition)
{
  return partition->kind != PKIND_NORMAL;
}

/* Returns true if the partition contains a reduction.  */

static bool
partition_reduction_p (partition *partition)
{
  return partition->reduction_p;
}

/* Partitions are fused for different reasons.  */
enum fuse_type
{
  FUSE_NON_BUILTIN = 0,
  FUSE_REDUCTION = 1,
  FUSE_SHARE_REF = 2,
  FUSE_SAME_SCC = 3,
  FUSE_FINALIZE = 4
};

/* Description of the different fusing reasons.  */
static const char *fuse_message[] = {
  "they are non-builtins",
  "they have reductions",
  "they have shared memory refs",
  "they are in the same dependence scc",
  "there is no point to distribute loop"};

static void
update_type_for_merge (struct graph *, partition *, partition *);

/* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
   graph; if it is non-NULL, we also update the type of the result
   partition.  */

static void
partition_merge_into (struct graph *rdg, partition *dest,
		      partition *partition, enum fuse_type ft)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
      fprintf (dump_file, "  Part 1: ");
      dump_bitmap (dump_file, dest->stmts);
      fprintf (dump_file, "  Part 2: ");
      dump_bitmap (dump_file, partition->stmts);
    }

  dest->kind = PKIND_NORMAL;
  if (dest->type == PTYPE_PARALLEL)
    dest->type = partition->type;

  bitmap_ior_into (dest->stmts, partition->stmts);
  if (partition_reduction_p (partition))
    dest->reduction_p = true;

  /* Further check if any data dependence prevents us from executing the
     new partition in parallel.  */
  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
    update_type_for_merge (rdg, dest, partition);

  bitmap_ior_into (dest->datarefs, partition->datarefs);
}


/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
   the LOOP.  */

static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (!is_gimple_debug (use_stmt)
	  && loop != loop_containing_stmt (use_stmt))
	return true;
    }

  return false;
}

/* Returns true when STMT defines a scalar variable used after the
   loop LOOP.  */

static bool
stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
{
  def_operand_p def_p;
  ssa_op_iter op_iter;

  if (gimple_code (stmt) == GIMPLE_PHI)
    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);

  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
      return true;

  return false;
}

/* Return a copy of LOOP placed before LOOP.  */

static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
  gcc_assert (res != NULL);
  free_original_copy_tables ();
  delete_update_ssa ();

  return res;
}

/* Creates an empty basic block after LOOP.  */

static void
create_bb_after_loop (struct loop *loop)
{
  edge exit = single_exit (loop);

  if (!exit)
    return;

  split_edge (exit);
}

/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  */

static void
generate_loops_for_partition (struct loop *loop, partition *partition,
			      bool copy_p)
{
  unsigned i;
  basic_block *bbs;

  if (copy_p)
    {
      int orig_loop_num = loop->orig_loop_num;
      loop = copy_loop_before (loop);
      gcc_assert (loop != NULL);
      loop->orig_loop_num = orig_loop_num;
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }
  else
    {
      /* Origin number is set to the new versioned loop's num.  */
      gcc_assert (loop->orig_loop_num != loop->num);
    }

  /* Remove stmts not in the PARTITION bitmap.  */
  bbs = get_loop_body_in_dom_order (loop);

  if (MAY_HAVE_DEBUG_STMTS)
    for (i = 0; i < loop->num_nodes; i++)
      {
	basic_block bb = bbs[i];

	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gphi *phi = bsi.phi ();
	    if (!virtual_operand_p (gimple_phi_result (phi))
		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	      reset_debug_uses (phi);
	  }

	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gimple *stmt = gsi_stmt (bsi);
	    if (gimple_code (stmt) != GIMPLE_LABEL
		&& !is_gimple_debug (stmt)
		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	      reset_debug_uses (stmt);
	  }
      }

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	{
	  gphi *phi = bsi.phi ();
	  if (!virtual_operand_p (gimple_phi_result (phi))
	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	    remove_phi_node (&bsi, true);
	  else
	    gsi_next (&bsi);
	}

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL
	      && !is_gimple_debug (stmt)
	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	    {
	      /* Choose an arbitrary path through the empty CFG part
		 that this unnecessary control stmt controls.  */
	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
		{
		  gimple_cond_make_false (cond_stmt);
		  update_stmt (stmt);
		}
	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
		{
		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
		  gimple_switch_set_index
		    (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
		  update_stmt (stmt);
		}
	      else
		{
		  unlink_stmt_vdef (stmt);
		  gsi_remove (&bsi, true);
		  release_defs (stmt);
		  continue;
		}
	    }
	  gsi_next (&bsi);
	}
    }

  free (bbs);
}

/* Build the size argument for a memory operation call.  */

static tree
build_size_arg_loc (location_t loc, data_reference_p dr, tree nb_iter,
		    bool plus_one)
{
  tree size = fold_convert_loc (loc, sizetype, nb_iter);
  if (plus_one)
    size = size_binop (PLUS_EXPR, size, size_one_node);
  size = fold_build2_loc (loc, MULT_EXPR, sizetype, size,
			  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
  size = fold_convert_loc (loc, size_type_node, size);
  return size;
}

/* Build an address argument for a memory operation call.  */

static tree
build_addr_arg_loc (location_t loc, data_reference_p dr, tree nb_bytes)
{
  tree addr_base;

  addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
  addr_base = fold_convert_loc (loc, sizetype, addr_base);

  /* Test for a negative stride, iterating over every element.  */
  if (tree_int_cst_sgn (DR_STEP (dr)) == -1)
    {
      addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
				  fold_convert_loc (loc, sizetype, nb_bytes));
      addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
				  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
    }

  return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base);
}

/* If VAL memory representation contains the same value in all bytes,
   return that value, otherwise return -1.
   E.g. for 0x24242424 return 0x24, for IEEE double
   747708026454360457216.0 return 0x44, etc.  */

static int
const_with_all_bytes_same (tree val)
{
  unsigned char buf[64];
  int i, len;

  if (integer_zerop (val)
      || (TREE_CODE (val) == CONSTRUCTOR
	  && !TREE_CLOBBER_P (val)
	  && CONSTRUCTOR_NELTS (val) == 0))
    return 0;

  if (real_zerop (val))
    {
      /* Only return 0 for +0.0, not for -0.0, which doesn't have
	 an all bytes same memory representation.  Don't transform
	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
      switch (TREE_CODE (val))
	{
	case REAL_CST:
	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
	    return 0;
	  break;
	case COMPLEX_CST:
	  if (!const_with_all_bytes_same (TREE_REALPART (val))
	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
	    return 0;
	  break;
	case VECTOR_CST:
	  unsigned int j;
	  for (j = 0; j < VECTOR_CST_NELTS (val); ++j)
	    if (const_with_all_bytes_same (VECTOR_CST_ELT (val, j)))
	      break;
	  if (j == VECTOR_CST_NELTS (val))
	    return 0;
	  break;
	default:
	  break;
	}
    }

  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
    return -1;

  len = native_encode_expr (val, buf, sizeof (buf));
  if (len == 0)
    return -1;
  for (i = 1; i < len; i++)
    if (buf[i] != buf[0])
      return -1;
  return buf[0];
}

/* Generate a call to memset for PARTITION in LOOP.  */

static void
generate_memset_builtin (struct loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *stmt, *fn_call;
  tree mem, fn, nb_bytes;
  location_t loc;
  tree val;

  stmt = DR_STMT (partition->main_dr);
  loc = gimple_location (stmt);

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
				 partition->plus_one);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  mem = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);

  /* This exactly matches the pattern recognition in classify_partition.  */
  val = gimple_assign_rhs1 (stmt);
  /* Handle constants like 0x15151515 and similarly
     floating point constants etc. where all bytes are the same.  */
  int bytev = const_with_all_bytes_same (val);
  if (bytev != -1)
    val = build_int_cst (integer_type_node, bytev);
  else if (TREE_CODE (val) == INTEGER_CST)
    val = fold_convert (integer_type_node, val);
  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
    {
      tree tem = make_ssa_name (integer_type_node);
      gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
      val = tem;
    }

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "generated memset");
      if (bytev == 0)
	fprintf (dump_file, " zero\n");
      else
	fprintf (dump_file, "\n");
    }
}

/* Generate a call to memcpy for PARTITION in LOOP.  */

static void
generate_memcpy_builtin (struct loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *stmt, *fn_call;
  tree dest, src, fn, nb_bytes;
  location_t loc;
  enum built_in_function kind;

  stmt = DR_STMT (partition->main_dr);
  loc = gimple_location (stmt);

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
				 partition->plus_one);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  dest = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
  src = build_addr_arg_loc (loc, partition->secondary_dr, nb_bytes);
  if (partition->kind == PKIND_MEMCPY
      || ! ptr_derefs_may_alias_p (dest, src))
    kind = BUILT_IN_MEMCPY;
  else
    kind = BUILT_IN_MEMMOVE;

  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
				   false, GSI_CONTINUE_LINKING);
  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);
  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (kind == BUILT_IN_MEMCPY)
	fprintf (dump_file, "generated memcpy\n");
      else
	fprintf (dump_file, "generated memmove\n");
    }
}

/* Remove and destroy the loop LOOP.  */

static void
destroy_loop (struct loop *loop)
{
  unsigned nbbs = loop->num_nodes;
  edge exit = single_exit (loop);
  basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
  basic_block *bbs;
  unsigned i;

  bbs = get_loop_body_in_dom_order (loop);

  redirect_edge_pred (exit, src);
  exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
  exit->flags |= EDGE_FALLTHRU;
  cancel_loop_tree (loop);
  rescan_loop_exit (exit, false, true);

  i = nbbs;
  do
    {
      /* We have made sure to not leave any dangling uses of SSA
	 names defined in the loop.  With the exception of virtuals.
	 Make sure we replace all uses of virtual defs that will remain
	 outside of the loop with the bare symbol as delete_basic_block
	 will release them.  */
      --i;
      for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    mark_virtual_phi_result_for_renaming (phi);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  tree vdef = gimple_vdef (stmt);
	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
	    mark_virtual_operand_for_renaming (vdef);
	}
      delete_basic_block (bbs[i]);
    }
  while (i != 0);

  free (bbs);

  set_immediate_dominator (CDI_DOMINATORS, dest,
			   recompute_dominator (CDI_DOMINATORS, dest));
}

/* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */

static bool
generate_code_for_partition (struct loop *loop,
			     partition *partition, bool copy_p)
{
  switch (partition->kind)
    {
    case PKIND_NORMAL:
      /* Reductions all have to be in the last partition.  */
      gcc_assert (!partition_reduction_p (partition)
		  || !copy_p);
      generate_loops_for_partition (loop, partition, copy_p);
      return false;

    case PKIND_MEMSET:
      generate_memset_builtin (loop, partition);
      break;

    case PKIND_MEMCPY:
    case PKIND_MEMMOVE:
      generate_memcpy_builtin (loop, partition);
      break;

    default:
      gcc_unreachable ();
    }

  /* Common tail for partitions we turn into a call.  If this was the last
     partition for which we generate code, we have to destroy the loop.  */
  if (!copy_p)
    return true;
  return false;
}

/* Return the data dependence relation for data references A and B.  The
   two data references must be in lexicographic order with respect to the
   reduced dependence graph RDG.  We first try to find the ddr in the
   global ddr hash table.  If it doesn't exist, compute the ddr and cache
   it.  */

static data_dependence_relation *
get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
{
  struct data_dependence_relation ent, **slot;
  struct data_dependence_relation *ddr;

  gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
  gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
  ent.a = a;
  ent.b = b;
  slot = ddrs_table->find_slot (&ent, INSERT);
  if (*slot == NULL)
    {
      ddr = initialize_data_dependence_relation (a, b, loop_nest);
      compute_affine_dependence (ddr, loop_nest[0]);
      *slot = ddr;
    }

  return *slot;
}

/* In the reduced dependence graph RDG for loop distribution, return true
   if the dependence between references DR1 and DR2 leads to a dependence
   cycle and such a dependence cycle can't be resolved by a runtime alias
   check.  */

static bool
data_dep_in_cycle_p (struct graph *rdg,
		     data_reference_p dr1, data_reference_p dr2)
{
  struct data_dependence_relation *ddr;

  /* Re-shuffle data-refs to be in topological order.  */
  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
    std::swap (dr1, dr2);

  ddr = get_data_dependence (rdg, dr1, dr2);

  /* In case of no data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;
  /* For an unknown data dependence, or a known data dependence which
     can't be expressed in a classic distance vector, check whether it
     can be resolved by a runtime alias check.  If so, we still consider
     the data dependence as one that won't introduce a dependence cycle.  */
  else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
	   || DDR_NUM_DIST_VECTS (ddr) == 0)
    return !runtime_alias_check_p (ddr, NULL, true);
  else if (DDR_NUM_DIST_VECTS (ddr) > 1)
    return true;
  else if (DDR_REVERSED_P (ddr)
	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
    return false;

  return true;
}
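
/* For instance, in

     t = a[i - 1];
     a[i] = t + 1;

   inside a loop over i, the load of a[i - 1] and the store to a[i] are
   connected by a loop-carried flow dependence with distance 1 whose
   direction follows statement order, so data_dep_in_cycle_p returns
   true and update_type_for_merge below marks the containing partition
   PTYPE_SEQUENTIAL.  */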

/* Given the reduced dependence graph RDG and partitions PARTITION1 and
   PARTITION2, update PARTITION1's type after merging PARTITION2 into
   PARTITION1.  */

static void
update_type_for_merge (struct graph *rdg,
		       partition *partition1, partition *partition2)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2;

  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
    {
      unsigned start = (partition1 == partition2) ? i + 1 : 0;

      dr1 = datarefs_vec[i];
      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
	{
	  dr2 = datarefs_vec[j];
	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
	    continue;

	  /* The partition can only be executed sequentially if there is
	     any data dependence cycle.  */
	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
	    {
	      partition1->type = PTYPE_SEQUENTIAL;
	      return;
	    }
	}
    }
}

/* Returns a partition with all the statements needed for computing
   the vertex V of the RDG, also including the loop exit conditions.  */

static partition *
build_rdg_partition_for_vertex (struct graph *rdg, int v)
{
  partition *partition = partition_alloc ();
  auto_vec<int, 3> nodes;
  unsigned i, j;
  int x;
  data_reference_p dr;

  graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

  FOR_EACH_VEC_ELT (nodes, i, x)
    {
      bitmap_set_bit (partition->stmts, x);

      for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
	{
	  unsigned idx = (unsigned) DR_INDEX (dr);
	  gcc_assert (idx < datarefs_vec.length ());

	  /* The partition can only be executed sequentially if there is
	     any data reference that isn't fully analyzed.  */
	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
	      || !DR_INIT (dr) || !DR_STEP (dr))
	    partition->type = PTYPE_SEQUENTIAL;

	  bitmap_set_bit (partition->datarefs, idx);
	}
    }

  if (partition->type == PTYPE_SEQUENTIAL)
    return partition;

  /* Further check if any data dependence prevents us from executing the
     partition in parallel.  */
  update_type_for_merge (rdg, partition, partition);

  return partition;
}

/* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
   For the moment we detect memset, memcpy and memmove patterns.  Bitmap
   STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.  */

static void
classify_partition (loop_p loop, struct graph *rdg, partition *partition,
		    bitmap stmt_in_all_partitions)
{
  bitmap_iterator bi;
  unsigned i;
  tree nb_iter;
  data_reference_p single_load, single_store;
  bool volatiles_p = false, plus_one = false, has_reduction = false;

  partition->kind = PKIND_NORMAL;
  partition->main_dr = NULL;
  partition->secondary_dr = NULL;
  partition->niter = NULL_TREE;
  partition->plus_one = false;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);

      if (gimple_has_volatile_ops (stmt))
	volatiles_p = true;

      /* If the stmt is not included by all partitions and there are uses
	 outside of the loop, then mark the partition as a reduction.  */
      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
	{
	  /* Due to a limitation in the transform phase we have to fuse all
	     reduction partitions.  As a result, this could cancel valid
	     loop distribution, especially for loops where an induction
	     variable is used outside of the loop.  To work around this
	     issue, we skip marking the partition as a reduction if the
	     reduction stmt belongs to all partitions.  In such a case, the
	     reduction will be computed correctly no matter how the
	     partitions are fused/distributed.  */
	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
	    {
	      partition->reduction_p = true;
	      return;
	    }
	  has_reduction = true;
	}
    }

  /* Perform general partition disqualification for builtins.  */
  if (volatiles_p
      /* Simple workaround to prevent classifying the partition as builtin
	 if it contains any use outside of the loop.  */
      || has_reduction
      || !flag_tree_loop_distribute_patterns)
    return;

  /* Detect memset and memcpy.  */
  single_load = NULL;
  single_store = NULL;
  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      data_reference_p dr;
      unsigned j;

      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      /* Any scalar stmts are ok.  */
      if (!gimple_vuse (stmt))
	continue;

      /* Otherwise just regular loads/stores.  */
      if (!gimple_assign_single_p (stmt))
	return;

      /* But exactly one store and/or load.  */
      for (j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
	{
	  tree type = TREE_TYPE (DR_REF (dr));

	  /* The memset, memcpy and memmove library calls are only
	     able to deal with the generic address space.  */
	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
	    return;

	  if (DR_IS_READ (dr))
	    {
	      if (single_load != NULL)
		return;
	      single_load = dr;
	    }
	  else
	    {
	      if (single_store != NULL)
		return;
	      single_store = dr;
	    }
	}
    }

  if (!single_store)
    return;

  nb_iter = number_of_latch_executions (loop);
  gcc_assert (nb_iter && nb_iter != chrec_dont_know);
  if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
		      gimple_bb (DR_STMT (single_store))))
    plus_one = true;

  if (single_store && !single_load)
    {
      gimple *stmt = DR_STMT (single_store);
      tree rhs = gimple_assign_rhs1 (stmt);
      if (const_with_all_bytes_same (rhs) == -1
	  && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
	      || (TYPE_MODE (TREE_TYPE (rhs))
		  != TYPE_MODE (unsigned_char_type_node))))
	return;
      if (TREE_CODE (rhs) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
	  && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
	return;
      if (!adjacent_dr_p (single_store)
	  || !dominated_by_p (CDI_DOMINATORS,
			      loop->latch, gimple_bb (stmt)))
	return;
      partition->kind = PKIND_MEMSET;
      partition->main_dr = single_store;
      partition->niter = nb_iter;
      partition->plus_one = plus_one;
    }
  else if (single_store && single_load)
    {
      gimple *store = DR_STMT (single_store);
      gimple *load = DR_STMT (single_load);
      /* Direct aggregate copy or via an SSA name temporary.  */
      if (load != store
	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
	return;
      if (!adjacent_dr_p (single_store)
	  || !adjacent_dr_p (single_load)
	  || !operand_equal_p (DR_STEP (single_store),
			       DR_STEP (single_load), 0)
	  || !dominated_by_p (CDI_DOMINATORS,
			      loop->latch, gimple_bb (store)))
	return;
      /* Now check that if there is a dependence this dependence is
	 of a suitable form for memmove.  */
      ddr_p ddr = get_data_dependence (rdg, single_load, single_store);
      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	return;

      if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
	{
	  if (DDR_NUM_DIST_VECTS (ddr) == 0)
	    return;

	  lambda_vector dist_v;
	  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
	    {
	      int dist = dist_v[index_in_loop_nest (loop->num,
						    DDR_LOOP_NEST (ddr))];
	      if (dist > 0 && !DDR_REVERSED_P (ddr))
		return;
	    }
	  partition->kind = PKIND_MEMMOVE;
	}
      else
	partition->kind = PKIND_MEMCPY;
      partition->main_dr = single_store;
      partition->secondary_dr = single_load;
      partition->niter = nb_iter;
      partition->plus_one = plus_one;
    }
}
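
/* For example, a partition containing only

     for (i = 0; i < n; ++i)
       a[i] = b[i];

   has exactly one load and one store, both adjacent and with equal
   steps, so it is classified as PKIND_MEMCPY; when the dependence
   between the two references only allows overlapping copies, the
   partition is classified as PKIND_MEMMOVE instead.  */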

/* Returns true when PARTITION1 and PARTITION2 access the same memory
   object in RDG.  */

static bool
share_memory_accesses (struct graph *rdg,
		       partition *partition1, partition *partition2)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2;

  /* First check whether there are any loads or stores in the intersection
     of the two partitions.  Common loads are the situation that happens
     most often.  */
  EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
    if (RDG_MEM_WRITE_STMT (rdg, i)
	|| RDG_MEM_READS_STMT (rdg, i))
      return true;

  /* Then check whether the two partitions access the same memory object.  */
  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
    {
      dr1 = datarefs_vec[i];

      if (!DR_BASE_ADDRESS (dr1)
	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
	continue;

      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
	{
	  dr2 = datarefs_vec[j];

	  if (!DR_BASE_ADDRESS (dr2)
	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
	    continue;

	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
	    return true;
	}
    }

  return false;
}

/* For each seed statement in STARTING_STMTS, this function builds a
   partition for it by adding dependent statements according to RDG.
   All partitions are recorded in PARTITIONS.  */

static void
rdg_build_partitions (struct graph *rdg,
		      vec<gimple *> starting_stmts,
		      vec<partition *> *partitions)
{
  auto_bitmap processed;
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
    {
      int v = rdg_vertex_for_stmt (rdg, stmt);

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "ldist asked to generate code for vertex %d\n", v);

      /* If the vertex is already contained in another partition so
	 is the partition rooted at it.  */
      if (bitmap_bit_p (processed, v))
	continue;

      partition *partition = build_rdg_partition_for_vertex (rdg, v);
      bitmap_ior_into (processed, partition->stmts);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "ldist creates useful %s partition:\n",
		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
	}

      partitions->safe_push (partition);
    }

  /* All vertices should have been assigned to at least one partition now,
     other than vertices belonging to dead code.  */
}

/* Dump to FILE the PARTITIONS.  */

static void
dump_rdg_partitions (FILE *file, vec<partition *> partitions)
{
  int i;
  partition *partition;

  FOR_EACH_VEC_ELT (partitions, i, partition)
    debug_bitmap_file (file, partition->stmts);
}

/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (vec<partition *> );

DEBUG_FUNCTION void
debug_rdg_partitions (vec<partition *> partitions)
{
  dump_rdg_partitions (stderr, partitions);
}

/* Returns the number of read and write operations in the RDG.  */

static int
number_of_rw_in_rdg (struct graph *rdg)
{
  int i, res = 0;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
	++res;

      if (RDG_MEM_READS_STMT (rdg, i))
	++res;
    }

  return res;
}

/* Returns the number of read and write operations in a PARTITION of
   the RDG.  */

static int
number_of_rw_in_partition (struct graph *rdg, partition *partition)
{
  int res = 0;
  unsigned i;
  bitmap_iterator ii;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
	++res;

      if (RDG_MEM_READS_STMT (rdg, i))
	++res;
    }

  return res;
}

/* Returns true when one of the PARTITIONS contains all the read or
   write operations of RDG.  */

static bool
partition_contains_all_rw (struct graph *rdg,
			   vec<partition *> partitions)
{
  int i;
  partition *partition;
  int nrw = number_of_rw_in_rdg (rdg);

  FOR_EACH_VEC_ELT (partitions, i, partition)
    if (nrw == number_of_rw_in_partition (rdg, partition))
      return true;

  return false;
}

/* Compute the partition dependence created by the data references in DRS1
   and DRS2, and modify and return DIR according to that.  If ALIAS_DDRS is
   not NULL, we record dependences introduced by possible aliasing between
   two data references in ALIAS_DDRS; otherwise, we simply ignore such
   dependences as if they didn't exist at all.  */

static int
pg_add_dependence_edges (struct graph *rdg, int dir,
			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2, saved_dr1;

  /* dependence direction - 0 is no dependence, -1 is back,
     1 is forth, 2 is both (we can stop then, merging will occur).  */
  EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
    {
      dr1 = datarefs_vec[i];

      EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
	{
	  int res, this_dir = 1;
	  ddr_p ddr;

	  dr2 = datarefs_vec[j];

	  /* Skip all <read, read> data dependences.  */
	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
	    continue;

	  saved_dr1 = dr1;
	  /* Re-shuffle data-refs to be in topological order.  */
	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
	    {
	      std::swap (dr1, dr2);
	      this_dir = -this_dir;
	    }
	  ddr = get_data_dependence (rdg, dr1, dr2);
	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	    {
	      this_dir = 0;
	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
					   DR_BASE_ADDRESS (dr2));
	      /* Be conservative.  If the data references are not well
		 analyzed, or the two data references have the same base
		 address and offset, add the dependence and consider the
		 two references as aliasing each other.  In other words,
		 the dependence cannot be resolved by a runtime alias
		 check.  */
	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
		  || !DR_INIT (dr1) || !DR_INIT (dr2)
		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
		  || res == 0)
		this_dir = 2;
	      /* If the data dependence can be resolved by a runtime alias
		 check, record it in ALIAS_DDRS.  */
	      else if (alias_ddrs != NULL)
		alias_ddrs->safe_push (ddr);
	      /* Or simply ignore it.  */
	    }
	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
	    {
	      if (DDR_REVERSED_P (ddr))
		this_dir = -this_dir;

	      /* Known dependences can still be unordered throughout the
		 iteration space, see gcc.dg/tree-ssa/ldist-16.c.  */
	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
		this_dir = 2;
	      /* If the overlap is exact preserve stmt order.  */
	      else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
		;
	      /* Else as the distance vector is lexicographic positive swap
		 the dependence direction.  */
	      else
		this_dir = -this_dir;
	    }
	  else
	    this_dir = 0;
	  if (this_dir == 2)
	    return 2;
	  else if (dir == 0)
	    dir = this_dir;
	  else if (this_dir != 0 && dir != this_dir)
	    return 2;
	  /* Shuffle "back" dr1.  */
	  dr1 = saved_dr1;
	}
    }

  return dir;
}
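
/* As a small worked example of the direction encoding: in

     for (i = 1; i < n; ++i)
       {
	 a[i] = b[i];       (partition 1)
	 c[i] = a[i - 1];   (partition 2)
       }

   the only dependence between the two partitions is the loop-carried
   flow dependence from the store a[i] to the load a[i - 1].  Called with
   partition 1's and partition 2's data references, this function returns
   1 ("forth"), so build_partition_graph adds a "true" edge from
   partition 1 to partition 2 and the store loop is ordered before the
   load loop, exactly as in the example at the top of this file.  */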

/* Compare the postorder numbers of the partition graph vertices V1 and V2.  */

static int
pgcmp (const void *v1_, const void *v2_)
{
  const vertex *v1 = (const vertex *)v1_;
  const vertex *v2 = (const vertex *)v2_;
  return v2->post - v1->post;
}

/* Data attached to vertices of the partition dependence graph.  */
struct pg_vdata
{
  /* ID of the corresponding partition.  */
  int id;
  /* The partition.  */
  struct partition *partition;
};

/* Data attached to edges of the partition dependence graph.  */
struct pg_edata
{
  /* If the dependence edge can be resolved by a runtime alias check,
     this vector contains the data dependence relations for the runtime
     alias check.  On the other hand, if the dependence edge is introduced
     because of a data dependence known at compilation time, this vector
     contains nothing.  */
  vec<ddr_p> alias_ddrs;
};

/* Callback data for traversing edges in the graph.  */
struct pg_edge_callback_data
{
  /* Bitmap of strongly connected components that should be merged.  */
  bitmap sccs_to_merge;
  /* Array containing component information for all vertices.  */
  int *vertices_component;
  /* Vector to record all data dependence relations which are needed
     to break strongly connected components by runtime alias checks.  */
  vec<ddr_p> *alias_ddrs;
};

/* Initialize vertex data for partition dependence graph PG with
   PARTITIONS.  */

static void
init_partition_graph_vertices (struct graph *pg,
			       vec<struct partition *> *partitions)
{
  int i;
  partition *partition;
  struct pg_vdata *data;

  for (i = 0; partitions->iterate (i, &partition); ++i)
    {
      data = new pg_vdata;
      pg->vertices[i].data = data;
      data->id = i;
      data->partition = partition;
    }
}

/* Add edge <I, J> to partition dependence graph PG.  Attach a vector of
   data dependence relations to the edge if DDRS isn't NULL.  */

static void
add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
{
  struct graph_edge *e = add_edge (pg, i, j);

  /* If the edge is attached with data dependence relations, it means this
     dependence edge can be resolved by runtime alias checks.  */
  if (ddrs != NULL)
    {
      struct pg_edata *data = new pg_edata;

      gcc_assert (ddrs->length () > 0);
      e->data = data;
      data->alias_ddrs = vNULL;
      data->alias_ddrs.safe_splice (*ddrs);
    }
}

/* Callback function for the graph traversal algorithm.  It returns true
   if edge E should be skipped when traversing the graph.  */

static bool
pg_skip_alias_edge (struct graph_edge *e)
{
  struct pg_edata *data = (struct pg_edata *)e->data;
  return (data != NULL && data->alias_ddrs.length () > 0);
}

/* Callback function freeing data attached to edge E of the graph.  */

static void
free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
{
  if (e->data != NULL)
    {
      struct pg_edata *data = (struct pg_edata *)e->data;
      data->alias_ddrs.release ();
      delete data;
    }
}

/* Free data attached to the vertices of partition dependence graph PG.  */

static void
free_partition_graph_vdata (struct graph *pg)
{
  int i;
  struct pg_vdata *data;

  for (i = 0; i < pg->n_vertices; ++i)
    {
      data = (struct pg_vdata *)pg->vertices[i].data;
      delete data;
    }
}

/* Build and return a partition dependence graph for PARTITIONS.  RDG is
   the reduced dependence graph for the loop to be distributed.  If
   IGNORE_ALIAS_P is true, data dependences caused by possible aliasing
   between references are ignored, as if they didn't exist at all;
   otherwise all dependences are considered.  */
1872 static struct graph *
1873 build_partition_graph (struct graph *rdg,
1874 vec<struct partition *> *partitions,
1875 bool ignore_alias_p)
1877 int i, j;
1878 struct partition *partition1, *partition2;
1879 graph *pg = new_graph (partitions->length ());
1880 auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
1882 alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
1884 init_partition_graph_vertices (pg, partitions);
1886 for (i = 0; partitions->iterate (i, &partition1); ++i)
1888 for (j = i + 1; partitions->iterate (j, &partition2); ++j)
1890 /* dependence direction - 0 is no dependence, -1 is back,
1891 1 is forth, 2 is both (we can stop then, merging will occur). */
1892 int dir = 0;
1894 /* If the first partition has reduction, add back edge; if the
1895 second partition has reduction, add forth edge. This makes
1896 sure that reduction partition will be sorted as the last one. */
1897 if (partition_reduction_p (partition1))
1898 dir = -1;
1899 else if (partition_reduction_p (partition2))
1900 dir = 1;
1902 /* Cleanup the temporary vector. */
1903 alias_ddrs.truncate (0);
1905 dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
1906 partition2->datarefs, alias_ddrs_p);
1908 /* Add edge to partition graph if there exists dependence. There
1909 are two types of edges. One type edge is caused by compilation
1910 time known dependence, this type can not be resolved by runtime
1911 alias check. The other type can be resolved by runtime alias
1912 check. */
1913 if (dir == 1 || dir == 2
1914 || alias_ddrs.length () > 0)
1916 /* Attach data dependence relations to edge that can be resolved
1917 by runtime alias check. */
1918 bool alias_edge_p = (dir != 1 && dir != 2);
1919 add_partition_graph_edge (pg, i, j,
1920 (alias_edge_p) ? &alias_ddrs : NULL);
1922 if (dir == -1 || dir == 2
1923 || alias_ddrs.length () > 0)
1925 /* Attach data dependence relations to edge that can be resolved
1926 by runtime alias check. */
1927 bool alias_edge_p = (dir != -1 && dir != 2);
1928 add_partition_graph_edge (pg, j, i,
1929 (alias_edge_p) ? &alias_ddrs : NULL);
1933 return pg;
1936 /* Sort partitions in PG by post order and store them in PARTITIONS. */
1938 static void
1939 sort_partitions_by_post_order (struct graph *pg,
1940 vec<struct partition *> *partitions)
1942 int i;
1943 struct pg_vdata *data;
1945 /* Now order the remaining nodes in postorder. */
1946 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
1947 partitions->truncate (0);
1948 for (i = 0; i < pg->n_vertices; ++i)
1950 data = (struct pg_vdata *)pg->vertices[i].data;
1951 if (data->partition)
1952 partitions->safe_push (data->partition);
1956 /* Given reduced dependence graph RDG merge strong connected components
1957 of PARTITIONS. In this function, data dependence caused by possible
1958 alias between references is ignored, as if it doesn't exist at all. */
1960 static void
1961 merge_dep_scc_partitions (struct graph *rdg,
1962 vec<struct partition *> *partitions)
1964 struct partition *partition1, *partition2;
1965 struct pg_vdata *data;
1966 graph *pg = build_partition_graph (rdg, partitions, true);
1967 int i, j, num_sccs = graphds_scc (pg, NULL);
1969 /* A strongly connected component means a dependence cycle, so we
1970 cannot distribute its partitions; fuse them together instead. */
1971 if ((unsigned) num_sccs < partitions->length ())
1973 for (i = 0; i < num_sccs; ++i)
1975 for (j = 0; partitions->iterate (j, &partition1); ++j)
1976 if (pg->vertices[j].component == i)
1977 break;
1978 for (j = j + 1; partitions->iterate (j, &partition2); ++j)
1979 if (pg->vertices[j].component == i)
1981 partition_merge_into (NULL, partition1,
1982 partition2, FUSE_SAME_SCC);
1983 partition1->type = PTYPE_SEQUENTIAL;
1984 (*partitions)[j] = NULL;
1985 partition_free (partition2);
1986 data = (struct pg_vdata *)pg->vertices[j].data;
1987 data->partition = NULL;
1992 sort_partitions_by_post_order (pg, partitions);
1993 gcc_assert (partitions->length () == (unsigned)num_sccs);
1994 free_partition_graph_vdata (pg);
1995 free_graph (pg);
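/* For example, if partitions P0 and P1 depend on each other through
   compilation time known dependences, graphds_scc places them in one
   component and the loop above fuses P1 into P0:

   |   before: partitions = { P0, P1 }, components = { 0, 0 }
   |   after:  partitions = { P0+P1 }, forced to PTYPE_SEQUENTIAL
*/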
1998 /* Callback function for traversing edge E in graph G. DATA is private
1999 callback data. */
2001 static void
2002 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2004 int i, j, component;
2005 struct pg_edge_callback_data *cbdata;
2006 struct pg_edata *edata = (struct pg_edata *) e->data;
2008 /* If the edge doesn't have attached data dependences, it represents
2009 compilation time known dependences. This type of dependence cannot
2010 be resolved by a runtime alias check. */
2011 if (edata == NULL || edata->alias_ddrs.length () == 0)
2012 return;
2014 cbdata = (struct pg_edge_callback_data *) data;
2015 i = e->src;
2016 j = e->dest;
2017 component = cbdata->vertices_component[i];
2018 /* Vertices are topologically sorted according to compilation time
2019 known dependences, so we can break strongly connected components
2020 by removing edges of the opposite direction, i.e., edges pointing
2021 from a vertex with a smaller post number to a vertex with a bigger
2022 post number. */
2023 if (g->vertices[i].post < g->vertices[j].post
2024 /* We only need to remove edges connecting vertices in the same
2025 strongly connected component to break it. */
2026 && component == cbdata->vertices_component[j]
2027 /* Check if we want to break the strongly connected component or not. */
2028 && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2029 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
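/* As a concrete sketch: for an SCC {u, v} where u -> v is a "true"
   edge and v -> u exists only because of possible aliasing, the
   alias-skipping SCC pass yields post[u] > post[v].  Only the v -> u
   alias edge then satisfies the test above, so its DDRs are queued
   for a runtime check and the cycle can be broken.  */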
2032 /* This is the main function breaking strongly connected components in
2033 PARTITIONS, given reduced dependence graph RDG. Store data dependence
2034 relations for runtime alias checks in ALIAS_DDRS. */
2036 static void
2037 break_alias_scc_partitions (struct graph *rdg,
2038 vec<struct partition *> *partitions,
2039 vec<ddr_p> *alias_ddrs)
2041 int i, j, num_sccs, num_sccs_no_alias;
2042 /* Build partition dependence graph. */
2043 graph *pg = build_partition_graph (rdg, partitions, false);
2045 alias_ddrs->truncate (0);
2046 /* Find strong connected components in the graph, with all dependence edges
2047 considered. */
2048 num_sccs = graphds_scc (pg, NULL);
2049 /* All SCCs can now be broken by runtime alias checks, because SCCs caused
2050 by compilation time known dependences were merged before this function. */
2051 if ((unsigned) num_sccs < partitions->length ())
2053 struct pg_edge_callback_data cbdata;
2054 auto_bitmap sccs_to_merge;
2055 auto_vec<enum partition_type> scc_types;
2056 struct partition *partition, *first;
2058 /* If all partitions in an SCC have the same type, we can simply merge
2059 the SCC. This loop finds such SCCs and records them in a bitmap. */
2060 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2061 for (i = 0; i < num_sccs; ++i)
2063 for (j = 0; partitions->iterate (j, &first); ++j)
2064 if (pg->vertices[j].component == i)
2065 break;
2066 for (++j; partitions->iterate (j, &partition); ++j)
2068 if (pg->vertices[j].component != i)
2069 continue;
2071 if (first->type != partition->type)
2073 bitmap_clear_bit (sccs_to_merge, i);
2074 break;
2079 /* Initialize callback data for traversing. */
2080 cbdata.sccs_to_merge = sccs_to_merge;
2081 cbdata.alias_ddrs = alias_ddrs;
2082 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2083 /* Record the component information, which will be corrupted by the
2084 next SCC-finding call. */
2085 for (i = 0; i < pg->n_vertices; ++i)
2086 cbdata.vertices_component[i] = pg->vertices[i].component;
2088 /* Collect data dependences for runtime alias checks to break SCCs. */
2089 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2091 /* Run the SCC finding algorithm again, with alias dependence edges
2092 skipped. This topologically sorts partitions according to
2093 compilation time known dependences. Note the topological order
2094 is stored in the form of pg's post order numbers. */
2095 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2096 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2097 /* With the topological order, we can construct two subgraphs L and R.
2098 L contains edge <x, y> where x < y in terms of post order, while
2099 R contains edge <x, y> where x > y. Edges for compilation time
2100 known dependences all fall in R, so we break SCCs by removing all
2101 (alias) edges in subgraph L. */
2102 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2105 /* For each SCC that doesn't need to be broken, merge its partitions. */
2106 for (i = 0; i < num_sccs; ++i)
2108 if (!bitmap_bit_p (sccs_to_merge, i))
2109 continue;
2111 for (j = 0; partitions->iterate (j, &first); ++j)
2112 if (cbdata.vertices_component[j] == i)
2113 break;
2114 for (++j; partitions->iterate (j, &partition); ++j)
2116 struct pg_vdata *data;
2118 if (cbdata.vertices_component[j] != i)
2119 continue;
2121 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2122 (*partitions)[j] = NULL;
2123 partition_free (partition);
2124 data = (struct pg_vdata *)pg->vertices[j].data;
2125 gcc_assert (data->id == j);
2126 data->partition = NULL;
2131 sort_partitions_by_post_order (pg, partitions);
2132 free_partition_graph_vdata (pg);
2133 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2134 free_graph (pg);
2136 if (dump_file && (dump_flags & TDF_DETAILS))
2138 fprintf (dump_file, "Possible alias data dependence to break:\n");
2139 dump_data_dependence_relations (dump_file, *alias_ddrs);
2143 /* Compute and return an expression whose value is the segment length which
2144 will be accessed by DR in NITERS iterations. */
2146 static tree
2147 data_ref_segment_size (struct data_reference *dr, tree niters)
2149 tree segment_length;
2151 if (integer_zerop (DR_STEP (dr)))
2152 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2153 else
2154 segment_length = size_binop (MULT_EXPR,
2155 fold_convert (sizetype, DR_STEP (dr)),
2156 fold_convert (sizetype, niters));
2158 return segment_length;
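/* E.g., for a reference with a DR_STEP of 4 bytes and NITERS of 100
   the segment length folds to 4 * 100 = 400 bytes, while a loop
   invariant reference (zero step) only spans the size of one
   element.  */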
2161 /* Return true if LOOP's latch is dominated by statement for data reference
2162 DR. */
2164 static inline bool
2165 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2167 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2168 gimple_bb (DR_STMT (dr)));
2171 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2172 data dependence relations ALIAS_DDRS. */
2174 static void
2175 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2176 vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2178 unsigned int i;
2179 unsigned HOST_WIDE_INT factor = 1;
2180 tree niters_plus_one, niters = number_of_latch_executions (loop);
2182 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2183 niters = fold_convert (sizetype, niters);
2184 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2186 if (dump_file && (dump_flags & TDF_DETAILS))
2187 fprintf (dump_file, "Creating alias check pairs:\n");
2189 /* Iterate all data dependence relations and compute alias check pairs. */
2190 for (i = 0; i < alias_ddrs->length (); i++)
2192 ddr_p ddr = (*alias_ddrs)[i];
2193 struct data_reference *dr_a = DDR_A (ddr);
2194 struct data_reference *dr_b = DDR_B (ddr);
2195 tree seg_length_a, seg_length_b;
2196 int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2197 DR_BASE_ADDRESS (dr_b));
2199 if (comp_res == 0)
2200 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2201 gcc_assert (comp_res != 0);
2203 if (latch_dominated_by_data_ref (loop, dr_a))
2204 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2205 else
2206 seg_length_a = data_ref_segment_size (dr_a, niters);
2208 if (latch_dominated_by_data_ref (loop, dr_b))
2209 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2210 else
2211 seg_length_b = data_ref_segment_size (dr_b, niters);
2213 dr_with_seg_len_pair_t dr_with_seg_len_pair
2214 (dr_with_seg_len (dr_a, seg_length_a),
2215 dr_with_seg_len (dr_b, seg_length_b));
2217 /* Canonicalize pairs by sorting the two DR members. */
2218 if (comp_res > 0)
2219 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2221 comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2224 if (tree_fits_uhwi_p (niters))
2225 factor = tree_to_uhwi (niters);
2227 /* Prune alias check pairs. */
2228 prune_runtime_alias_test_list (comp_alias_pairs, factor);
2229 if (dump_file && (dump_flags & TDF_DETAILS))
2230 fprintf (dump_file,
2231 "Improved number of alias checks from %d to %d\n",
2232 alias_ddrs->length (), comp_alias_pairs->length ());
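/* Note the NITERS versus NITERS + 1 distinction above: a reference
   whose statement dominates the latch also executes in the final
   iteration entered before the exit test fails, hence the extra
   iteration in its segment size.  The collected pairs later expand
   into disjointness tests; a minimal sketch of one such test, with
   hypothetical addresses and segment lengths, is:

   | bool disjoint_p = (addr_a + seg_len_a <= addr_b
   |                    || addr_b + seg_len_b <= addr_a);

   The real expansion in create_runtime_alias_checks also accounts
   for alignment and the direction of DR_STEP.  */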
2235 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2236 checks and version LOOP under condition of these runtime alias checks. */
2238 static void
2239 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2241 profile_probability prob;
2242 basic_block cond_bb;
2243 struct loop *nloop;
2244 tree lhs, arg0, cond_expr = NULL_TREE;
2245 gimple_seq cond_stmts = NULL;
2246 gimple *call_stmt = NULL;
2247 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2249 /* Generate code for runtime alias checks if necessary. */
2250 gcc_assert (alias_ddrs->length () > 0);
2252 if (dump_file && (dump_flags & TDF_DETAILS))
2253 fprintf (dump_file,
2254 "Version loop <%d> with runtime alias check\n", loop->num);
2256 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2257 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2258 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2259 is_gimple_val, NULL_TREE);
2261 /* Rely on the vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2262 if (flag_tree_loop_vectorize)
2264 /* Generate internal function call for loop distribution alias check. */
2265 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2266 2, NULL_TREE, cond_expr);
2267 lhs = make_ssa_name (boolean_type_node);
2268 gimple_call_set_lhs (call_stmt, lhs);
2270 else
2271 lhs = cond_expr;
2273 prob = profile_probability::guessed_always ().apply_scale (9, 10);
2274 initialize_original_copy_tables ();
2275 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2276 prob, prob.invert (), true);
2277 free_original_copy_tables ();
2278 /* Record the original loop number in newly generated loops. In case of
2279 distribution, the original loop will be distributed and the new loop
2280 is kept. */
2281 loop->orig_loop_num = nloop->num;
2282 nloop->orig_loop_num = nloop->num;
2283 nloop->dont_vectorize = true;
2284 nloop->force_vectorize = false;
2286 if (call_stmt)
2288 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2289 loop could be destroyed. */
2290 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2291 gimple_call_set_arg (call_stmt, 0, arg0);
2292 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2295 if (cond_stmts)
2297 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2298 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2300 update_ssa (TODO_update_ssa);
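/* After versioning, the generated code is shaped roughly like:

   |   if (IFN_LOOP_DIST_ALIAS (orig_loop_num, <no-alias cond>))
   |     <LOOP>    // the version that will be distributed
   |   else
   |     <NLOOP>   // kept as is, with dont_vectorize set

   where the IFN call is replaced by the plain condition when the
   vectorizer is not enabled.  */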
2303 /* Return true if loop versioning is needed to distribute PARTITIONS.
2304 ALIAS_DDRS are data dependence relations for the runtime alias check. */
2306 static inline bool
2307 version_for_distribution_p (vec<struct partition *> *partitions,
2308 vec<ddr_p> *alias_ddrs)
2310 /* No need to version loop if we have only one partition. */
2311 if (partitions->length () == 1)
2312 return false;
2314 /* Need to version loop if runtime alias check is necessary. */
2315 return (alias_ddrs->length () > 0);
2318 /* Fuse all partitions if necessary before finalizing distribution. */
2320 static void
2321 finalize_partitions (vec<struct partition *> *partitions,
2322 vec<ddr_p> *alias_ddrs)
2324 unsigned i;
2325 struct partition *a, *partition;
2327 if (partitions->length () == 1
2328 || alias_ddrs->length () > 0)
2329 return;
2331 a = (*partitions)[0];
2332 if (a->kind != PKIND_NORMAL)
2333 return;
2335 for (i = 1; partitions->iterate (i, &partition); ++i)
2337 /* Don't fuse if a partition has a different type or is a builtin. */
2338 if (partition->type != a->type
2339 || partition->kind != PKIND_NORMAL)
2340 return;
2343 /* Fuse all partitions. */
2344 for (i = 1; partitions->iterate (i, &partition); ++i)
2346 partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2347 partition_free (partition);
2349 partitions->truncate (1);
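/* E.g., three remaining PKIND_NORMAL partitions of the same type are
   fused back into one here, since distributing them would add loop
   overhead without exposing a builtin or changing parallelism.  */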
2352 /* Distributes the code from LOOP in such a way that producer statements
2353 are placed before consumer statements. Tries to separate only the
2354 statements from STMTS into separate loops. Returns the number of
2355 distributed loops. Set NB_CALLS to number of generated builtin calls.
2356 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2358 static int
2359 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2360 control_dependences *cd, int *nb_calls, bool *destroy_p)
2362 ddrs_table = new hash_table<ddr_hasher> (389);
2363 struct graph *rdg;
2364 partition *partition;
2365 bool any_builtin;
2366 int i, nbp;
2368 *destroy_p = false;
2369 *nb_calls = 0;
2370 loop_nest.create (0);
2371 if (!find_loop_nest (loop, &loop_nest))
2373 loop_nest.release ();
2374 delete ddrs_table;
2375 return 0;
2378 datarefs_vec.create (20);
2379 rdg = build_rdg (loop, cd);
2380 if (!rdg)
2382 if (dump_file && (dump_flags & TDF_DETAILS))
2383 fprintf (dump_file,
2384 "Loop %d not distributed: failed to build the RDG.\n",
2385 loop->num);
2387 loop_nest.release ();
2388 free_data_refs (datarefs_vec);
2389 delete ddrs_table;
2390 return 0;
2393 if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2395 if (dump_file && (dump_flags & TDF_DETAILS))
2396 fprintf (dump_file,
2397 "Loop %d not distributed: too many memory references.\n",
2398 loop->num);
2400 free_rdg (rdg);
2401 loop_nest.release ();
2402 free_data_refs (datarefs_vec);
2403 delete ddrs_table;
2404 return 0;
2407 data_reference_p dref;
2408 for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2409 dref->aux = (void *) (uintptr_t) i;
2411 if (dump_file && (dump_flags & TDF_DETAILS))
2412 dump_rdg (dump_file, rdg);
2414 auto_vec<struct partition *, 3> partitions;
2415 rdg_build_partitions (rdg, stmts, &partitions);
2417 auto_vec<ddr_p> alias_ddrs;
2419 auto_bitmap stmt_in_all_partitions;
2420 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2421 for (i = 1; partitions.iterate (i, &partition); ++i)
2422 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2424 any_builtin = false;
2425 FOR_EACH_VEC_ELT (partitions, i, partition)
2427 classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2428 any_builtin |= partition_builtin_p (partition);
2431 /* If we are only distributing patterns but did not detect any,
2432 simply bail out. */
2433 if (!flag_tree_loop_distribution
2434 && !any_builtin)
2436 nbp = 0;
2437 goto ldist_done;
2440 /* If we are only distributing patterns, fuse all partitions that
2441 were not classified as builtins. This also avoids chopping
2442 a loop into pieces separated by builtin calls, i.e., we want
2443 either no loop body or a single one remaining. */
2444 struct partition *into;
2445 if (!flag_tree_loop_distribution)
2447 for (i = 0; partitions.iterate (i, &into); ++i)
2448 if (!partition_builtin_p (into))
2449 break;
2450 for (++i; partitions.iterate (i, &partition); ++i)
2451 if (!partition_builtin_p (partition))
2453 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2454 partitions.unordered_remove (i);
2455 partition_free (partition);
2456 i--;
2460 /* Due to limitations in the transform phase we have to fuse all
2461 reduction partitions into the last partition so the existing
2462 loop will contain all loop-closed PHI nodes. */
2463 for (i = 0; partitions.iterate (i, &into); ++i)
2464 if (partition_reduction_p (into))
2465 break;
2466 for (i = i + 1; partitions.iterate (i, &partition); ++i)
2467 if (partition_reduction_p (partition))
2469 partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2470 partitions.unordered_remove (i);
2471 partition_free (partition);
2472 i--;
2475 /* Apply our simple cost model - fuse partitions with similar
2476 memory accesses. */
2477 for (i = 0; partitions.iterate (i, &into); ++i)
2479 bool changed = false;
2480 if (partition_builtin_p (into))
2481 continue;
2482 for (int j = i + 1;
2483 partitions.iterate (j, &partition); ++j)
2485 if (share_memory_accesses (rdg, into, partition))
2487 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2488 partitions.unordered_remove (j);
2489 partition_free (partition);
2490 j--;
2491 changed = true;
2494 /* Suppose partitions 0 and 2 share accesses but 0 and 1 do not:
2495 the scan above fuses 2 into 0, giving 0,2 1. If 1 and 2 also
2496 share accesses, we would now fail to consider merging 1 into
2497 0,2, so rescan from I whenever anything was merged into it. */
2498 if (changed)
2499 i--;
2502 /* Build the partition dependency graph. */
2503 if (partitions.length () > 1)
2505 merge_dep_scc_partitions (rdg, &partitions);
2506 alias_ddrs.truncate (0);
2507 if (partitions.length () > 1)
2508 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2511 finalize_partitions (&partitions, &alias_ddrs);
2513 nbp = partitions.length ();
2514 if (nbp == 0
2515 || (nbp == 1 && !partition_builtin_p (partitions[0]))
2516 || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2518 nbp = 0;
2519 goto ldist_done;
2522 if (version_for_distribution_p (&partitions, &alias_ddrs))
2523 version_loop_by_alias_check (loop, &alias_ddrs);
2525 if (dump_file && (dump_flags & TDF_DETAILS))
2527 fprintf (dump_file,
2528 "distribute loop <%d> into partitions:\n", loop->num);
2529 dump_rdg_partitions (dump_file, partitions);
2532 FOR_EACH_VEC_ELT (partitions, i, partition)
2534 if (partition_builtin_p (partition))
2535 (*nb_calls)++;
2536 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2539 ldist_done:
2540 loop_nest.release ();
2541 free_data_refs (datarefs_vec);
2542 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2543 iter != ddrs_table->end (); ++iter)
2545 free_dependence_relation (*iter);
2546 *iter = NULL;
2548 delete ddrs_table;
2550 FOR_EACH_VEC_ELT (partitions, i, partition)
2551 partition_free (partition);
2553 free_rdg (rdg);
2554 return nbp - *nb_calls;
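/* As a C-level illustration (hypothetical input, not a test case),
   a loop like

   | for (i = 0; i < n; i++)
   |   {
   |     a[i] = 0;       // classified as a memset builtin
   |     sum += b[i];    // reduction, fused into the last partition
   |   }

   can be distributed into a memset call followed by the reduction
   loop, provided the analysis above finds no blocking dependence.  */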
2557 /* Distribute all loops in the current function. */
2559 namespace {
2561 const pass_data pass_data_loop_distribution =
2563 GIMPLE_PASS, /* type */
2564 "ldist", /* name */
2565 OPTGROUP_LOOP, /* optinfo_flags */
2566 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2567 ( PROP_cfg | PROP_ssa ), /* properties_required */
2568 0, /* properties_provided */
2569 0, /* properties_destroyed */
2570 0, /* todo_flags_start */
2571 0, /* todo_flags_finish */
2574 class pass_loop_distribution : public gimple_opt_pass
2576 public:
2577 pass_loop_distribution (gcc::context *ctxt)
2578 : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2581 /* opt_pass methods: */
2582 virtual bool gate (function *)
2584 return flag_tree_loop_distribution
2585 || flag_tree_loop_distribute_patterns;
2588 virtual unsigned int execute (function *);
2590 }; // class pass_loop_distribution
2592 unsigned int
2593 pass_loop_distribution::execute (function *fun)
2595 struct loop *loop;
2596 bool changed = false;
2597 basic_block bb;
2598 control_dependences *cd = NULL;
2599 auto_vec<loop_p> loops_to_be_destroyed;
2601 if (number_of_loops (fun) <= 1)
2602 return 0;
2604 /* Compute topological order for basic blocks. Topological order is
2605 needed because data dependence is computed for data references in
2606 lexicographical order. */
2607 if (bb_top_order_index == NULL)
2609 int rpo_num;
2610 int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
2612 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
2613 bb_top_order_index_size = last_basic_block_for_fn (cfun);
2614 rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
2615 for (int i = 0; i < rpo_num; i++)
2616 bb_top_order_index[rpo[i]] = i;
2618 free (rpo);
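/* E.g., for a diamond CFG entry -> {then, else} -> join, one reverse
   post order is entry, then, else, join, so bb_top_order_index maps
   those blocks to 0, 1, 2 and 3 respectively, giving the
   lexicographical order the dependence computation relies on.  */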
2621 FOR_ALL_BB_FN (bb, fun)
2623 gimple_stmt_iterator gsi;
2624 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2625 gimple_set_uid (gsi_stmt (gsi), -1);
2626 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2627 gimple_set_uid (gsi_stmt (gsi), -1);
2630 /* We can at the moment only distribute non-nested loops, thus restrict
2631 walking to innermost loops. */
2632 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
2634 auto_vec<gimple *> work_list;
2635 basic_block *bbs;
2636 int num = loop->num;
2637 unsigned int i;
2639 /* If the loop doesn't have a single exit we will fail anyway,
2640 so check that early. */
2641 if (!single_exit (loop))
2642 continue;
2644 /* Only optimize hot loops. */
2645 if (!optimize_loop_for_speed_p (loop))
2646 continue;
2648 /* Don't distribute loop if niters is unknown. */
2649 tree niters = number_of_latch_executions (loop);
2650 if (niters == NULL_TREE || niters == chrec_dont_know)
2651 continue;
2653 /* Initialize the worklist with stmts we seed the partitions with. */
2654 bbs = get_loop_body_in_dom_order (loop);
2655 for (i = 0; i < loop->num_nodes; ++i)
2657 for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2658 !gsi_end_p (gsi);
2659 gsi_next (&gsi))
2661 gphi *phi = gsi.phi ();
2662 if (virtual_operand_p (gimple_phi_result (phi)))
2663 continue;
2664 /* Distribute stmts which have defs that are used outside of
2665 the loop. */
2666 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
2667 continue;
2668 work_list.safe_push (phi);
2670 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2671 !gsi_end_p (gsi);
2672 gsi_next (&gsi))
2674 gimple *stmt = gsi_stmt (gsi);
2676 /* If there is a stmt with side-effects bail out - we
2677 cannot and should not distribute this loop. */
2678 if (gimple_has_side_effects (stmt))
2680 work_list.truncate (0);
2681 goto out;
2684 /* Distribute stmts which have defs that are used outside of
2685 the loop. */
2686 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
2688 /* Otherwise only distribute stores for now. */
2689 else if (!gimple_vdef (stmt))
2690 continue;
2692 work_list.safe_push (stmt);
2695 out:
2696 free (bbs);
2698 int nb_generated_loops = 0;
2699 int nb_generated_calls = 0;
2700 location_t loc = find_loop_location (loop);
2701 if (work_list.length () > 0)
2703 if (!cd)
2705 calculate_dominance_info (CDI_DOMINATORS);
2706 calculate_dominance_info (CDI_POST_DOMINATORS);
2707 cd = new control_dependences ();
2708 free_dominance_info (CDI_POST_DOMINATORS);
2710 bool destroy_p;
2711 nb_generated_loops = distribute_loop (loop, work_list, cd,
2712 &nb_generated_calls,
2713 &destroy_p);
2714 if (destroy_p)
2715 loops_to_be_destroyed.safe_push (loop);
2718 if (nb_generated_loops + nb_generated_calls > 0)
2720 changed = true;
2721 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
2722 loc, "Loop %d distributed: split to %d loops "
2723 "and %d library calls.\n",
2724 num, nb_generated_loops, nb_generated_calls);
2726 else if (dump_file && (dump_flags & TDF_DETAILS))
2727 fprintf (dump_file, "Loop %d is the same.\n", num);
2730 if (cd)
2731 delete cd;
2733 if (bb_top_order_index != NULL)
2735 free (bb_top_order_index);
2736 bb_top_order_index = NULL;
2737 bb_top_order_index_size = 0;
2740 if (changed)
2742 /* Destroy loop bodies that could not be reused. Do this late as we
2743 otherwise can end up referring to stale data in control dependences. */
2744 unsigned i;
2745 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
2746 destroy_loop (loop);
2748 /* Cached scalar evolutions now may refer to wrong or non-existing
2749 loops. */
2750 scev_reset_htab ();
2751 mark_virtual_operands_for_renaming (fun);
2752 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
2755 checking_verify_loop_structure ();
2757 return 0;
2760 } // anon namespace
2762 gimple_opt_pass *
2763 make_pass_loop_distribution (gcc::context *ctxt)
2765 return new pass_loop_distribution (ctxt);