/* Loop distribution.
   Copyright (C) 2006-2017 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |    A(I) = B(I) + C
   |    D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |    A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |    D(I) = A(I-1)*E
   |ENDDO

   Loop distribution is the dual of loop fusion.  It separates statements
   of a loop (or loop nest) into multiple loops (or loop nests) with the
   same loop header.  The major goal is to separate statements which may
   be vectorized from those that can't.  This pass implements distribution
   in the following steps:

     1) Seed partitions with specific types of statements.  For now we
        support two types of seed statements: statements defining a
        variable used outside of the loop, and statements storing to
        memory.
     2) Build a reduced dependence graph (RDG) for the loop to be
        distributed.  The vertices (RDG:V) model all statements in the
        loop and the edges (RDG:E) model flow and control dependencies
        between statements.
     3) Apart from the RDG, compute data dependencies between memory
        references.
     4) Starting from a seed statement, build up a partition by adding
        dependent statements according to the RDG's dependence
        information.  A partition is classified as parallel if it can be
        executed in parallel, or as sequential if it can't.  A parallel
        partition is further classified as a specific builtin kind if it
        can be implemented by builtin function calls.
     5) Build a partition dependence graph (PG) based on data
        dependencies.  The vertices (PG:V) model all partitions and the
        edges (PG:E) model all data dependencies between every pair of
        partitions.  In general, a data dependence is either known or
        unknown at compilation time.  In C family languages, quite a few
        dependencies are unknown at compilation time because of possible
        aliasing between data references.  We categorize PG's edges into
        two types: "true" edges, which represent data dependencies known
        at compilation time, and "alias" edges for all other data
        dependencies.
     6) Traverse the subgraph of the PG as if all "alias" edges didn't
        exist.  Merge the partitions in each strongly connected component
        (SCC) correspondingly.  Build a new PG for the merged partitions.
     7) Traverse the PG again, this time with both "true" and "alias"
        edges included.  We try to break SCCs by removing some edges.
        Because SCCs formed by "true" edges were all fused in step 6), we
        can break the remaining SCCs by removing some "alias" edges.
        It's NP-hard to choose an optimal edge set; fortunately a simple
        approximation is good enough for us given the small problem scale.
     8) Collect all data dependencies of the removed "alias" edges.
        Create runtime alias checks for the collected data dependencies.
     9) Version the loop under the condition of the runtime alias checks.
        Since loop distribution generally introduces additional overhead,
        it is only useful if vectorization can be achieved in the
        distributed loops.  We version the loop with the internal function
        call IFN_LOOP_DIST_ALIAS.  If no distributed loop can be
        vectorized, we simply remove the distributed loops and fall back
        to the original one.

   TODO:
     1) We only distribute innermost loops now.  This pass should handle
        loop nests in the future.
     2) We only fuse partitions in an SCC now.  A better fusion algorithm
        is desired to minimize loop overhead, maximize parallelism and
        maximize data reuse.  */
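
/* As an illustrative sketch (not part of the original sources), with
   -ftree-loop-distribute-patterns a C loop such as

     void
     f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
	 {
	   a[i] = 0;
	   b[i] = b[i] + 1;
	 }
     }

   may be distributed so that the partition seeded by the store to A
   becomes a single call to memset while the partition for B remains a
   loop, with the result guarded by a runtime alias check between A and B.
   Whether the split actually happens depends on the dependence and
   profitability checks implemented below.  */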

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "gimple-pretty-print.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "params.h"
#include "tree-vectorizer.h"


#define MAX_DATAREFS_NUM \
	((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))

/* Hashtable helpers.  */

struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
{
  static inline hashval_t hash (const data_dependence_relation *);
  static inline bool equal (const data_dependence_relation *,
			    const data_dependence_relation *);
};

/* Hash function for data dependence.  */

inline hashval_t
ddr_hasher::hash (const data_dependence_relation *ddr)
{
  inchash::hash h;
  h.add_ptr (DDR_A (ddr));
  h.add_ptr (DDR_B (ddr));
  return h.end ();
}

/* Hash table equality function for data dependence.  */

inline bool
ddr_hasher::equal (const data_dependence_relation *ddr1,
		   const data_dependence_relation *ddr2)
{
  return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
}

/* The loop (nest) to be distributed.  */
static vec<loop_p> loop_nest;

/* Vector of data references in the loop to be distributed.  */
static vec<data_reference_p> datarefs_vec;

/* Store index of data reference in aux field.  */
#define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)

/* Hash table for data dependence relation in the loop to be distributed.  */
static hash_table<ddr_hasher> *ddrs_table;

/* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
struct rdg_vertex
{
  /* The statement represented by this vertex.  */
  gimple *stmt;

  /* Vector of data-references in this statement.  */
  vec<data_reference_p> datarefs;

  /* True when the statement contains a write to memory.  */
  bool has_mem_write;

  /* True when the statement contains a read from memory.  */
  bool has_mem_reads;
};

#define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
#define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
#define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))

/* Data dependence type.  */

enum rdg_dep_type
{
  /* Read After Write (RAW).  */
  flow_dd = 'f',

  /* Control dependence (execute conditional on).  */
  control_dd = 'c'
};

/* Dependence information attached to an edge of the RDG.  */

struct rdg_edge
{
  /* Type of the dependence.  */
  enum rdg_dep_type type;
};

#define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type

/* Dump vertex I in RDG to FILE.  */

static void
dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
{
  struct vertex *v = &(rdg->vertices[i]);
  struct graph_edge *e;

  fprintf (file, "(vertex %d: (%s%s) (in:", i,
	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      fprintf (file, " %d", e->src);

  fprintf (file, ") (out:");

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      fprintf (file, " %d", e->dest);

  fprintf (file, ")\n");
  print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
  fprintf (file, ")\n");
}

/* Call dump_rdg_vertex on stderr.  */

DEBUG_FUNCTION void
debug_rdg_vertex (struct graph *rdg, int i)
{
  dump_rdg_vertex (stderr, rdg, i);
}

/* Dump the reduced dependence graph RDG to FILE.  */

static void
dump_rdg (FILE *file, struct graph *rdg)
{
  fprintf (file, "(rdg\n");
  for (int i = 0; i < rdg->n_vertices; i++)
    dump_rdg_vertex (file, rdg, i);
  fprintf (file, ")\n");
}

/* Call dump_rdg on stderr.  */

DEBUG_FUNCTION void
debug_rdg (struct graph *rdg)
{
  dump_rdg (stderr, rdg);
}

static void
dot_rdg_1 (FILE *file, struct graph *rdg)
{
  int i;
  pretty_printer buffer;
  pp_needs_newline (&buffer) = false;
  buffer.buffer->stream = file;

  fprintf (file, "digraph RDG {\n");

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      fprintf (file, "%d [label=\"[%d] ", i, i);
      pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
      pp_flush (&buffer);
      fprintf (file, "\"]\n");

      /* Highlight reads from memory.  */
      if (RDG_MEM_READS_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=green]\n", i);

      /* Highlight stores to memory.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=red]\n", i);

      if (v->succ)
	for (e = v->succ; e; e = e->succ_next)
	  switch (RDGE_TYPE (e))
	    {
	    case flow_dd:
	      /* These are the most common dependences: don't print these.  */
	      fprintf (file, "%d -> %d \n", i, e->dest);
	      break;

	    case control_dd:
	      fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
	      break;

	    default:
	      gcc_unreachable ();
	    }
    }

  fprintf (file, "}\n\n");
}

/* Display the Reduced Dependence Graph using dotty.  */

DEBUG_FUNCTION void
dot_rdg (struct graph *rdg)
{
  /* When debugging, you may want to enable the following code.  */
#ifdef HAVE_POPEN
  FILE *file = popen ("dot -Tx11", "w");
  if (!file)
    return;
  dot_rdg_1 (file, rdg);
  fflush (file);
  close (fileno (file));
  pclose (file);
#else
  dot_rdg_1 (stderr, rdg);
#endif
}

/* Returns the index of STMT in RDG.  */

static int
rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
{
  int index = gimple_uid (stmt);
  gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
  return index;
}

/* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
   the index of DEF in RDG.  */

static void
create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
{
  use_operand_p imm_use_p;
  imm_use_iterator iterator;

  FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
    {
      struct graph_edge *e;
      int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));

      if (use < 0)
	continue;

      e = add_edge (rdg, idef, use);
      e->data = XNEW (struct rdg_edge);
      RDGE_TYPE (e) = flow_dd;
    }
}

/* Creates an edge for the control dependences of BB to the vertex V.  */

static void
create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
				    int v, control_dependences *cd)
{
  bitmap_iterator bi;
  unsigned edge_n;
  EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
			    0, edge_n, bi)
    {
      basic_block cond_bb = cd->get_edge_src (edge_n);
      gimple *stmt = last_stmt (cond_bb);
      if (stmt && is_ctrl_stmt (stmt))
	{
	  struct graph_edge *e;
	  int c = rdg_vertex_for_stmt (rdg, stmt);
	  if (c < 0)
	    continue;

	  e = add_edge (rdg, c, v);
	  e->data = XNEW (struct rdg_edge);
	  RDGE_TYPE (e) = control_dd;
	}
    }
}

/* Creates the flow dependence edges of the reduced dependence graph RDG.  */

static void
create_rdg_flow_edges (struct graph *rdg)
{
  int i;
  def_operand_p def_p;
  ssa_op_iter iter;

  for (i = 0; i < rdg->n_vertices; i++)
    FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
			      iter, SSA_OP_DEF)
      create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
}

/* Creates the control dependence edges of the reduced dependence graph
   RDG.  */

static void
create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  edge_iterator ei;
	  edge e;
	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
	    if (flow_bb_inside_loop_p (loop, e->src))
	      create_edge_for_control_dependence (rdg, e->src, i, cd);
	}
      else
	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
    }
}

/* Build the vertices of the reduced dependence graph RDG.  Return false
   if that failed.  */

static bool
create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
{
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      struct vertex *v = &(rdg->vertices[i]);

      /* Record statement to vertex mapping.  */
      gimple_set_uid (stmt, i);

      v->data = XNEW (struct rdg_vertex);
      RDGV_STMT (v) = stmt;
      RDGV_DATAREFS (v).create (0);
      RDGV_HAS_MEM_WRITE (v) = false;
      RDGV_HAS_MEM_READS (v) = false;
      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      unsigned drp = datarefs_vec.length ();
      if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
	return false;
      for (unsigned j = drp; j < datarefs_vec.length (); ++j)
	{
	  data_reference_p dr = datarefs_vec[j];
	  if (DR_IS_READ (dr))
	    RDGV_HAS_MEM_READS (v) = true;
	  else
	    RDGV_HAS_MEM_WRITE (v) = true;
	  RDGV_DATAREFS (v).safe_push (dr);
	}
    }
  return true;
}

/* Array mapping basic block's index to its topological order.  */
static int *bb_top_order_index;
/* And size of the array.  */
static int bb_top_order_index_size;

/* If X has a smaller topological sort number than Y, returns -1;
   if greater, returns 1.  */
static int
bb_top_order_cmp (const void *x, const void *y)
{
  basic_block bb1 = *(const basic_block *) x;
  basic_block bb2 = *(const basic_block *) y;

  gcc_assert (bb1->index < bb_top_order_index_size
	      && bb2->index < bb_top_order_index_size);
  gcc_assert (bb1 == bb2
	      || bb_top_order_index[bb1->index]
		 != bb_top_order_index[bb2->index]);

  return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
}

/* Initialize STMTS with all the statements of LOOP.  We use topological
   order to discover all statements.  The order is important because
   generate_loops_for_partition is using the same traversal for identifying
   statements in loop copies.  */

static void
stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
{
  unsigned int i;
  basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
	  stmts->safe_push (bsi.phi ());

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
	    stmts->safe_push (stmt);
	}
    }

  free (bbs);
}

/* Free the reduced dependence graph RDG.  */

static void
free_rdg (struct graph *rdg)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      for (e = v->succ; e; e = e->succ_next)
	free (e->data);

      if (v->data)
	{
	  gimple_set_uid (RDGV_STMT (v), -1);
	  (RDGV_DATAREFS (v)).release ();
	  free (v->data);
	}
    }

  free_graph (rdg);
}

/* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
   LOOP, and one edge per flow dependence or control dependence from control
   dependence CD.  While visiting each statement, data references are also
   collected and recorded in the global data DATAREFS_VEC.  */

static struct graph *
build_rdg (struct loop *loop, control_dependences *cd)
{
  struct graph *rdg;

  /* Create the RDG vertices from the stmts of the loop nest.  */
  auto_vec<gimple *, 10> stmts;
  stmts_from_loop (loop, &stmts);
  rdg = new_graph (stmts.length ());
  if (!create_rdg_vertices (rdg, stmts, loop))
    {
      free_rdg (rdg);
      return NULL;
    }
  stmts.release ();

  create_rdg_flow_edges (rdg);
  if (cd)
    create_rdg_cd_edges (rdg, cd, loop);

  return rdg;
}

/* Kind of distributed loop.  */
enum partition_kind {
    PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
};

/* Type of distributed loop.  */
enum partition_type {
    /* The distributed loop can be executed in parallel.  */
    PTYPE_PARALLEL = 0,
    /* The distributed loop has to be executed sequentially.  */
    PTYPE_SEQUENTIAL
};

/* Partition for loop distribution.  */
struct partition
{
  /* Statements of the partition.  */
  bitmap stmts;
  /* Loops of the partition.  */
  bitmap loops;
  /* True if the partition defines a variable which is used outside of
     the loop.  */
  bool reduction_p;
  /* For builtin partitions, true if the partition executes one iteration
     more than the number of loop (latch) iterations.  */
  bool plus_one;
  enum partition_kind kind;
  enum partition_type type;
  /* Data-references a kind != PKIND_NORMAL partition is about.  */
  data_reference_p main_dr;
  data_reference_p secondary_dr;
  /* Number of loop (latch) iterations.  */
  tree niter;
  /* Data references in the partition.  */
  bitmap datarefs;
};

/* Allocate and initialize a partition.  */

static partition *
partition_alloc (void)
{
  partition *partition = XCNEW (struct partition);
  partition->stmts = BITMAP_ALLOC (NULL);
  partition->loops = BITMAP_ALLOC (NULL);
  partition->reduction_p = false;
  partition->kind = PKIND_NORMAL;
  partition->datarefs = BITMAP_ALLOC (NULL);
  return partition;
}

/* Free PARTITION.  */

static void
partition_free (partition *partition)
{
  BITMAP_FREE (partition->stmts);
  BITMAP_FREE (partition->loops);
  BITMAP_FREE (partition->datarefs);
  free (partition);
}

/* Returns true if the partition can be generated as a builtin.  */

static bool
partition_builtin_p (partition *partition)
{
  return partition->kind != PKIND_NORMAL;
}

/* Returns true if the partition contains a reduction.  */

static bool
partition_reduction_p (partition *partition)
{
  return partition->reduction_p;
}

/* Partitions are fused for different reasons.  */
enum fuse_type
{
  FUSE_NON_BUILTIN = 0,
  FUSE_REDUCTION = 1,
  FUSE_SHARE_REF = 2,
  FUSE_SAME_SCC = 3,
  FUSE_FINALIZE = 4
};

/* Descriptions of the different fusing reasons.  */
static const char *fuse_message[] = {
  "they are non-builtins",
  "they have reductions",
  "they have shared memory refs",
  "they are in the same dependence scc",
  "there is no point to distribute loop"};

static void
update_type_for_merge (struct graph *, partition *, partition *);

/* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
   graph; if it is non-NULL we also update the type of the resulting
   partition.  */

static void
partition_merge_into (struct graph *rdg, partition *dest,
		      partition *partition, enum fuse_type ft)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
      fprintf (dump_file, "  Part 1: ");
      dump_bitmap (dump_file, dest->stmts);
      fprintf (dump_file, "  Part 2: ");
      dump_bitmap (dump_file, partition->stmts);
    }

  dest->kind = PKIND_NORMAL;
  if (dest->type == PTYPE_PARALLEL)
    dest->type = partition->type;

  bitmap_ior_into (dest->stmts, partition->stmts);
  if (partition_reduction_p (partition))
    dest->reduction_p = true;

  /* Further check if any data dependence prevents us from executing the
     new partition in parallel.  */
  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
    update_type_for_merge (rdg, dest, partition);

  bitmap_ior_into (dest->datarefs, partition->datarefs);
}

/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
   the LOOP.  */

static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (!is_gimple_debug (use_stmt)
	  && loop != loop_containing_stmt (use_stmt))
	return true;
    }

  return false;
}

/* Returns true when STMT defines a scalar variable used after the
   loop LOOP.  */

static bool
stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
{
  def_operand_p def_p;
  ssa_op_iter op_iter;

  if (gimple_code (stmt) == GIMPLE_PHI)
    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);

  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
      return true;

  return false;
}

/* Return a copy of LOOP placed before LOOP.  */

static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
  gcc_assert (res != NULL);
  free_original_copy_tables ();
  delete_update_ssa ();

  return res;
}

/* Creates an empty basic block after LOOP.  */

static void
create_bb_after_loop (struct loop *loop)
{
  edge exit = single_exit (loop);

  if (!exit)
    return;

  split_edge (exit);
}

/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  */

static void
generate_loops_for_partition (struct loop *loop, partition *partition,
			      bool copy_p)
{
  unsigned i;
  basic_block *bbs;

  if (copy_p)
    {
      int orig_loop_num = loop->orig_loop_num;
      loop = copy_loop_before (loop);
      gcc_assert (loop != NULL);
      loop->orig_loop_num = orig_loop_num;
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }
  else
    {
      /* Origin number is set to the new versioned loop's num.  */
      gcc_assert (loop->orig_loop_num != loop->num);
    }

  /* Remove stmts not in the PARTITION bitmap.  */
  bbs = get_loop_body_in_dom_order (loop);

  if (MAY_HAVE_DEBUG_STMTS)
    for (i = 0; i < loop->num_nodes; i++)
      {
	basic_block bb = bbs[i];

	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gphi *phi = bsi.phi ();
	    if (!virtual_operand_p (gimple_phi_result (phi))
		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	      reset_debug_uses (phi);
	  }

	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gimple *stmt = gsi_stmt (bsi);
	    if (gimple_code (stmt) != GIMPLE_LABEL
		&& !is_gimple_debug (stmt)
		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	      reset_debug_uses (stmt);
	  }
      }

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	{
	  gphi *phi = bsi.phi ();
	  if (!virtual_operand_p (gimple_phi_result (phi))
	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	    remove_phi_node (&bsi, true);
	  else
	    gsi_next (&bsi);
	}

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL
	      && !is_gimple_debug (stmt)
	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	    {
	      /* Choose an arbitrary path through the empty CFG part
		 that this unnecessary control stmt controls.  */
	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
		{
		  gimple_cond_make_false (cond_stmt);
		  update_stmt (stmt);
		}
	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
		{
		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
		  gimple_switch_set_index
		    (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
		  update_stmt (stmt);
		}
	      else
		{
		  unlink_stmt_vdef (stmt);
		  gsi_remove (&bsi, true);
		  release_defs (stmt);
		  continue;
		}
	    }
	  gsi_next (&bsi);
	}
    }

  free (bbs);
}

/* Build the size argument for a memory operation call.  */

static tree
build_size_arg_loc (location_t loc, data_reference_p dr, tree nb_iter,
		    bool plus_one)
{
  tree size = fold_convert_loc (loc, sizetype, nb_iter);
  if (plus_one)
    size = size_binop (PLUS_EXPR, size, size_one_node);
  size = fold_build2_loc (loc, MULT_EXPR, sizetype, size,
			  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
  size = fold_convert_loc (loc, size_type_node, size);
  return size;
}

/* Build an address argument for a memory operation call.  */

static tree
build_addr_arg_loc (location_t loc, data_reference_p dr, tree nb_bytes)
{
  tree addr_base;

  addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
  addr_base = fold_convert_loc (loc, sizetype, addr_base);

  /* Test for a negative stride, iterating over every element.  */
  if (tree_int_cst_sgn (DR_STEP (dr)) == -1)
    {
      addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
				  fold_convert_loc (loc, sizetype, nb_bytes));
      addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
				  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
    }

  return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base);
}

/* If VAL's memory representation contains the same value in all bytes,
   return that value, otherwise return -1.
   E.g. for 0x24242424 return 0x24, for IEEE double
   747708026454360457216.0 return 0x44, etc.  */

static int
const_with_all_bytes_same (tree val)
{
  unsigned char buf[64];
  int i, len;

  if (integer_zerop (val)
      || (TREE_CODE (val) == CONSTRUCTOR
	  && !TREE_CLOBBER_P (val)
	  && CONSTRUCTOR_NELTS (val) == 0))
    return 0;

  if (real_zerop (val))
    {
      /* Only return 0 for +0.0, not for -0.0, which doesn't have
	 an all bytes same memory representation.  Don't transform
	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
      switch (TREE_CODE (val))
	{
	case REAL_CST:
	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
	    return 0;
	  break;
	case COMPLEX_CST:
	  if (!const_with_all_bytes_same (TREE_REALPART (val))
	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
	    return 0;
	  break;
	case VECTOR_CST:
	  unsigned int j;
	  for (j = 0; j < VECTOR_CST_NELTS (val); ++j)
	    if (const_with_all_bytes_same (VECTOR_CST_ELT (val, j)))
	      break;
	  if (j == VECTOR_CST_NELTS (val))
	    return 0;
	  break;
	default:
	  break;
	}
    }

  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
    return -1;

  len = native_encode_expr (val, buf, sizeof (buf));
  if (len == 0)
    return -1;
  for (i = 1; i < len; i++)
    if (buf[i] != buf[0])
      return -1;
  return buf[0];
}
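
/* A minimal sketch (not part of the original sources) of the byte-splat
   test above, specialized to a 32-bit value for illustration:

     #include <stdint.h>

     // Return the repeated byte if all four bytes of V are equal, else -1.
     static int
     all_bytes_same_u32 (uint32_t v)
     {
       unsigned char b = v & 0xff;
       return (v == b * 0x01010101u) ? (int) b : -1;
     }

   E.g. all_bytes_same_u32 (0x24242424) == 0x24, which is the property that
   lets a store of such a constant be implemented by memset.  */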

/* Generate a call to memset for PARTITION in LOOP.  */

static void
generate_memset_builtin (struct loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *stmt, *fn_call;
  tree mem, fn, nb_bytes;
  location_t loc;
  tree val;

  stmt = DR_STMT (partition->main_dr);
  loc = gimple_location (stmt);

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
				 partition->plus_one);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  mem = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);

  /* This exactly matches the pattern recognition in classify_partition.  */
  val = gimple_assign_rhs1 (stmt);
  /* Handle constants like 0x15151515 and similarly
     floating point constants etc. where all bytes are the same.  */
  int bytev = const_with_all_bytes_same (val);
  if (bytev != -1)
    val = build_int_cst (integer_type_node, bytev);
  else if (TREE_CODE (val) == INTEGER_CST)
    val = fold_convert (integer_type_node, val);
  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
    {
      tree tem = make_ssa_name (integer_type_node);
      gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
      val = tem;
    }

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "generated memset");
      if (bytev == 0)
	fprintf (dump_file, " zero\n");
      else
	fprintf (dump_file, "\n");
    }
}

/* Generate a call to memcpy for PARTITION in LOOP.  */

static void
generate_memcpy_builtin (struct loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *stmt, *fn_call;
  tree dest, src, fn, nb_bytes;
  location_t loc;
  enum built_in_function kind;

  stmt = DR_STMT (partition->main_dr);
  loc = gimple_location (stmt);

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
				 partition->plus_one);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  dest = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
  src = build_addr_arg_loc (loc, partition->secondary_dr, nb_bytes);
  if (partition->kind == PKIND_MEMCPY
      || ! ptr_derefs_may_alias_p (dest, src))
    kind = BUILT_IN_MEMCPY;
  else
    kind = BUILT_IN_MEMMOVE;

  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
				   false, GSI_CONTINUE_LINKING);
  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);
  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (kind == BUILT_IN_MEMCPY)
	fprintf (dump_file, "generated memcpy\n");
      else
	fprintf (dump_file, "generated memmove\n");
    }
}

/* Remove and destroy the loop LOOP.  */

static void
destroy_loop (struct loop *loop)
{
  unsigned nbbs = loop->num_nodes;
  edge exit = single_exit (loop);
  basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
  basic_block *bbs;
  unsigned i;

  bbs = get_loop_body_in_dom_order (loop);

  redirect_edge_pred (exit, src);
  exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
  exit->flags |= EDGE_FALLTHRU;
  cancel_loop_tree (loop);
  rescan_loop_exit (exit, false, true);

  i = nbbs;
  do
    {
      /* We have made sure to not leave any dangling uses of SSA
	 names defined in the loop.  With the exception of virtuals.
	 Make sure we replace all uses of virtual defs that will remain
	 outside of the loop with the bare symbol as delete_basic_block
	 will release them.  */
      --i;
      for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    mark_virtual_phi_result_for_renaming (phi);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  tree vdef = gimple_vdef (stmt);
	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
	    mark_virtual_operand_for_renaming (vdef);
	}
      delete_basic_block (bbs[i]);
    }
  while (i != 0);

  free (bbs);

  set_immediate_dominator (CDI_DOMINATORS, dest,
			   recompute_dominator (CDI_DOMINATORS, dest));
}

/* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */

static bool
generate_code_for_partition (struct loop *loop,
			     partition *partition, bool copy_p)
{
  switch (partition->kind)
    {
    case PKIND_NORMAL:
      /* Reductions all have to be in the last partition.  */
      gcc_assert (!partition_reduction_p (partition)
		  || !copy_p);
      generate_loops_for_partition (loop, partition, copy_p);
      return false;

    case PKIND_MEMSET:
      generate_memset_builtin (loop, partition);
      break;

    case PKIND_MEMCPY:
    case PKIND_MEMMOVE:
      generate_memcpy_builtin (loop, partition);
      break;

    default:
      gcc_unreachable ();
    }

  /* Common tail for partitions we turn into a call.  If this was the last
     partition for which we generate code, we have to destroy the loop.  */
  if (!copy_p)
    return true;
  return false;
}

/* Return the data dependence relation for data references A and B.  The two
   data references must be in lexicographic order with respect to the reduced
   dependence graph RDG.  We first try to find the ddr in the global ddr hash
   table.  If it doesn't exist there, compute the ddr and cache it.  */

static data_dependence_relation *
get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
{
  struct data_dependence_relation ent, **slot;
  struct data_dependence_relation *ddr;

  gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
  gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
  ent.a = a;
  ent.b = b;
  slot = ddrs_table->find_slot (&ent, INSERT);
  if (*slot == NULL)
    {
      ddr = initialize_data_dependence_relation (a, b, loop_nest);
      compute_affine_dependence (ddr, loop_nest[0]);
      *slot = ddr;
    }

  return *slot;
}

/* In the reduced dependence graph RDG for loop distribution, return true if
   the dependence between references DR1 and DR2 leads to a dependence cycle
   and such a dependence cycle can't be resolved by a runtime alias check.  */

static bool
data_dep_in_cycle_p (struct graph *rdg,
		     data_reference_p dr1, data_reference_p dr2)
{
  struct data_dependence_relation *ddr;

  /* Re-shuffle data-refs to be in topological order.  */
  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
    std::swap (dr1, dr2);

  ddr = get_data_dependence (rdg, dr1, dr2);

  /* In case of no data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;
  /* For an unknown data dependence or a known data dependence which can't
     be expressed in a classic distance vector, check whether it can be
     resolved by a runtime alias check.  If so, we still consider the data
     dependence as one that won't introduce a data dependence cycle.  */
  else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
	   || DDR_NUM_DIST_VECTS (ddr) == 0)
    return !runtime_alias_check_p (ddr, NULL, true);
  else if (DDR_NUM_DIST_VECTS (ddr) > 1)
    return true;
  else if (DDR_REVERSED_P (ddr)
	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
    return false;

  return true;
}
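
/* For illustration (not from the original sources): a self-referencing
   loop such as

     for (i = 1; i < n; i++)
       a[i] = a[i - 1] + 1;	// distance vector (1): carried dependence

   produces a dependence cycle that no runtime alias check can resolve, so
   a partition containing it has to be classified PTYPE_SEQUENTIAL, whereas

     for (i = 0; i < n; i++)
       a[i] = b[i] + 1;		// A and B may only alias

   does not, provided the potential aliasing of A and B can be covered by
   a runtime alias check.  */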

/* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
   PARTITION1's type after merging PARTITION2 into PARTITION1.  */

static void
update_type_for_merge (struct graph *rdg,
		       partition *partition1, partition *partition2)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2;

  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
    {
      unsigned start = (partition1 == partition2) ? i + 1 : 0;

      dr1 = datarefs_vec[i];
      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
	{
	  dr2 = datarefs_vec[j];
	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
	    continue;

	  /* Partition can only be executed sequentially if there is any
	     data dependence cycle.  */
	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
	    {
	      partition1->type = PTYPE_SEQUENTIAL;
	      return;
	    }
	}
    }
}

/* Returns a partition with all the statements needed for computing
   the vertex V of the RDG, also including the loop exit conditions.  */

static partition *
build_rdg_partition_for_vertex (struct graph *rdg, int v)
{
  partition *partition = partition_alloc ();
  auto_vec<int, 3> nodes;
  unsigned i, j;
  int x;
  data_reference_p dr;

  graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

  FOR_EACH_VEC_ELT (nodes, i, x)
    {
      bitmap_set_bit (partition->stmts, x);
      bitmap_set_bit (partition->loops,
		      loop_containing_stmt (RDG_STMT (rdg, x))->num);

      for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
	{
	  unsigned idx = (unsigned) DR_INDEX (dr);
	  gcc_assert (idx < datarefs_vec.length ());

	  /* Partition can only be executed sequentially if there is any
	     data reference which can't be analyzed.  */
	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
	      || !DR_INIT (dr) || !DR_STEP (dr))
	    partition->type = PTYPE_SEQUENTIAL;

	  bitmap_set_bit (partition->datarefs, idx);
	}
    }

  if (partition->type == PTYPE_SEQUENTIAL)
    return partition;

  /* Further check if any data dependence prevents us from executing the
     partition in parallel.  */
  update_type_for_merge (rdg, partition, partition);

  return partition;
}

/* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
   For the moment we detect memset, memcpy and memmove patterns.  Bitmap
   STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.  */

static void
classify_partition (loop_p loop, struct graph *rdg, partition *partition,
		    bitmap stmt_in_all_partitions)
{
  bitmap_iterator bi;
  unsigned i;
  tree nb_iter;
  data_reference_p single_load, single_store;
  bool volatiles_p = false, plus_one = false, has_reduction = false;

  partition->kind = PKIND_NORMAL;
  partition->main_dr = NULL;
  partition->secondary_dr = NULL;
  partition->niter = NULL_TREE;
  partition->plus_one = false;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);

      if (gimple_has_volatile_ops (stmt))
	volatiles_p = true;

      /* If the stmt is not included by all partitions and there are uses
	 outside of the loop, then mark the partition as a reduction.  */
      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
	{
	  /* Due to a limitation in the transform phase we have to fuse all
	     reduction partitions.  As a result, this could cancel valid
	     loop distribution, especially for loops whose induction
	     variable is used outside of the loop.  To work around this
	     issue, we skip marking the partition as a reduction if the
	     reduction stmt belongs to all partitions.  In such a case, the
	     reduction will be computed correctly no matter how partitions
	     are fused/distributed.  */
	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
	    {
	      partition->reduction_p = true;
	      return;
	    }
	  has_reduction = true;
	}
    }

  /* Perform general partition disqualification for builtins.  */
  if (volatiles_p
      /* Simple workaround to prevent classifying the partition as builtin
	 if it contains any use outside of loop.  */
      || has_reduction
      || !flag_tree_loop_distribute_patterns)
    return;

  /* Detect memset and memcpy.  */
  single_load = NULL;
  single_store = NULL;
  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      data_reference_p dr;
      unsigned j;

      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      /* Any scalar stmts are ok.  */
      if (!gimple_vuse (stmt))
	continue;

      /* Otherwise just regular loads/stores.  */
      if (!gimple_assign_single_p (stmt))
	return;

      /* But exactly one store and/or load.  */
      for (j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
	{
	  tree type = TREE_TYPE (DR_REF (dr));

	  /* The memset, memcpy and memmove library calls are only
	     able to deal with generic address space.  */
	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
	    return;

	  if (DR_IS_READ (dr))
	    {
	      if (single_load != NULL)
		return;
	      single_load = dr;
	    }
	  else
	    {
	      if (single_store != NULL)
		return;
	      single_store = dr;
	    }
	}
    }

  if (!single_store)
    return;

  nb_iter = number_of_latch_executions (loop);
  gcc_assert (nb_iter && nb_iter != chrec_dont_know);
  if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
		      gimple_bb (DR_STMT (single_store))))
    plus_one = true;

  if (single_store && !single_load)
    {
      gimple *stmt = DR_STMT (single_store);
      tree rhs = gimple_assign_rhs1 (stmt);
      if (const_with_all_bytes_same (rhs) == -1
	  && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
	      || (TYPE_MODE (TREE_TYPE (rhs))
		  != TYPE_MODE (unsigned_char_type_node))))
	return;
      if (TREE_CODE (rhs) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
	  && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
	return;
      if (!adjacent_dr_p (single_store)
	  || !dominated_by_p (CDI_DOMINATORS,
			      loop->latch, gimple_bb (stmt)))
	return;
      partition->kind = PKIND_MEMSET;
      partition->main_dr = single_store;
      partition->niter = nb_iter;
      partition->plus_one = plus_one;
    }
  else if (single_store && single_load)
    {
      gimple *store = DR_STMT (single_store);
      gimple *load = DR_STMT (single_load);
      /* Direct aggregate copy or via an SSA name temporary.  */
      if (load != store
	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
	return;
      if (!adjacent_dr_p (single_store)
	  || !adjacent_dr_p (single_load)
	  || !operand_equal_p (DR_STEP (single_store),
			       DR_STEP (single_load), 0)
	  || !dominated_by_p (CDI_DOMINATORS,
			      loop->latch, gimple_bb (store)))
	return;
      /* Now check that if there is a dependence this dependence is
	 of a suitable form for memmove.  */
      ddr_p ddr = get_data_dependence (rdg, single_load, single_store);
      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	return;

      if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
	{
	  if (DDR_NUM_DIST_VECTS (ddr) == 0)
	    return;

	  lambda_vector dist_v;
	  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
	    {
	      int dist = dist_v[index_in_loop_nest (loop->num,
						    DDR_LOOP_NEST (ddr))];
	      if (dist > 0 && !DDR_REVERSED_P (ddr))
		return;
	    }
	  partition->kind = PKIND_MEMMOVE;
	}
      else
	partition->kind = PKIND_MEMCPY;
      partition->main_dr = single_store;
      partition->secondary_dr = single_load;
      partition->niter = nb_iter;
      partition->plus_one = plus_one;
    }
}
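
/* As an illustration (not taken from the original sources) of the
   classification above, a plain copy loop

     for (i = 0; i < n; i++)
       a[i] = b[i];

   becomes PKIND_MEMCPY when dependence analysis proves the load and the
   store never overlap (chrec_known).  If they may overlap but every
   dependence distance is compatible with a copy routine that tolerates
   overlap, the partition is classified PKIND_MEMMOVE instead; any other
   overlap disqualifies the builtin and the partition stays PKIND_NORMAL.  */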

/* Returns true when PARTITION1 and PARTITION2 access the same memory
   object in RDG.  */

static bool
share_memory_accesses (struct graph *rdg,
		       partition *partition1, partition *partition2)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2;

  /* First check whether the intersection of the two partitions contains
     any loads or stores.  Common loads are the situation that happens
     most often.  */
  EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
    if (RDG_MEM_WRITE_STMT (rdg, i)
	|| RDG_MEM_READS_STMT (rdg, i))
      return true;

  /* Then check whether the two partitions access the same memory object.  */
  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
    {
      dr1 = datarefs_vec[i];

      if (!DR_BASE_ADDRESS (dr1)
	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
	continue;

      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
	{
	  dr2 = datarefs_vec[j];

	  if (!DR_BASE_ADDRESS (dr2)
	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
	    continue;

	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
	    return true;
	}
    }

  return false;
}

/* For each seed statement in STARTING_STMTS, this function builds a
   partition for it by adding dependent statements according to RDG.
   All partitions are recorded in PARTITIONS.  */

static void
rdg_build_partitions (struct graph *rdg,
		      vec<gimple *> starting_stmts,
		      vec<partition *> *partitions)
{
  auto_bitmap processed;
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
    {
      int v = rdg_vertex_for_stmt (rdg, stmt);

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "ldist asked to generate code for vertex %d\n", v);

      /* If the vertex is already contained in another partition so
	 is the partition rooted at it.  */
      if (bitmap_bit_p (processed, v))
	continue;

      partition *partition = build_rdg_partition_for_vertex (rdg, v);
      bitmap_ior_into (processed, partition->stmts);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "ldist creates useful %s partition:\n",
		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
	}

      partitions->safe_push (partition);
    }

  /* All vertices should have been assigned to at least one partition now,
     other than vertices belonging to dead code.  */
}

/* Dump to FILE the PARTITIONS.  */

static void
dump_rdg_partitions (FILE *file, vec<partition *> partitions)
{
  int i;
  partition *partition;

  FOR_EACH_VEC_ELT (partitions, i, partition)
    debug_bitmap_file (file, partition->stmts);
}

/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (vec<partition *> );

DEBUG_FUNCTION void
debug_rdg_partitions (vec<partition *> partitions)
{
  dump_rdg_partitions (stderr, partitions);
}

/* Returns the number of read and write operations in the RDG.  */

static int
number_of_rw_in_rdg (struct graph *rdg)
{
  int i, res = 0;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
	++res;

      if (RDG_MEM_READS_STMT (rdg, i))
	++res;
    }

  return res;
}

/* Returns the number of read and write operations in a PARTITION of
   the RDG.  */

static int
number_of_rw_in_partition (struct graph *rdg, partition *partition)
{
  int res = 0;
  unsigned i;
  bitmap_iterator ii;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
	++res;

      if (RDG_MEM_READS_STMT (rdg, i))
	++res;
    }

  return res;
}

/* Returns true when one of the PARTITIONS contains all the read or
   write operations of RDG.  */

static bool
partition_contains_all_rw (struct graph *rdg,
			   vec<partition *> partitions)
{
  int i;
  partition *partition;
  int nrw = number_of_rw_in_rdg (rdg);

  FOR_EACH_VEC_ELT (partitions, i, partition)
    if (nrw == number_of_rw_in_partition (rdg, partition))
      return true;

  return false;
}

/* Compute the partition dependence created by the data references in DRS1
   and DRS2, modify and return DIR according to that.  If ALIAS_DDRS is
   not NULL, we record dependences introduced by possible aliasing between
   two data references in ALIAS_DDRS; otherwise, we simply ignore such
   dependences as if they didn't exist at all.  */

static int
pg_add_dependence_edges (struct graph *rdg, int dir,
			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2, saved_dr1;

  /* dependence direction - 0 is no dependence, -1 is back,
     1 is forth, 2 is both (we can stop then, merging will occur).  */
  EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
    {
      dr1 = datarefs_vec[i];

      EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
	{
	  int res, this_dir = 1;
	  ddr_p ddr;

	  dr2 = datarefs_vec[j];

	  /* Skip all <read, read> data dependence.  */
	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
	    continue;

	  saved_dr1 = dr1;
	  /* Re-shuffle data-refs to be in topological order.  */
	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
	    {
	      std::swap (dr1, dr2);
	      this_dir = -this_dir;
	    }
	  ddr = get_data_dependence (rdg, dr1, dr2);
	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	    {
	      this_dir = 0;
	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
					   DR_BASE_ADDRESS (dr2));
	      /* Be conservative.  If the data references are not well
		 analyzed, or the two data references have the same base
		 address and offset, add the dependence and consider them
		 aliases of each other.  In other words, the dependence
		 cannot be resolved by a runtime alias check.  */
	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
		  || !DR_INIT (dr1) || !DR_INIT (dr2)
		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
		  || res == 0)
		this_dir = 2;
	      /* Data dependence could be resolved by runtime alias check,
		 record it in ALIAS_DDRS.  */
	      else if (alias_ddrs != NULL)
		alias_ddrs->safe_push (ddr);
	      /* Or simply ignore it.  */
	    }
	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
	    {
	      if (DDR_REVERSED_P (ddr))
		this_dir = -this_dir;

	      /* Known dependences can still be unordered throughout the
		 iteration space, see gcc.dg/tree-ssa/ldist-16.c.  */
	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
		this_dir = 2;
	      /* If the overlap is exact preserve stmt order.  */
	      else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
		;
	      /* Else as the distance vector is lexicographic positive swap
		 the dependence direction.  */
	      else
		this_dir = -this_dir;
	    }
	  else
	    this_dir = 0;
	  if (this_dir == 2)
	    return 2;
	  else if (dir == 0)
	    dir = this_dir;
	  else if (this_dir != 0 && dir != this_dir)
	    return 2;
	  /* Shuffle "back" dr1.  */
	  dr1 = saved_dr1;
	}
    }
  return dir;
}
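
/* A small worked example (not from the original sources) of the direction
   encoding used above.  For two partitions

     P1:  a[i] = ...;		// writes A
     P2:  ...  = a[i] + b[i];	// reads A and B

   the compile-time known dependence from P1's store to P2's load yields
   dir == 1, i.e. P2 has to stay after P1.  If the two partitions instead
   depend on each other in both directions, the function returns 2 and the
   caller adds edges both ways, creating a cycle that later forces the
   partitions to be fused.  */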

/* Compare postorder number of the partition graph vertices V1 and V2.  */

static int
pgcmp (const void *v1_, const void *v2_)
{
  const vertex *v1 = (const vertex *)v1_;
  const vertex *v2 = (const vertex *)v2_;
  return v2->post - v1->post;
}

/* Data attached to vertices of partition dependence graph.  */
struct pg_vdata
{
  /* ID of the corresponding partition.  */
  int id;
  /* The partition.  */
  struct partition *partition;
};

/* Data attached to edges of partition dependence graph.  */
struct pg_edata
{
  /* If the dependence edge can be resolved by runtime alias check,
     this vector contains data dependence relations for runtime alias
     check.  On the other hand, if the dependence edge is introduced
     because of compilation time known data dependence, this vector
     contains nothing.  */
  vec<ddr_p> alias_ddrs;
};

/* Callback data for traversing edges in graph.  */
struct pg_edge_callback_data
{
  /* Bitmap of strongly connected components that should be merged.  */
  bitmap sccs_to_merge;
  /* Array containing component information for all vertices.  */
  int *vertices_component;
  /* Vector to record all data dependence relations which are needed
     to break strongly connected components by runtime alias checks.  */
  vec<ddr_p> *alias_ddrs;
};

/* Initialize vertices' data for partition dependence graph PG with
   PARTITIONS.  */

static void
init_partition_graph_vertices (struct graph *pg,
			       vec<struct partition *> *partitions)
{
  int i;
  partition *partition;
  struct pg_vdata *data;

  for (i = 0; partitions->iterate (i, &partition); ++i)
    {
      data = new pg_vdata;
      pg->vertices[i].data = data;
      data->id = i;
      data->partition = partition;
    }
}

/* Add edge <I, J> to partition dependence graph PG.  Attach vector of data
   dependence relations to the EDGE if DDRS isn't NULL.  */

static void
add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
{
  struct graph_edge *e = add_edge (pg, i, j);

  /* If the edge is attached with data dependence relations, it means this
     dependence edge can be resolved by runtime alias checks.  */
  if (ddrs != NULL)
    {
      struct pg_edata *data = new pg_edata;

      gcc_assert (ddrs->length () > 0);
      e->data = data;
      data->alias_ddrs = vNULL;
      data->alias_ddrs.safe_splice (*ddrs);
    }
}

/* Callback function for graph traversal algorithm.  It returns true
   if edge E should be skipped when traversing the graph.  */

static bool
pg_skip_alias_edge (struct graph_edge *e)
{
  struct pg_edata *data = (struct pg_edata *)e->data;
  return (data != NULL && data->alias_ddrs.length () > 0);
}

/* Callback function freeing data attached to edge E of graph.  */

static void
free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
{
  if (e->data != NULL)
    {
      struct pg_edata *data = (struct pg_edata *)e->data;
      data->alias_ddrs.release ();
      delete data;
    }
}

/* Free data attached to vertices of partition dependence graph PG.  */

static void
free_partition_graph_vdata (struct graph *pg)
{
  int i;
  struct pg_vdata *data;

  for (i = 0; i < pg->n_vertices; ++i)
    {
      data = (struct pg_vdata *)pg->vertices[i].data;
      delete data;
    }
}

/* Build and return partition dependence graph for PARTITIONS.  RDG is the
   reduced dependence graph for the loop to be distributed.  If
   IGNORE_ALIAS_P is true, data dependence caused by possible aliasing
   between references is ignored, as if it didn't exist at all; otherwise
   all dependences are considered.  */

static struct graph *
build_partition_graph (struct graph *rdg,
		       vec<struct partition *> *partitions,
		       bool ignore_alias_p)
{
  int i, j;
  struct partition *partition1, *partition2;
  graph *pg = new_graph (partitions->length ());
  auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;

  alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;

  init_partition_graph_vertices (pg, partitions);

  for (i = 0; partitions->iterate (i, &partition1); ++i)
    {
      for (j = i + 1; partitions->iterate (j, &partition2); ++j)
	{
	  /* dependence direction - 0 is no dependence, -1 is back,
	     1 is forth, 2 is both (we can stop then, merging will occur).  */
	  int dir = 0;

	  /* If the first partition has reduction, add back edge; if the
	     second partition has reduction, add forth edge.  This makes
	     sure that reduction partition will be sorted as the last one.  */
	  if (partition_reduction_p (partition1))
	    dir = -1;
	  else if (partition_reduction_p (partition2))
	    dir = 1;

	  /* Cleanup the temporary vector.  */
	  alias_ddrs.truncate (0);

	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
					 partition2->datarefs, alias_ddrs_p);

	  /* Add an edge to the partition graph if there exists a dependence.
	     There are two types of edges.  One type of edge is caused by a
	     compilation time known dependence; this type cannot be resolved
	     by runtime alias checks.  The other type can be resolved by
	     runtime alias checks.  */
	  if (dir == 1 || dir == 2
	      || alias_ddrs.length () > 0)
	    {
	      /* Attach data dependence relations to edge that can be resolved
		 by runtime alias check.  */
	      bool alias_edge_p = (dir != 1 && dir != 2);
	      add_partition_graph_edge (pg, i, j,
					(alias_edge_p) ? &alias_ddrs : NULL);
	    }
	  if (dir == -1 || dir == 2
	      || alias_ddrs.length () > 0)
	    {
	      /* Attach data dependence relations to edge that can be resolved
		 by runtime alias check.  */
	      bool alias_edge_p = (dir != -1 && dir != 2);
	      add_partition_graph_edge (pg, j, i,
					(alias_edge_p) ? &alias_ddrs : NULL);
	    }
	}
    }
  return pg;
}
1942 /* Sort partitions in PG by post order and store them in PARTITIONS. */
1944 static void
1945 sort_partitions_by_post_order (struct graph *pg,
1946 vec<struct partition *> *partitions)
1948 int i;
1949 struct pg_vdata *data;
1951 /* Now order the remaining nodes in postorder. */
1952 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
1953 partitions->truncate (0);
1954 for (i = 0; i < pg->n_vertices; ++i)
1956 data = (struct pg_vdata *)pg->vertices[i].data;
1957 if (data->partition)
1958 partitions->safe_push (data->partition);
1962 /* Given reduced dependence graph RDG, merge strongly connected components
1963    of PARTITIONS.  In this function, data dependence caused by possible
1964    alias between references is ignored, as if it doesn't exist at all.  */
1966 static void
1967 merge_dep_scc_partitions (struct graph *rdg,
1968 vec<struct partition *> *partitions)
1970 struct partition *partition1, *partition2;
1971 struct pg_vdata *data;
1972 graph *pg = build_partition_graph (rdg, partitions, true);
1973 int i, j, num_sccs = graphds_scc (pg, NULL);
1975 /* A strongly connected component means a dependence cycle, which we
1976    cannot distribute.  So fuse its partitions together.  */
1977 if ((unsigned) num_sccs < partitions->length ())
1979 for (i = 0; i < num_sccs; ++i)
1981 for (j = 0; partitions->iterate (j, &partition1); ++j)
1982 if (pg->vertices[j].component == i)
1983 break;
1984 for (j = j + 1; partitions->iterate (j, &partition2); ++j)
1985 if (pg->vertices[j].component == i)
1987 partition_merge_into (NULL, partition1,
1988 partition2, FUSE_SAME_SCC);
1989 partition1->type = PTYPE_SEQUENTIAL;
1990 (*partitions)[j] = NULL;
1991 partition_free (partition2);
1992 data = (struct pg_vdata *)pg->vertices[j].data;
1993 data->partition = NULL;
1998 sort_partitions_by_post_order (pg, partitions);
1999 gcc_assert (partitions->length () == (unsigned)num_sccs);
2000 free_partition_graph_vdata (pg);
2001 free_graph (pg);
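/* A small sketch of the case handled above, assuming both statements
   were seeded into separate partitions:

   |for (i = 1; i < n; i++)
   |  {
   |    a[i] = b[i-1] + 1;   <- partition 1
   |    b[i] = a[i-1] + 2;   <- partition 2
   |  }

   Partition 2 reads what partition 1 wrote and vice versa, so the two
   form a dependence cycle, i.e., one SCC, and are fused here into a
   single PTYPE_SEQUENTIAL partition.  */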
2004 /* Callback function for traversing edge E in graph G. DATA is private
2005 callback data. */
2007 static void
2008 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2010 int i, j, component;
2011 struct pg_edge_callback_data *cbdata;
2012 struct pg_edata *edata = (struct pg_edata *) e->data;
2014 /* If the edge doesn't have attached data dependence relations, it
2015    represents compilation time known dependences, which cannot be
2016    resolved by a runtime alias check.  */
2017 if (edata == NULL || edata->alias_ddrs.length () == 0)
2018 return;
2020 cbdata = (struct pg_edge_callback_data *) data;
2021 i = e->src;
2022 j = e->dest;
2023 component = cbdata->vertices_component[i];
2024 /* Vertices are topologically sorted according to compilation time
2025    known dependences, so we can break strongly connected components
2026    by removing edges of the opposite direction, i.e., edges pointing
2027    from a vertex with a smaller post number to a vertex with a bigger
2028    post number.  */
2029 if (g->vertices[i].post < g->vertices[j].post
2030 /* We only need to remove edges connecting vertices in the same
2031    strongly connected component to break it.  */
2032 && component == cbdata->vertices_component[j]
2033 /* Check if we want to break the strongly connected component or not.  */
2034 && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2035 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
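/* To illustrate the direction test above: suppose vertices u and v form
   an SCC through a "true" edge <u, v> and an "alias" edge <v, u>.  The
   alias-free SCC pass gives post (v) < post (u), since for dependence
   edges the source always finishes after the destination in the DFS.
   The "alias" edge <v, u> therefore points from the smaller post number
   to the bigger one, and its DDRs are the ones collected for a runtime
   alias check.  */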
2038 /* This is the main function breaking strongly connected components in
2039    PARTITIONS, given reduced dependence graph RDG.  Store data dependence
2040    relations for the runtime alias check in ALIAS_DDRS.  */
2042 static void
2043 break_alias_scc_partitions (struct graph *rdg,
2044 vec<struct partition *> *partitions,
2045 vec<ddr_p> *alias_ddrs)
2047 int i, j, num_sccs, num_sccs_no_alias;
2048 /* Build partition dependence graph. */
2049 graph *pg = build_partition_graph (rdg, partitions, false);
2051 alias_ddrs->truncate (0);
2052 /* Find strongly connected components in the graph, with all dependence
2053    edges considered.  */
2054 num_sccs = graphds_scc (pg, NULL);
2055 /* All SCCs now can be broken by runtime alias checks because SCCs caused
2056    by compilation time known dependences have been merged before this function.  */
2057 if ((unsigned) num_sccs < partitions->length ())
2059 struct pg_edge_callback_data cbdata;
2060 auto_bitmap sccs_to_merge;
2061 auto_vec<enum partition_type> scc_types;
2062 struct partition *partition, *first;
2064 /* If all partitions in a SCC have the same type, we can simply merge the
2065    SCC.  This loop finds such SCCs and records them in the bitmap.  */
2066 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2067 for (i = 0; i < num_sccs; ++i)
2069 for (j = 0; partitions->iterate (j, &first); ++j)
2070 if (pg->vertices[j].component == i)
2071 break;
2072 for (++j; partitions->iterate (j, &partition); ++j)
2074 if (pg->vertices[j].component != i)
2075 continue;
2077 if (first->type != partition->type)
2079 bitmap_clear_bit (sccs_to_merge, i);
2080 break;
2085 /* Initialize callback data for traversing. */
2086 cbdata.sccs_to_merge = sccs_to_merge;
2087 cbdata.alias_ddrs = alias_ddrs;
2088 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2089 /* Record the component information, which will be corrupted by the
2090    next SCC finding call on the graph.  */
2091 for (i = 0; i < pg->n_vertices; ++i)
2092 cbdata.vertices_component[i] = pg->vertices[i].component;
2094 /* Collect data dependences for runtime alias checks to break SCCs. */
2095 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2097 /* Run the SCC finding algorithm again, with alias dependence edges
2098    skipped.  This topologically sorts partitions according to
2099    compilation time known dependences.  Note the topological order
2100    is stored in the form of pg's post order number.  */
2101 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2102 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2103 /* With the topological order, we can construct two subgraphs L and R.
2104    L contains edge <x, y> where x < y in terms of post order, while
2105    R contains edge <x, y> where x > y.  Edges for compilation time
2106    known dependences all fall in R, so we break SCCs by removing all
2107    (alias) edges in subgraph L.  */
2108 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2111 /* For each SCC that doesn't need to be broken, merge it.  */
2112 for (i = 0; i < num_sccs; ++i)
2114 if (!bitmap_bit_p (sccs_to_merge, i))
2115 continue;
2117 for (j = 0; partitions->iterate (j, &first); ++j)
2118 if (cbdata.vertices_component[j] == i)
2119 break;
2120 for (++j; partitions->iterate (j, &partition); ++j)
2122 struct pg_vdata *data;
2124 if (cbdata.vertices_component[j] != i)
2125 continue;
2127 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2128 (*partitions)[j] = NULL;
2129 partition_free (partition);
2130 data = (struct pg_vdata *)pg->vertices[j].data;
2131 gcc_assert (data->id == j);
2132 data->partition = NULL;
2137 sort_partitions_by_post_order (pg, partitions);
2138 free_partition_graph_vdata (pg);
2139 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2140 free_graph (pg);
2142 if (dump_file && (dump_flags & TDF_DETAILS))
2144 fprintf (dump_file, "Possible alias data dependence to break:\n");
2145 dump_data_dependence_relations (dump_file, *alias_ddrs);
2149 /* Compute and return an expression whose value is the segment length which
2150 will be accessed by DR in NITERS iterations. */
2152 static tree
2153 data_ref_segment_size (struct data_reference *dr, tree niters)
2155 tree segment_length;
2157 if (integer_zerop (DR_STEP (dr)))
2158 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2159 else
2160 segment_length = size_binop (MULT_EXPR,
2161 fold_convert (sizetype, DR_STEP (dr)),
2162 fold_convert (sizetype, niters));
2164 return segment_length;
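/* A worked sketch, assuming a 4-byte element type: a reference A[i]
   advancing by DR_STEP == 4 each iteration covers a segment of
   4 * NITERS bytes, while a loop invariant reference with DR_STEP == 0
   covers only TYPE_SIZE_UNIT of the accessed type.  */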
2167 /* Return true if LOOP's latch is dominated by statement for data reference
2168 DR. */
2170 static inline bool
2171 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2173 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2174 gimple_bb (DR_STMT (dr)));
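/* Informally: when the statement of DR dominates the source block of the
   single exit edge, the reference is also executed in the final iteration
   that leaves the loop, i.e., one more time than the latch runs; callers
   below then use NITERS + 1 instead of NITERS for its segment.  */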
2177 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2178 data dependence relations ALIAS_DDRS. */
2180 static void
2181 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2182 vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2184 unsigned int i;
2185 unsigned HOST_WIDE_INT factor = 1;
2186 tree niters_plus_one, niters = number_of_latch_executions (loop);
2188 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2189 niters = fold_convert (sizetype, niters);
2190 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2192 if (dump_file && (dump_flags & TDF_DETAILS))
2193 fprintf (dump_file, "Creating alias check pairs:\n");
2195 /* Iterate all data dependence relations and compute alias check pairs. */
2196 for (i = 0; i < alias_ddrs->length (); i++)
2198 ddr_p ddr = (*alias_ddrs)[i];
2199 struct data_reference *dr_a = DDR_A (ddr);
2200 struct data_reference *dr_b = DDR_B (ddr);
2201 tree seg_length_a, seg_length_b;
2202 int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2203 DR_BASE_ADDRESS (dr_b));
2205 if (comp_res == 0)
2206 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2207 gcc_assert (comp_res != 0);
2209 if (latch_dominated_by_data_ref (loop, dr_a))
2210 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2211 else
2212 seg_length_a = data_ref_segment_size (dr_a, niters);
2214 if (latch_dominated_by_data_ref (loop, dr_b))
2215 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2216 else
2217 seg_length_b = data_ref_segment_size (dr_b, niters);
2219 dr_with_seg_len_pair_t dr_with_seg_len_pair
2220 (dr_with_seg_len (dr_a, seg_length_a),
2221 dr_with_seg_len (dr_b, seg_length_b));
2223 /* Canonicalize pairs by sorting the two DR members. */
2224 if (comp_res > 0)
2225 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2227 comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2230 if (tree_fits_uhwi_p (niters))
2231 factor = tree_to_uhwi (niters);
2233 /* Prune alias check pairs. */
2234 prune_runtime_alias_test_list (comp_alias_pairs, factor);
2235 if (dump_file && (dump_flags & TDF_DETAILS))
2236 fprintf (dump_file,
2237 "Improved number of alias checks from %d to %d\n",
2238 alias_ddrs->length (), comp_alias_pairs->length ());
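/* The pairs computed here feed a disjoint-segment test.  As a hedged
   sketch for byte-typed references A and B with segment lengths SEG_A
   and SEG_B, the eventual runtime condition is essentially:

   |A + SEG_A <= B || B + SEG_B <= A

   which create_runtime_alias_checks expands into the loop versioning
   condition.  */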
2241 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2242 checks and version LOOP under condition of these runtime alias checks. */
2244 static void
2245 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2247 profile_probability prob;
2248 basic_block cond_bb;
2249 struct loop *nloop;
2250 tree lhs, arg0, cond_expr = NULL_TREE;
2251 gimple_seq cond_stmts = NULL;
2252 gimple *call_stmt = NULL;
2253 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2255 /* Generate code for runtime alias checks if necessary. */
2256 gcc_assert (alias_ddrs->length () > 0);
2258 if (dump_file && (dump_flags & TDF_DETAILS))
2259 fprintf (dump_file,
2260 "Version loop <%d> with runtime alias check\n", loop->num);
2262 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2263 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2264 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2265 is_gimple_val, NULL_TREE);
2267 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2268 if (flag_tree_loop_vectorize)
2270 /* Generate internal function call for loop distribution alias check. */
2271 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2272 2, NULL_TREE, cond_expr);
2273 lhs = make_ssa_name (boolean_type_node);
2274 gimple_call_set_lhs (call_stmt, lhs);
2276 else
2277 lhs = cond_expr;
2279 prob = profile_probability::guessed_always ().apply_scale (9, 10);
2280 initialize_original_copy_tables ();
2281 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2282 prob, prob.invert (), true);
2283 free_original_copy_tables ();
2284 /* Record the original loop number in newly generated loops. In case of
2285 distribution, the original loop will be distributed and the new loop
2286 is kept. */
2287 loop->orig_loop_num = nloop->num;
2288 nloop->orig_loop_num = nloop->num;
2289 nloop->dont_vectorize = true;
2290 nloop->force_vectorize = false;
2292 if (call_stmt)
2294 /* Record the new loop's num in IFN_LOOP_DIST_ALIAS because the original
2295    loop could be destroyed.  */
2296 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2297 gimple_call_set_arg (call_stmt, 0, arg0);
2298 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2301 if (cond_stmts)
2303 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2304 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2306 update_ssa (TODO_update_ssa);
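/* Schematically, the result of versioning is (a hedged sketch):

   |if (IFN_LOOP_DIST_ALIAS (<new loop num>, <segments disjoint?>))
   |  <original loop, to be distributed>
   |else
   |  <copied loop, kept intact, never vectorized>

   The vectorizer later folds the internal function call so that only one
   of the two versions survives.  */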
2309 /* Return true if loop versioning is needed to distribute PARTITIONS.
2310    ALIAS_DDRS are data dependence relations for the runtime alias check.  */
2312 static inline bool
2313 version_for_distribution_p (vec<struct partition *> *partitions,
2314 vec<ddr_p> *alias_ddrs)
2316 /* No need to version loop if we have only one partition. */
2317 if (partitions->length () == 1)
2318 return false;
2320 /* Need to version loop if runtime alias check is necessary. */
2321 return (alias_ddrs->length () > 0);
2324 /* Fuse all partitions if necessary before finalizing distribution. */
2326 static void
2327 finalize_partitions (vec<struct partition *> *partitions,
2328 vec<ddr_p> *alias_ddrs)
2330 unsigned i;
2331 struct partition *a, *partition;
2333 if (partitions->length () == 1
2334 || alias_ddrs->length () > 0)
2335 return;
2337 a = (*partitions)[0];
2338 if (a->kind != PKIND_NORMAL)
2339 return;
2341 for (i = 1; partitions->iterate (i, &partition); ++i)
2343 /* Don't fuse if a partition has a different type or is a builtin.  */
2344 if (partition->type != a->type
2345 || partition->kind != PKIND_NORMAL)
2346 return;
2349 /* Fuse all partitions. */
2350 for (i = 1; partitions->iterate (i, &partition); ++i)
2352 partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2353 partition_free (partition);
2355 partitions->truncate (1);
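/* In other words (a sketch of the heuristic): if distribution produced
   only PKIND_NORMAL partitions of one type and no runtime alias check is
   required, splitting the loop is not considered profitable here, so all
   partitions are fused back into one and distribution is effectively
   abandoned.  */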
2358 /* Distributes the code from LOOP in such a way that producer statements
2359 are placed before consumer statements. Tries to separate only the
2360 statements from STMTS into separate loops. Returns the number of
2361 distributed loops. Set NB_CALLS to number of generated builtin calls.
2362 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2364 static int
2365 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2366 control_dependences *cd, int *nb_calls, bool *destroy_p)
2368 ddrs_table = new hash_table<ddr_hasher> (389);
2369 struct graph *rdg;
2370 partition *partition;
2371 bool any_builtin;
2372 int i, nbp;
2374 *destroy_p = false;
2375 *nb_calls = 0;
2376 loop_nest.create (0);
2377 if (!find_loop_nest (loop, &loop_nest))
2379 loop_nest.release ();
2380 delete ddrs_table;
2381 return 0;
2384 datarefs_vec.create (20);
2385 rdg = build_rdg (loop, cd);
2386 if (!rdg)
2388 if (dump_file && (dump_flags & TDF_DETAILS))
2389 fprintf (dump_file,
2390 "Loop %d not distributed: failed to build the RDG.\n",
2391 loop->num);
2393 loop_nest.release ();
2394 free_data_refs (datarefs_vec);
2395 delete ddrs_table;
2396 return 0;
2399 if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2401 if (dump_file && (dump_flags & TDF_DETAILS))
2402 fprintf (dump_file,
2403 "Loop %d not distributed: too many memory references.\n",
2404 loop->num);
2406 free_rdg (rdg);
2407 loop_nest.release ();
2408 free_data_refs (datarefs_vec);
2409 delete ddrs_table;
2410 return 0;
2413 data_reference_p dref;
2414 for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2415 dref->aux = (void *) (uintptr_t) i;
2417 if (dump_file && (dump_flags & TDF_DETAILS))
2418 dump_rdg (dump_file, rdg);
2420 auto_vec<struct partition *, 3> partitions;
2421 rdg_build_partitions (rdg, stmts, &partitions);
2423 auto_vec<ddr_p> alias_ddrs;
2425 auto_bitmap stmt_in_all_partitions;
2426 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2427 for (i = 1; partitions.iterate (i, &partition); ++i)
2428 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2430 any_builtin = false;
2431 FOR_EACH_VEC_ELT (partitions, i, partition)
2433 classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2434 any_builtin |= partition_builtin_p (partition);
2437 /* If we are only distributing patterns but did not detect any,
2438 simply bail out. */
2439 if (!flag_tree_loop_distribution
2440 && !any_builtin)
2442 nbp = 0;
2443 goto ldist_done;
2446 /* If we are only distributing patterns fuse all partitions that
2447 were not classified as builtins. This also avoids chopping
2448 a loop into pieces, separated by builtin calls. That is, we
2449 only want no or a single loop body remaining. */
2450 struct partition *into;
2451 if (!flag_tree_loop_distribution)
2453 for (i = 0; partitions.iterate (i, &into); ++i)
2454 if (!partition_builtin_p (into))
2455 break;
2456 for (++i; partitions.iterate (i, &partition); ++i)
2457 if (!partition_builtin_p (partition))
2459 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2460 partitions.unordered_remove (i);
2461 partition_free (partition);
2462 i--;
2466 /* Due to limitations in the transform phase we have to fuse all
2467 reduction partitions into the last partition so the existing
2468 loop will contain all loop-closed PHI nodes. */
2469 for (i = 0; partitions.iterate (i, &into); ++i)
2470 if (partition_reduction_p (into))
2471 break;
2472 for (i = i + 1; partitions.iterate (i, &partition); ++i)
2473 if (partition_reduction_p (partition))
2475 partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2476 partitions.unordered_remove (i);
2477 partition_free (partition);
2478 i--;
2481 /* Apply our simple cost model - fuse partitions with similar
2482 memory accesses. */
2483 for (i = 0; partitions.iterate (i, &into); ++i)
2485 bool changed = false;
2486 if (partition_builtin_p (into))
2487 continue;
2488 for (int j = i + 1;
2489 partitions.iterate (j, &partition); ++j)
2491 if (share_memory_accesses (rdg, into, partition))
2493 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2494 partitions.unordered_remove (j);
2495 partition_free (partition);
2496 j--;
2497 changed = true;
2500 /* Suppose in step 1 we fused 0 1 2 into 0,2 1 because 0 and 2 have
2501    similar accesses; if 1 and 2 also have similar accesses but 0 and 1
2502    do not, the next iteration would fail to consider merging 1 into
2503    0,2.  So try again if we did any merging into 0.  */
2504 if (changed)
2505 i--;
2508 /* Build the partition dependence graph.  */
2509 if (partitions.length () > 1)
2511 merge_dep_scc_partitions (rdg, &partitions);
2512 alias_ddrs.truncate (0);
2513 if (partitions.length () > 1)
2514 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2517 finalize_partitions (&partitions, &alias_ddrs);
2519 nbp = partitions.length ();
2520 if (nbp == 0
2521 || (nbp == 1 && !partition_builtin_p (partitions[0]))
2522 || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2524 nbp = 0;
2525 goto ldist_done;
2528 if (version_for_distribution_p (&partitions, &alias_ddrs))
2529 version_loop_by_alias_check (loop, &alias_ddrs);
2531 if (dump_file && (dump_flags & TDF_DETAILS))
2533 fprintf (dump_file,
2534 "distribute loop <%d> into partitions:\n", loop->num);
2535 dump_rdg_partitions (dump_file, partitions);
2538 FOR_EACH_VEC_ELT (partitions, i, partition)
2540 if (partition_builtin_p (partition))
2541 (*nb_calls)++;
2542 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2545 ldist_done:
2546 loop_nest.release ();
2547 free_data_refs (datarefs_vec);
2548 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2549 iter != ddrs_table->end (); ++iter)
2551 free_dependence_relation (*iter);
2552 *iter = NULL;
2554 delete ddrs_table;
2556 FOR_EACH_VEC_ELT (partitions, i, partition)
2557 partition_free (partition);
2559 free_rdg (rdg);
2560 return nbp - *nb_calls;
2563 /* Distribute all loops in the current function. */
2565 namespace {
2567 const pass_data pass_data_loop_distribution =
2569 GIMPLE_PASS, /* type */
2570 "ldist", /* name */
2571 OPTGROUP_LOOP, /* optinfo_flags */
2572 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2573 ( PROP_cfg | PROP_ssa ), /* properties_required */
2574 0, /* properties_provided */
2575 0, /* properties_destroyed */
2576 0, /* todo_flags_start */
2577 0, /* todo_flags_finish */
2580 class pass_loop_distribution : public gimple_opt_pass
2582 public:
2583 pass_loop_distribution (gcc::context *ctxt)
2584 : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2587 /* opt_pass methods: */
2588 virtual bool gate (function *)
2590 return flag_tree_loop_distribution
2591 || flag_tree_loop_distribute_patterns;
2594 virtual unsigned int execute (function *);
2596 }; // class pass_loop_distribution
2598 unsigned int
2599 pass_loop_distribution::execute (function *fun)
2601 struct loop *loop;
2602 bool changed = false;
2603 basic_block bb;
2604 control_dependences *cd = NULL;
2605 auto_vec<loop_p> loops_to_be_destroyed;
2607 if (number_of_loops (fun) <= 1)
2608 return 0;
2610 /* Compute topological order for basic blocks. Topological order is
2611 needed because data dependence is computed for data references in
2612 lexicographical order. */
2613 if (bb_top_order_index == NULL)
2615 int rpo_num;
2616 int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
2618 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
2619 bb_top_order_index_size = last_basic_block_for_fn (cfun);
2620 rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
2621 for (int i = 0; i < rpo_num; i++)
2622 bb_top_order_index[rpo[i]] = i;
2624 free (rpo);
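/* That is, rpo[k] holds the index of the basic block at position k in
   the reverse post order, and bb_top_order_index inverts the mapping:
   bb_top_order_index[bb->index] yields BB's topological position.  */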
2627 FOR_ALL_BB_FN (bb, fun)
2629 gimple_stmt_iterator gsi;
2630 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2631 gimple_set_uid (gsi_stmt (gsi), -1);
2632 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2633 gimple_set_uid (gsi_stmt (gsi), -1);
2636 /* We can at the moment only distribute non-nested loops, thus restrict
2637 walking to innermost loops. */
2638 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
2640 auto_vec<gimple *> work_list;
2641 basic_block *bbs;
2642 int num = loop->num;
2643 unsigned int i;
2645 /* If the loop doesn't have a single exit we will fail anyway,
2646    so check for that early.  */
2647 if (!single_exit (loop))
2648 continue;
2650 /* Only optimize hot loops. */
2651 if (!optimize_loop_for_speed_p (loop))
2652 continue;
2654 /* Don't distribute loop if niters is unknown. */
2655 tree niters = number_of_latch_executions (loop);
2656 if (niters == NULL_TREE || niters == chrec_dont_know)
2657 continue;
2659 /* Initialize the worklist with stmts we seed the partitions with. */
2660 bbs = get_loop_body_in_dom_order (loop);
2661 for (i = 0; i < loop->num_nodes; ++i)
2663 for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2664 !gsi_end_p (gsi);
2665 gsi_next (&gsi))
2667 gphi *phi = gsi.phi ();
2668 if (virtual_operand_p (gimple_phi_result (phi)))
2669 continue;
2670 /* Distribute stmts which have defs that are used outside of
2671 the loop. */
2672 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
2673 continue;
2674 work_list.safe_push (phi);
2676 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2677 !gsi_end_p (gsi);
2678 gsi_next (&gsi))
2680 gimple *stmt = gsi_stmt (gsi);
2682 /* If there is a stmt with side-effects bail out - we
2683 cannot and should not distribute this loop. */
2684 if (gimple_has_side_effects (stmt))
2686 work_list.truncate (0);
2687 goto out;
2690 /* Distribute stmts which have defs that are used outside of
2691 the loop. */
2692 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
2694 /* Otherwise only distribute stores for now. */
2695 else if (!gimple_vdef (stmt))
2696 continue;
2698 work_list.safe_push (stmt);
2701 out:
2702 free (bbs);
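/* As a sketch of the seeding above, in

   |for (i = 0; i < n; i++)
   |  {
   |    a[i] = b[i] + c;   <- store, pushed to WORK_LIST
   |    sum = sum + b[i];  <- def used after the loop, also pushed
   |  }

   both statements seed partitions: the store through its vdef, the sum
   update through its value used after the loop.  Statements whose defs
   are used only inside the loop are not pushed.  */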
2704 int nb_generated_loops = 0;
2705 int nb_generated_calls = 0;
2706 location_t loc = find_loop_location (loop);
2707 if (work_list.length () > 0)
2709 if (!cd)
2711 calculate_dominance_info (CDI_DOMINATORS);
2712 calculate_dominance_info (CDI_POST_DOMINATORS);
2713 cd = new control_dependences ();
2714 free_dominance_info (CDI_POST_DOMINATORS);
2716 bool destroy_p;
2717 nb_generated_loops = distribute_loop (loop, work_list, cd,
2718 &nb_generated_calls,
2719 &destroy_p);
2720 if (destroy_p)
2721 loops_to_be_destroyed.safe_push (loop);
2724 if (nb_generated_loops + nb_generated_calls > 0)
2726 changed = true;
2727 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
2728 loc, "Loop %d distributed: split to %d loops "
2729 "and %d library calls.\n",
2730 num, nb_generated_loops, nb_generated_calls);
2732 else if (dump_file && (dump_flags & TDF_DETAILS))
2733 fprintf (dump_file, "Loop %d is the same.\n", num);
2736 if (cd)
2737 delete cd;
2739 if (bb_top_order_index != NULL)
2741 free (bb_top_order_index);
2742 bb_top_order_index = NULL;
2743 bb_top_order_index_size = 0;
2746 if (changed)
2748 /* Destroy loop bodies that could not be reused.  Do this late as we
2749    otherwise can end up referring to stale data in control dependences.  */
2750 unsigned i;
2751 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
2752 destroy_loop (loop);
2754 /* Cached scalar evolutions may now refer to wrong or non-existing
2755    loops.  */
2756 scev_reset_htab ();
2757 mark_virtual_operands_for_renaming (fun);
2758 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
2761 checking_verify_loop_structure ();
2763 return 0;
2766 } // anon namespace
2768 gimple_opt_pass *
2769 make_pass_loop_distribution (gcc::context *ctxt)
2771 return new pass_loop_distribution (ctxt);