/* Loop distribution.
   Copyright (C) 2006-2017 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |    A(I) = B(I) + C
   |    D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |   A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |   D(I) = A(I-1)*E
   |ENDDO

   Loop distribution is the dual of loop fusion.  It separates statements
   of a loop (or loop nest) into multiple loops (or loop nests) with the
   same loop header.  The major goal is to separate statements which may
   be vectorized from those that can't.  This pass implements distribution
   in the following steps:

     1) Seed partitions with specific types of statements.  For now we
	support two types of seed statements: statements defining a
	variable used outside of the loop, and statements storing to
	memory.
     2) Build a reduced dependence graph (RDG) for the loop to be
	distributed.  The vertices (RDG:V) model all statements in the
	loop and the edges (RDG:E) model flow and control dependencies
	between statements.
     3) Apart from the RDG, compute data dependencies between memory
	references.
     4) Starting from a seed statement, build up a partition by adding
	dependent statements according to the RDG's dependence
	information.  A partition is classified as parallel if it can be
	executed in parallel, or as sequential if it can't.  A parallel
	partition is further classified as a builtin kind if it can be
	implemented as a builtin function call (see the example sketched
	after this comment).
     5) Build a partition dependence graph (PG) based on data
	dependencies.  The vertices (PG:V) model all partitions and the
	edges (PG:E) model all data dependencies between each pair of
	partitions.  In general, a data dependence is either known at
	compilation time or unknown.  In the C family of languages there
	are quite a few compile-time unknown dependencies because of
	possible aliasing between data references.  We categorize PG
	edges into two types: "true" edges, which represent compile-time
	known data dependencies, and "alias" edges for all other data
	dependencies.
     6) Traverse the subgraph of PG as if all "alias" edges didn't
	exist.  Merge the partitions in each strongly connected component
	(SCC) correspondingly.  Build a new PG for the merged partitions.
     7) Traverse PG again, this time with both "true" and "alias" edges
	included.  We try to break SCCs by removing some edges.  Because
	SCCs formed by "true" edges were all fused in step 6), we can
	break the remaining SCCs by removing some "alias" edges.  It's
	NP-hard to choose an optimal edge set; fortunately a simple
	approximation is good enough for us given the small problem
	scale.
     8) Collect all data dependencies of the removed "alias" edges.
	Create runtime alias checks for the collected data dependencies.
     9) Version the loop under the condition of the runtime alias
	checks.  Given that loop distribution generally introduces
	additional overhead, it is only useful if vectorization is
	achieved in a distributed loop.  We version the loop with the
	internal function call IFN_LOOP_DIST_ALIAS.  If no distributed
	loop can be vectorized, we simply remove the distributed loops
	and recover the original one.

   TODO:
     1) We only distribute innermost loops now.  This pass should handle
	loop nests in the future.
     2) We only fuse partitions in an SCC now.  A better fusion algorithm
	is desired to minimize loop overhead, maximize parallelism and
	maximize data reuse.  */
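
/* An illustrative sketch, not part of the pass itself: given a C loop
   like the one below (the names a, b, sum and n are hypothetical),
   step 4) would classify the store partition as PKIND_MEMSET, while the
   accumulation would stay in a separate, potentially vectorizable loop:

     for (int i = 0; i < n; ++i)
       {
	 a[i] = 0;	// becomes memset (a, 0, n)
	 sum += b[i];	// remains a distributed loop
       }

   assuming a is a char array so that the byte count equals n.  */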
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "gimple-pretty-print.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "params.h"
#include "tree-vectorizer.h"

#define MAX_DATAREFS_NUM \
  ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))

/* Hashtable helpers.  */

struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
{
  static inline hashval_t hash (const data_dependence_relation *);
  static inline bool equal (const data_dependence_relation *,
			    const data_dependence_relation *);
};

/* Hash function for data dependence.  */

inline hashval_t
ddr_hasher::hash (const data_dependence_relation *ddr)
{
  inchash::hash h;
  h.add_ptr (DDR_A (ddr));
  h.add_ptr (DDR_B (ddr));
  return h.end ();
}

/* Hash table equality function for data dependence.  */

inline bool
ddr_hasher::equal (const data_dependence_relation *ddr1,
		   const data_dependence_relation *ddr2)
{
  return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
}
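
/* Illustrative note: ddrs_table below keys each cached dependence
   relation by the pointer pair (DDR_A, DDR_B), so a lookup with the same
   two data references in the same order hits the cached entry; see
   get_data_dependence further down.  */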

/* The loop (nest) to be distributed.  */
static vec<loop_p> loop_nest;

/* Vector of data references in the loop to be distributed.  */
static vec<data_reference_p> datarefs_vec;

/* Store index of data reference in aux field.  */
#define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)

/* Hash table for data dependence relation in the loop to be distributed.  */
static hash_table<ddr_hasher> ddrs_table (389);

/* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
struct rdg_vertex
{
  /* The statement represented by this vertex.  */
  gimple *stmt;

  /* Vector of data-references in this statement.  */
  vec<data_reference_p> datarefs;

  /* True when the statement contains a write to memory.  */
  bool has_mem_write;

  /* True when the statement contains a read from memory.  */
  bool has_mem_reads;
};

#define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
#define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
#define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))

/* Data dependence type.  */

enum rdg_dep_type
{
  /* Read After Write (RAW).  */
  flow_dd = 'f',

  /* Control dependence (execute conditional on).  */
  control_dd = 'c'
};

/* Dependence information attached to an edge of the RDG.  */

struct rdg_edge
{
  /* Type of the dependence.  */
  enum rdg_dep_type type;
};

#define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type

/* Dump vertex I in RDG to FILE.  */

static void
dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
{
  struct vertex *v = &(rdg->vertices[i]);
  struct graph_edge *e;

  fprintf (file, "(vertex %d: (%s%s) (in:", i,
	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      fprintf (file, " %d", e->src);

  fprintf (file, ") (out:");

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      fprintf (file, " %d", e->dest);

  fprintf (file, ")\n");
  print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
  fprintf (file, ")\n");
}

/* Call dump_rdg_vertex on stderr.  */

DEBUG_FUNCTION void
debug_rdg_vertex (struct graph *rdg, int i)
{
  dump_rdg_vertex (stderr, rdg, i);
}

/* Dump the reduced dependence graph RDG to FILE.  */

static void
dump_rdg (FILE *file, struct graph *rdg)
{
  fprintf (file, "(rdg\n");
  for (int i = 0; i < rdg->n_vertices; i++)
    dump_rdg_vertex (file, rdg, i);
  fprintf (file, ")\n");
}

/* Call dump_rdg on stderr.  */

DEBUG_FUNCTION void
debug_rdg (struct graph *rdg)
{
  dump_rdg (stderr, rdg);
}

static void
dot_rdg_1 (FILE *file, struct graph *rdg)
{
  int i;
  pretty_printer buffer;
  pp_needs_newline (&buffer) = false;
  buffer.buffer->stream = file;

  fprintf (file, "digraph RDG {\n");

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      fprintf (file, "%d [label=\"[%d] ", i, i);
      pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
      pp_flush (&buffer);
      fprintf (file, "\"]\n");

      /* Highlight reads from memory.  */
      if (RDG_MEM_READS_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=green]\n", i);

      /* Highlight stores to memory.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=red]\n", i);

      if (v->succ)
	for (e = v->succ; e; e = e->succ_next)
	  switch (RDGE_TYPE (e))
	    {
	    case flow_dd:
	      /* These are the most common dependences: don't print these.  */
	      fprintf (file, "%d -> %d \n", i, e->dest);
	      break;

	    case control_dd:
	      fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
	      break;

	    default:
	      gcc_unreachable ();
	    }
    }

  fprintf (file, "}\n\n");
}

/* Display the Reduced Dependence Graph using dotty.  */

DEBUG_FUNCTION void
dot_rdg (struct graph *rdg)
{
  /* When debugging, you may want to enable the following code.  */
#ifdef HAVE_POPEN
  FILE *file = popen ("dot -Tx11", "w");
  if (!file)
    return;
  dot_rdg_1 (file, rdg);
  fflush (file);
  close (fileno (file));
  pclose (file);
#else
  dot_rdg_1 (stderr, rdg);
#endif
}

/* Returns the index of STMT in RDG.  */

static int
rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
{
  int index = gimple_uid (stmt);
  gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
  return index;
}

/* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
   the index of DEF in RDG.  */

static void
create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
{
  use_operand_p imm_use_p;
  imm_use_iterator iterator;

  FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
    {
      struct graph_edge *e;
      int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));

      if (use < 0)
	continue;

      e = add_edge (rdg, idef, use);
      e->data = XNEW (struct rdg_edge);
      RDGE_TYPE (e) = flow_dd;
    }
}

/* Creates an edge for the control dependences of BB to the vertex V.  */

static void
create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
				    int v, control_dependences *cd)
{
  bitmap_iterator bi;
  unsigned edge_n;
  EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
			    0, edge_n, bi)
    {
      basic_block cond_bb = cd->get_edge_src (edge_n);
      gimple *stmt = last_stmt (cond_bb);
      if (stmt && is_ctrl_stmt (stmt))
	{
	  struct graph_edge *e;
	  int c = rdg_vertex_for_stmt (rdg, stmt);
	  if (c < 0)
	    continue;

	  e = add_edge (rdg, c, v);
	  e->data = XNEW (struct rdg_edge);
	  RDGE_TYPE (e) = control_dd;
	}
    }
}

/* Creates the flow dependence edges of the reduced dependence graph RDG.  */

static void
create_rdg_flow_edges (struct graph *rdg)
{
  int i;
  def_operand_p def_p;
  ssa_op_iter iter;

  for (i = 0; i < rdg->n_vertices; i++)
    FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
			      iter, SSA_OP_DEF)
      create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
}

/* Creates the control dependence edges of the reduced dependence graph
   RDG, using the control dependences CD.  */

static void
create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  edge_iterator ei;
	  edge e;
	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
	    if (flow_bb_inside_loop_p (loop, e->src))
	      create_edge_for_control_dependence (rdg, e->src, i, cd);
	}
      else
	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
    }
}

/* Build the vertices of the reduced dependence graph RDG.  Return false
   if that failed.  */

static bool
create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
{
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      struct vertex *v = &(rdg->vertices[i]);

      /* Record statement to vertex mapping.  */
      gimple_set_uid (stmt, i);

      v->data = XNEW (struct rdg_vertex);
      RDGV_STMT (v) = stmt;
      RDGV_DATAREFS (v).create (0);
      RDGV_HAS_MEM_WRITE (v) = false;
      RDGV_HAS_MEM_READS (v) = false;
      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      unsigned drp = datarefs_vec.length ();
      if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
	return false;
      for (unsigned j = drp; j < datarefs_vec.length (); ++j)
	{
	  data_reference_p dr = datarefs_vec[j];
	  if (DR_IS_READ (dr))
	    RDGV_HAS_MEM_READS (v) = true;
	  else
	    RDGV_HAS_MEM_WRITE (v) = true;
	  RDGV_DATAREFS (v).safe_push (dr);
	}
    }
  return true;
}

/* Array mapping basic block's index to its topological order.  */
static int *bb_top_order_index;
/* And size of the array.  */
static int bb_top_order_index_size;

/* If X has a smaller topological sort number than Y, returns -1;
   if greater, returns 1.  */

static int
bb_top_order_cmp (const void *x, const void *y)
{
  basic_block bb1 = *(const basic_block *) x;
  basic_block bb2 = *(const basic_block *) y;

  gcc_assert (bb1->index < bb_top_order_index_size
	      && bb2->index < bb_top_order_index_size);
  gcc_assert (bb1 == bb2
	      || bb_top_order_index[bb1->index]
		 != bb_top_order_index[bb2->index]);

  return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
}
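
/* Illustrative note: bb_top_order_cmp is the qsort-style comparator
   passed to get_loop_body_in_custom_order in stmts_from_loop below, so
   the loop body is visited in topological order.  */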

/* Initialize STMTS with all the statements of LOOP.  We use topological
   order to discover all statements.  The order is important because
   generate_loops_for_partition is using the same traversal for identifying
   statements in loop copies.  */

static void
stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
{
  unsigned int i;
  basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
	  stmts->safe_push (bsi.phi ());

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
	    stmts->safe_push (stmt);
	}
    }

  free (bbs);
}

/* Free the reduced dependence graph RDG.  */

static void
free_rdg (struct graph *rdg)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      for (e = v->succ; e; e = e->succ_next)
	free (e->data);

      if (v->data)
	{
	  gimple_set_uid (RDGV_STMT (v), -1);
	  (RDGV_DATAREFS (v)).release ();
	  free (v->data);
	}
    }

  free_graph (rdg);
}

/* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
   LOOP, and one edge per flow dependence or control dependence from control
   dependence CD.  While visiting each statement, data references are also
   collected and recorded in the global data DATAREFS_VEC.  */

static struct graph *
build_rdg (struct loop *loop, control_dependences *cd)
{
  struct graph *rdg;

  /* Create the RDG vertices from the stmts of the loop nest.  */
  auto_vec<gimple *, 10> stmts;
  stmts_from_loop (loop, &stmts);
  rdg = new_graph (stmts.length ());
  if (!create_rdg_vertices (rdg, stmts, loop))
    {
      free_rdg (rdg);
      return NULL;
    }
  stmts.release ();

  create_rdg_flow_edges (rdg);
  if (cd)
    create_rdg_cd_edges (rdg, cd, loop);

  return rdg;
}


/* Kind of distributed loop.  */
enum partition_kind {
    PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
};

/* Type of distributed loop.  */
enum partition_type {
    /* The distributed loop can be executed in parallel.  */
    PTYPE_PARALLEL = 0,
    /* The distributed loop has to be executed sequentially.  */
    PTYPE_SEQUENTIAL
};

/* Partition for loop distribution.  */
struct partition
{
  /* Statements of the partition.  */
  bitmap stmts;
  /* Loops of the partition.  */
  bitmap loops;
  /* True if the partition defines a variable which is used outside of
     the loop.  */
  bool reduction_p;
  /* For a builtin partition, true if it executes one iteration more than
     the number of loop (latch) iterations.  */
  bool plus_one;
  enum partition_kind kind;
  enum partition_type type;
  /* Data-references a kind != PKIND_NORMAL partition is about.  */
  data_reference_p main_dr;
  data_reference_p secondary_dr;
  /* Number of loop (latch) iterations.  */
  tree niter;
  /* Data references in the partition.  */
  bitmap datarefs;
};


/* Allocate and initialize a partition.  */

static partition *
partition_alloc (void)
{
  partition *partition = XCNEW (struct partition);
  partition->stmts = BITMAP_ALLOC (NULL);
  partition->loops = BITMAP_ALLOC (NULL);
  partition->reduction_p = false;
  partition->kind = PKIND_NORMAL;
  partition->datarefs = BITMAP_ALLOC (NULL);
  return partition;
}

/* Free PARTITION.  */

static void
partition_free (partition *partition)
{
  BITMAP_FREE (partition->stmts);
  BITMAP_FREE (partition->loops);
  BITMAP_FREE (partition->datarefs);
  free (partition);
}

/* Returns true if the partition can be generated as a builtin.  */

static bool
partition_builtin_p (partition *partition)
{
  return partition->kind != PKIND_NORMAL;
}

/* Returns true if the partition contains a reduction.  */

static bool
partition_reduction_p (partition *partition)
{
  return partition->reduction_p;
}

/* Partitions are fused for different reasons.  */
enum fuse_type
{
  FUSE_NON_BUILTIN = 0,
  FUSE_REDUCTION = 1,
  FUSE_SHARE_REF = 2,
  FUSE_SAME_SCC = 3,
  FUSE_FINALIZE = 4
};

/* Descriptions of the different fusing reasons.  */
static const char *fuse_message[] = {
  "they are non-builtins",
  "they have reductions",
  "they have shared memory refs",
  "they are in the same dependence scc",
  "there is no point to distribute loop"};

static void
update_type_for_merge (struct graph *, partition *, partition *);

/* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
   graph; if it is non-NULL, we update the type of the result partition.  */

static void
partition_merge_into (struct graph *rdg, partition *dest,
		      partition *partition, enum fuse_type ft)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
      fprintf (dump_file, "  Part 1: ");
      dump_bitmap (dump_file, dest->stmts);
      fprintf (dump_file, "  Part 2: ");
      dump_bitmap (dump_file, partition->stmts);
    }

  dest->kind = PKIND_NORMAL;
  if (dest->type == PTYPE_PARALLEL)
    dest->type = partition->type;

  bitmap_ior_into (dest->stmts, partition->stmts);
  if (partition_reduction_p (partition))
    dest->reduction_p = true;

  /* Further check if any data dependence prevents us from executing the
     new partition in parallel.  */
  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
    update_type_for_merge (rdg, dest, partition);

  bitmap_ior_into (dest->datarefs, partition->datarefs);
}


/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
   the LOOP.  */

static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (!is_gimple_debug (use_stmt)
	  && loop != loop_containing_stmt (use_stmt))
	return true;
    }

  return false;
}

/* Returns true when STMT defines a scalar variable used after the
   loop LOOP.  */

static bool
stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
{
  def_operand_p def_p;
  ssa_op_iter op_iter;

  if (gimple_code (stmt) == GIMPLE_PHI)
    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);

  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
      return true;

  return false;
}

/* Return a copy of LOOP placed before LOOP.  */

static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
  gcc_assert (res != NULL);
  free_original_copy_tables ();
  delete_update_ssa ();

  return res;
}

/* Creates an empty basic block after LOOP.  */

static void
create_bb_after_loop (struct loop *loop)
{
  edge exit = single_exit (loop);

  if (!exit)
    return;

  split_edge (exit);
}

/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  */

static void
generate_loops_for_partition (struct loop *loop, partition *partition,
			      bool copy_p)
{
  unsigned i;
  basic_block *bbs;

  if (copy_p)
    {
      int orig_loop_num = loop->orig_loop_num;
      loop = copy_loop_before (loop);
      gcc_assert (loop != NULL);
      loop->orig_loop_num = orig_loop_num;
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }
  else
    {
      /* Origin number is set to the new versioned loop's num.  */
      gcc_assert (loop->orig_loop_num != loop->num);
    }

  /* Remove stmts not in the PARTITION bitmap.  */
  bbs = get_loop_body_in_dom_order (loop);

  if (MAY_HAVE_DEBUG_STMTS)
    for (i = 0; i < loop->num_nodes; i++)
      {
	basic_block bb = bbs[i];

	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gphi *phi = bsi.phi ();
	    if (!virtual_operand_p (gimple_phi_result (phi))
		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	      reset_debug_uses (phi);
	  }

	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gimple *stmt = gsi_stmt (bsi);
	    if (gimple_code (stmt) != GIMPLE_LABEL
		&& !is_gimple_debug (stmt)
		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	      reset_debug_uses (stmt);
	  }
      }

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	{
	  gphi *phi = bsi.phi ();
	  if (!virtual_operand_p (gimple_phi_result (phi))
	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	    remove_phi_node (&bsi, true);
	  else
	    gsi_next (&bsi);
	}

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL
	      && !is_gimple_debug (stmt)
	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	    {
	      /* Choose an arbitrary path through the empty CFG part
		 that this unnecessary control stmt controls.  */
	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
		{
		  gimple_cond_make_false (cond_stmt);
		  update_stmt (stmt);
		}
	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
		{
		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
		  gimple_switch_set_index
		      (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
		  update_stmt (stmt);
		}
	      else
		{
		  unlink_stmt_vdef (stmt);
		  gsi_remove (&bsi, true);
		  release_defs (stmt);
		  continue;
		}
	    }
	  gsi_next (&bsi);
	}
    }

  free (bbs);
}

/* Build the size argument for a memory operation call.  */

static tree
build_size_arg_loc (location_t loc, data_reference_p dr, tree nb_iter,
		    bool plus_one)
{
  tree size = fold_convert_loc (loc, sizetype, nb_iter);
  if (plus_one)
    size = size_binop (PLUS_EXPR, size, size_one_node);
  size = fold_build2_loc (loc, MULT_EXPR, sizetype, size,
			  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
  size = fold_convert_loc (loc, size_type_node, size);
  return size;
}
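
/* Illustrative example: for a store to an "int" array, NB_ITER latch
   iterations and PLUS_ONE set, the size built above is
   (NB_ITER + 1) * 4 bytes, assuming a 4-byte "int".  */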

/* Build an address argument for a memory operation call.  */

static tree
build_addr_arg_loc (location_t loc, data_reference_p dr, tree nb_bytes)
{
  tree addr_base;

  addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
  addr_base = fold_convert_loc (loc, sizetype, addr_base);

  /* Test for a negative stride, iterating over every element.  */
  if (tree_int_cst_sgn (DR_STEP (dr)) == -1)
    {
      addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
				  fold_convert_loc (loc, sizetype, nb_bytes));
      addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
				  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
    }

  return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base);
}

/* If VAL's memory representation contains the same value in all bytes,
   return that value, otherwise return -1.
   E.g. for 0x24242424 return 0x24, for IEEE double
   747708026454360457216.0 return 0x44, etc.  */

static int
const_with_all_bytes_same (tree val)
{
  unsigned char buf[64];
  int i, len;

  if (integer_zerop (val)
      || (TREE_CODE (val) == CONSTRUCTOR
	  && !TREE_CLOBBER_P (val)
	  && CONSTRUCTOR_NELTS (val) == 0))
    return 0;

  if (real_zerop (val))
    {
      /* Only return 0 for +0.0, not for -0.0, which doesn't have
	 an all bytes same memory representation.  Don't transform
	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
      switch (TREE_CODE (val))
	{
	case REAL_CST:
	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
	    return 0;
	  break;
	case COMPLEX_CST:
	  if (!const_with_all_bytes_same (TREE_REALPART (val))
	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
	    return 0;
	  break;
	case VECTOR_CST:
	  unsigned int j;
	  for (j = 0; j < VECTOR_CST_NELTS (val); ++j)
	    if (const_with_all_bytes_same (VECTOR_CST_ELT (val, j)))
	      break;
	  if (j == VECTOR_CST_NELTS (val))
	    return 0;
	  break;
	default:
	  break;
	}
    }

  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
    return -1;

  len = native_encode_expr (val, buf, sizeof (buf));
  if (len == 0)
    return -1;
  for (i = 1; i < len; i++)
    if (buf[i] != buf[0])
      return -1;
  return buf[0];
}
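
/* Illustrative example: const_with_all_bytes_same returns 0x15 for the
   constant 0x15151515 and -1 for 0x12345678, so a loop storing the
   former can become a memset with byte value 0x15, while one storing
   the latter cannot.  */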

/* Generate a call to memset for PARTITION in LOOP.  */

static void
generate_memset_builtin (struct loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *stmt, *fn_call;
  tree mem, fn, nb_bytes;
  location_t loc;
  tree val;

  stmt = DR_STMT (partition->main_dr);
  loc = gimple_location (stmt);

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
				 partition->plus_one);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  mem = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);

  /* This exactly matches the pattern recognition in classify_partition.  */
  val = gimple_assign_rhs1 (stmt);
  /* Handle constants like 0x15151515 and similarly
     floating point constants etc. where all bytes are the same.  */
  int bytev = const_with_all_bytes_same (val);
  if (bytev != -1)
    val = build_int_cst (integer_type_node, bytev);
  else if (TREE_CODE (val) == INTEGER_CST)
    val = fold_convert (integer_type_node, val);
  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
    {
      tree tem = make_ssa_name (integer_type_node);
      gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
      val = tem;
    }

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "generated memset");
      if (bytev == 0)
	fprintf (dump_file, " zero\n");
      else
	fprintf (dump_file, "\n");
    }
}

/* Generate a call to memcpy for PARTITION in LOOP.  */

static void
generate_memcpy_builtin (struct loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *stmt, *fn_call;
  tree dest, src, fn, nb_bytes;
  location_t loc;
  enum built_in_function kind;

  stmt = DR_STMT (partition->main_dr);
  loc = gimple_location (stmt);

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
				 partition->plus_one);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  dest = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
  src = build_addr_arg_loc (loc, partition->secondary_dr, nb_bytes);
  if (partition->kind == PKIND_MEMCPY
      || ! ptr_derefs_may_alias_p (dest, src))
    kind = BUILT_IN_MEMCPY;
  else
    kind = BUILT_IN_MEMMOVE;

  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
				   false, GSI_CONTINUE_LINKING);
  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);
  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (kind == BUILT_IN_MEMCPY)
	fprintf (dump_file, "generated memcpy\n");
      else
	fprintf (dump_file, "generated memmove\n");
    }
}

/* Remove and destroy the loop LOOP.  */

static void
destroy_loop (struct loop *loop)
{
  unsigned nbbs = loop->num_nodes;
  edge exit = single_exit (loop);
  basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
  basic_block *bbs;
  unsigned i;

  bbs = get_loop_body_in_dom_order (loop);

  redirect_edge_pred (exit, src);
  exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
  exit->flags |= EDGE_FALLTHRU;
  cancel_loop_tree (loop);
  rescan_loop_exit (exit, false, true);

  i = nbbs;
  do
    {
      /* We have made sure to not leave any dangling uses of SSA
	 names defined in the loop.  With the exception of virtuals.
	 Make sure we replace all uses of virtual defs that will remain
	 outside of the loop with the bare symbol as delete_basic_block
	 will release them.  */
      --i;
      for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    mark_virtual_phi_result_for_renaming (phi);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  tree vdef = gimple_vdef (stmt);
	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
	    mark_virtual_operand_for_renaming (vdef);
	}
      delete_basic_block (bbs[i]);
    }
  while (i != 0);

  free (bbs);

  set_immediate_dominator (CDI_DOMINATORS, dest,
			   recompute_dominator (CDI_DOMINATORS, dest));
}

/* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */

static bool
generate_code_for_partition (struct loop *loop,
			     partition *partition, bool copy_p)
{
  switch (partition->kind)
    {
    case PKIND_NORMAL:
      /* Reductions all have to be in the last partition.  */
      gcc_assert (!partition_reduction_p (partition)
		  || !copy_p);
      generate_loops_for_partition (loop, partition, copy_p);
      return false;

    case PKIND_MEMSET:
      generate_memset_builtin (loop, partition);
      break;

    case PKIND_MEMCPY:
    case PKIND_MEMMOVE:
      generate_memcpy_builtin (loop, partition);
      break;

    default:
      gcc_unreachable ();
    }

  /* Common tail for partitions we turn into a call.  If this was the last
     partition for which we generate code, we have to destroy the loop.  */
  if (!copy_p)
    return true;
  return false;
}

/* Return the data dependence relation for data references A and B.  The two
   data references must be in lexicographic order with respect to the reduced
   dependence graph RDG.  We first try to find the ddr in the global ddr hash
   table.  If it doesn't exist, compute the ddr and cache it.  */

static data_dependence_relation *
get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
{
  struct data_dependence_relation ent, **slot;
  struct data_dependence_relation *ddr;

  gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
  gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
  ent.a = a;
  ent.b = b;
  slot = ddrs_table.find_slot (&ent, INSERT);
  if (*slot == NULL)
    {
      ddr = initialize_data_dependence_relation (a, b, loop_nest);
      compute_affine_dependence (ddr, loop_nest[0]);
      *slot = ddr;
    }

  return *slot;
}

/* In the reduced dependence graph RDG for loop distribution, return true if
   the dependence between references DR1 and DR2 leads to a dependence cycle
   and such a dependence cycle can't be resolved by a runtime alias check.  */

static bool
data_dep_in_cycle_p (struct graph *rdg,
		     data_reference_p dr1, data_reference_p dr2)
{
  struct data_dependence_relation *ddr;

  /* Re-shuffle data-refs to be in topological order.  */
  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
    std::swap (dr1, dr2);

  ddr = get_data_dependence (rdg, dr1, dr2);

  /* In case of no data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;
  /* For an unknown data dependence, or a known data dependence which can't
     be expressed in a classic distance vector, we check if it can be
     resolved by a runtime alias check.  If yes, we still consider the data
     dependence as one that won't introduce a data dependence cycle.  */
  else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
	   || DDR_NUM_DIST_VECTS (ddr) == 0)
    return !runtime_alias_check_p (ddr, NULL, true);
  else if (DDR_NUM_DIST_VECTS (ddr) > 1)
    return true;
  else if (DDR_REVERSED_P (ddr)
	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
    return false;

  return true;
}
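
/* Illustrative example, using a hypothetical array a: a store to a[i]
   and a load from a[i] in the same iteration yield the single distance
   vector (0), so the function above returns false; a load from a[i-1]
   against a store to a[i] yields distance vector (1), for which it
   returns true, forcing PTYPE_SEQUENTIAL when both references end up in
   one partition (see update_type_for_merge below).  */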

/* Given the reduced dependence graph RDG and partitions PARTITION1 and
   PARTITION2, update PARTITION1's type after merging PARTITION2 into
   PARTITION1.  */

static void
update_type_for_merge (struct graph *rdg,
		       partition *partition1, partition *partition2)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2;

  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
    {
      unsigned start = (partition1 == partition2) ? i + 1 : 0;

      dr1 = datarefs_vec[i];
      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
	{
	  dr2 = datarefs_vec[j];
	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
	    continue;

	  /* The partition can only be executed sequentially if there is
	     any data dependence cycle.  */
	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
	    {
	      partition1->type = PTYPE_SEQUENTIAL;
	      return;
	    }
	}
    }
}

/* Returns a partition with all the statements needed for computing
   the vertex V of the RDG, also including the loop exit conditions.  */

static partition *
build_rdg_partition_for_vertex (struct graph *rdg, int v)
{
  partition *partition = partition_alloc ();
  auto_vec<int, 3> nodes;
  unsigned i, j;
  int x;
  data_reference_p dr;

  graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

  FOR_EACH_VEC_ELT (nodes, i, x)
    {
      bitmap_set_bit (partition->stmts, x);
      bitmap_set_bit (partition->loops,
		      loop_containing_stmt (RDG_STMT (rdg, x))->num);

      for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
	{
	  unsigned idx = (unsigned) DR_INDEX (dr);
	  gcc_assert (idx < datarefs_vec.length ());

	  /* The partition can only be executed sequentially if there is
	     any unanalyzable data reference.  */
	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
	      || !DR_INIT (dr) || !DR_STEP (dr))
	    partition->type = PTYPE_SEQUENTIAL;

	  bitmap_set_bit (partition->datarefs, idx);
	}
    }

  if (partition->type == PTYPE_SEQUENTIAL)
    return partition;

  /* Further check if any data dependence prevents us from executing the
     partition in parallel.  */
  update_type_for_merge (rdg, partition, partition);

  return partition;
}

/* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
   For the moment we detect memset, memcpy and memmove patterns.  Bitmap
   STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.  */

static void
classify_partition (loop_p loop, struct graph *rdg, partition *partition,
		    bitmap stmt_in_all_partitions)
{
  bitmap_iterator bi;
  unsigned i;
  tree nb_iter;
  data_reference_p single_load, single_store;
  bool volatiles_p = false, plus_one = false, has_reduction = false;

  partition->kind = PKIND_NORMAL;
  partition->main_dr = NULL;
  partition->secondary_dr = NULL;
  partition->niter = NULL_TREE;
  partition->plus_one = false;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);

      if (gimple_has_volatile_ops (stmt))
	volatiles_p = true;

      /* If the stmt is not included in all partitions and it has uses
	 outside of the loop, then mark the partition as a reduction.  */
      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
	{
	  /* Due to a limitation in the transform phase we have to fuse all
	     reduction partitions.  As a result, this could cancel valid
	     loop distribution, especially for loops whose induction
	     variable is used outside of the loop.  To work around this
	     issue, we skip marking the partition as a reduction if the
	     reduction stmt belongs to all partitions.  In that case the
	     reduction will be computed correctly no matter how the
	     partitions are fused/distributed.  */
	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
	    {
	      partition->reduction_p = true;
	      return;
	    }
	  has_reduction = true;
	}
    }

  /* Perform general partition disqualification for builtins.  */
  if (volatiles_p
      /* Simple workaround to prevent classifying the partition as builtin
	 if it contains any use outside of the loop.  */
      || has_reduction
      || !flag_tree_loop_distribute_patterns)
    return;

  /* Detect memset and memcpy.  */
  single_load = NULL;
  single_store = NULL;
  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      data_reference_p dr;
      unsigned j;

      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      /* Any scalar stmts are ok.  */
      if (!gimple_vuse (stmt))
	continue;

      /* Otherwise just regular loads/stores.  */
      if (!gimple_assign_single_p (stmt))
	return;

      /* But exactly one store and/or load.  */
      for (j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
	{
	  tree type = TREE_TYPE (DR_REF (dr));

	  /* The memset, memcpy and memmove library calls are only
	     able to deal with the generic address space.  */
	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
	    return;

	  if (DR_IS_READ (dr))
	    {
	      if (single_load != NULL)
		return;
	      single_load = dr;
	    }
	  else
	    {
	      if (single_store != NULL)
		return;
	      single_store = dr;
	    }
	}
    }

  if (!single_store)
    return;

  nb_iter = number_of_latch_executions (loop);
  if (!nb_iter || nb_iter == chrec_dont_know)
    return;
  if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
		      gimple_bb (DR_STMT (single_store))))
    plus_one = true;

  if (single_store && !single_load)
    {
      gimple *stmt = DR_STMT (single_store);
      tree rhs = gimple_assign_rhs1 (stmt);
      if (const_with_all_bytes_same (rhs) == -1
	  && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
	      || (TYPE_MODE (TREE_TYPE (rhs))
		  != TYPE_MODE (unsigned_char_type_node))))
	return;
      if (TREE_CODE (rhs) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
	  && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
	return;
      if (!adjacent_dr_p (single_store)
	  || !dominated_by_p (CDI_DOMINATORS,
			      loop->latch, gimple_bb (stmt)))
	return;
      partition->kind = PKIND_MEMSET;
      partition->main_dr = single_store;
      partition->niter = nb_iter;
      partition->plus_one = plus_one;
    }
  else if (single_store && single_load)
    {
      gimple *store = DR_STMT (single_store);
      gimple *load = DR_STMT (single_load);
      /* Direct aggregate copy or via an SSA name temporary.  */
      if (load != store
	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
	return;
      if (!adjacent_dr_p (single_store)
	  || !adjacent_dr_p (single_load)
	  || !operand_equal_p (DR_STEP (single_store),
			       DR_STEP (single_load), 0)
	  || !dominated_by_p (CDI_DOMINATORS,
			      loop->latch, gimple_bb (store)))
	return;
      /* Now check that if there is a dependence this dependence is
	 of a suitable form for memmove.  */
      ddr_p ddr = get_data_dependence (rdg, single_load, single_store);
      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	return;

      if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
	{
	  if (DDR_NUM_DIST_VECTS (ddr) == 0)
	    return;

	  lambda_vector dist_v;
	  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
	    {
	      int dist = dist_v[index_in_loop_nest (loop->num,
						    DDR_LOOP_NEST (ddr))];
	      if (dist > 0 && !DDR_REVERSED_P (ddr))
		return;
	    }
	  partition->kind = PKIND_MEMMOVE;
	}
      else
	partition->kind = PKIND_MEMCPY;
      partition->main_dr = single_store;
      partition->secondary_dr = single_load;
      partition->niter = nb_iter;
      partition->plus_one = plus_one;
    }
}

/* Returns true when PARTITION1 and PARTITION2 access the same memory
   object in RDG.  */

static bool
share_memory_accesses (struct graph *rdg,
		       partition *partition1, partition *partition2)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2;

  /* First check whether in the intersection of the two partitions there
     are any loads or stores.  Common loads are the situation that happens
     most often.  */
  EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
    if (RDG_MEM_WRITE_STMT (rdg, i)
	|| RDG_MEM_READS_STMT (rdg, i))
      return true;

  /* Then check whether the two partitions access the same memory object.  */
  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
    {
      dr1 = datarefs_vec[i];

      if (!DR_BASE_ADDRESS (dr1)
	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
	continue;

      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
	{
	  dr2 = datarefs_vec[j];

	  if (!DR_BASE_ADDRESS (dr2)
	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
	    continue;

	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
	    return true;
	}
    }

  return false;
}

/* For each seed statement in STARTING_STMTS, this function builds a
   partition for it by adding dependent statements according to RDG.
   All partitions are recorded in PARTITIONS.  */

static void
rdg_build_partitions (struct graph *rdg,
		      vec<gimple *> starting_stmts,
		      vec<partition *> *partitions)
{
  auto_bitmap processed;
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
    {
      int v = rdg_vertex_for_stmt (rdg, stmt);

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "ldist asked to generate code for vertex %d\n", v);

      /* If the vertex is already contained in another partition so
	 is the partition rooted at it.  */
      if (bitmap_bit_p (processed, v))
	continue;

      partition *partition = build_rdg_partition_for_vertex (rdg, v);
      bitmap_ior_into (processed, partition->stmts);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "ldist creates useful %s partition:\n",
		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
	}

      partitions->safe_push (partition);
    }

  /* All vertices should have been assigned to at least one partition now,
     other than vertices belonging to dead code.  */
}

/* Dump to FILE the PARTITIONS.  */

static void
dump_rdg_partitions (FILE *file, vec<partition *> partitions)
{
  int i;
  partition *partition;

  FOR_EACH_VEC_ELT (partitions, i, partition)
    debug_bitmap_file (file, partition->stmts);
}

/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (vec<partition *> );

DEBUG_FUNCTION void
debug_rdg_partitions (vec<partition *> partitions)
{
  dump_rdg_partitions (stderr, partitions);
}

/* Returns the number of read and write operations in the RDG.  */

static int
number_of_rw_in_rdg (struct graph *rdg)
{
  int i, res = 0;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
	++res;

      if (RDG_MEM_READS_STMT (rdg, i))
	++res;
    }

  return res;
}

/* Returns the number of read and write operations in a PARTITION of
   the RDG.  */

static int
number_of_rw_in_partition (struct graph *rdg, partition *partition)
{
  int res = 0;
  unsigned i;
  bitmap_iterator ii;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
	++res;

      if (RDG_MEM_READS_STMT (rdg, i))
	++res;
    }

  return res;
}

/* Returns true when one of the PARTITIONS contains all the read or
   write operations of RDG.  */

static bool
partition_contains_all_rw (struct graph *rdg,
			   vec<partition *> partitions)
{
  int i;
  partition *partition;
  int nrw = number_of_rw_in_rdg (rdg);

  FOR_EACH_VEC_ELT (partitions, i, partition)
    if (nrw == number_of_rw_in_partition (rdg, partition))
      return true;

  return false;
}

/* Compute the partition dependence created by the data references in DRS1
   and DRS2, modify and return DIR according to that.  If ALIAS_DDRS is
   not NULL, we record dependences introduced by possible aliasing between
   two data references in ALIAS_DDRS; otherwise, we simply ignore such
   dependences as if they didn't exist at all.  */

static int
pg_add_dependence_edges (struct graph *rdg, int dir,
			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2, saved_dr1;

  /* dependence direction - 0 is no dependence, -1 is back,
     1 is forth, 2 is both (we can stop then, merging will occur).  */
  EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
    {
      dr1 = datarefs_vec[i];

      EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
	{
	  int res, this_dir = 1;
	  ddr_p ddr;

	  dr2 = datarefs_vec[j];

	  /* Skip all <read, read> data dependence.  */
	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
	    continue;

	  saved_dr1 = dr1;
	  /* Re-shuffle data-refs to be in topological order.  */
	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
	    {
	      std::swap (dr1, dr2);
	      this_dir = -this_dir;
	    }
	  ddr = get_data_dependence (rdg, dr1, dr2);
	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	    {
	      this_dir = 0;
	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
					   DR_BASE_ADDRESS (dr2));
	      /* Be conservative.  If the data references are not well
		 analyzed, or the two data references have the same base
		 address and offset, add a dependence and consider them
		 aliases of each other.  In other words, the dependence
		 cannot be resolved by a runtime alias check.  */
	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
		  || !DR_INIT (dr1) || !DR_INIT (dr2)
		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
		  || res == 0)
		this_dir = 2;
	      /* The data dependence could be resolved by a runtime alias
		 check; record it in ALIAS_DDRS.  */
	      else if (alias_ddrs != NULL)
		alias_ddrs->safe_push (ddr);
	      /* Or simply ignore it.  */
	    }
	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
	    {
	      if (DDR_REVERSED_P (ddr))
		this_dir = -this_dir;

	      /* Known dependences can still be unordered throughout the
		 iteration space, see gcc.dg/tree-ssa/ldist-16.c.  */
	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
		this_dir = 2;
	      /* If the overlap is exact preserve stmt order.  */
	      else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
		;
	      /* Else as the distance vector is lexicographic positive swap
		 the dependence direction.  */
	      else
		this_dir = -this_dir;
	    }
	  else
	    this_dir = 0;
	  if (this_dir == 2)
	    return 2;
	  else if (dir == 0)
	    dir = this_dir;
	  else if (this_dir != 0 && dir != this_dir)
	    return 2;
	  /* Shuffle "back" dr1.  */
	  dr1 = saved_dr1;
	}
    }
  return dir;
}
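
/* Illustrative note: the direction values above combine like a small
   lattice: combine (0, d) = d and combine (d, d) = d, while any conflict
   such as combine (1, -1), or an individual result of 2, collapses to 2,
   which tells the caller the partitions must be fused.  */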

/* Compare postorder number of the partition graph vertices V1 and V2.  */

static int
pgcmp (const void *v1_, const void *v2_)
{
  const vertex *v1 = (const vertex *)v1_;
  const vertex *v2 = (const vertex *)v2_;
  return v2->post - v1->post;
}

/* Data attached to vertices of the partition dependence graph.  */
struct pg_vdata
{
  /* ID of the corresponding partition.  */
  int id;
  /* The partition.  */
  struct partition *partition;
};

/* Data attached to edges of the partition dependence graph.  */
struct pg_edata
{
  /* If the dependence edge can be resolved by a runtime alias check,
     this vector contains the data dependence relations for the runtime
     alias check.  On the other hand, if the dependence edge is introduced
     because of a compilation time known data dependence, this vector
     contains nothing.  */
  vec<ddr_p> alias_ddrs;
};

/* Callback data for traversing edges in the graph.  */
struct pg_edge_callback_data
{
  /* Bitmap of the strongly connected components that should be merged.  */
  bitmap sccs_to_merge;
  /* Array containing component information for all vertices.  */
  int *vertices_component;
  /* Vector to record all data dependence relations which are needed
     to break strongly connected components by runtime alias checks.  */
  vec<ddr_p> *alias_ddrs;
};

/* Initialize the vertices' data for the partition dependence graph PG with
   PARTITIONS.  */

static void
init_partition_graph_vertices (struct graph *pg,
			       vec<struct partition *> *partitions)
{
  int i;
  partition *partition;
  struct pg_vdata *data;

  for (i = 0; partitions->iterate (i, &partition); ++i)
    {
      data = new pg_vdata;
      pg->vertices[i].data = data;
      data->id = i;
      data->partition = partition;
    }
}

/* Add edge <I, J> to the partition dependence graph PG.  Attach a vector
   of data dependence relations to the edge if DDRS isn't NULL.  */

static void
add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
{
  struct graph_edge *e = add_edge (pg, i, j);

  /* If the edge has data dependence relations attached, it means this
     dependence edge can be resolved by runtime alias checks.  */
  if (ddrs != NULL)
    {
      struct pg_edata *data = new pg_edata;

      gcc_assert (ddrs->length () > 0);
      e->data = data;
      data->alias_ddrs = vNULL;
      data->alias_ddrs.safe_splice (*ddrs);
    }
}

/* Callback function for the graph traversal algorithm.  It returns true
   if edge E should be skipped when traversing the graph.  */

static bool
pg_skip_alias_edge (struct graph_edge *e)
{
  struct pg_edata *data = (struct pg_edata *)e->data;
  return (data != NULL && data->alias_ddrs.length () > 0);
}

/* Callback function freeing the data attached to edge E of the graph.  */

static void
free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
{
  if (e->data != NULL)
    {
      struct pg_edata *data = (struct pg_edata *)e->data;
      data->alias_ddrs.release ();
      delete data;
    }
}

/* Free the data attached to the vertices of the partition dependence
   graph PG.  */

static void
free_partition_graph_vdata (struct graph *pg)
{
  int i;
  struct pg_vdata *data;

  for (i = 0; i < pg->n_vertices; ++i)
    {
      data = (struct pg_vdata *)pg->vertices[i].data;
      delete data;
    }
}

/* Build and return the partition dependence graph for PARTITIONS.  RDG is
   the reduced dependence graph for the loop to be distributed.  If
   IGNORE_ALIAS_P is true, data dependences caused by possible aliasing
   between references are ignored, as if they didn't exist at all;
   otherwise all dependences are considered.  */

static struct graph *
build_partition_graph (struct graph *rdg,
		       vec<struct partition *> *partitions,
		       bool ignore_alias_p)
{
  int i, j;
  struct partition *partition1, *partition2;
  graph *pg = new_graph (partitions->length ());
  auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;

  alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;

  init_partition_graph_vertices (pg, partitions);

  for (i = 0; partitions->iterate (i, &partition1); ++i)
    {
      for (j = i + 1; partitions->iterate (j, &partition2); ++j)
	{
	  /* dependence direction - 0 is no dependence, -1 is back,
	     1 is forth, 2 is both (we can stop then, merging will occur).  */
	  int dir = 0;

	  /* If the first partition has a reduction, add a back edge; if the
	     second partition has a reduction, add a forth edge.  This makes
	     sure that the reduction partition will be sorted as the last
	     one.  */
	  if (partition_reduction_p (partition1))
	    dir = -1;
	  else if (partition_reduction_p (partition2))
	    dir = 1;

	  /* Cleanup the temporary vector.  */
	  alias_ddrs.truncate (0);

	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
					 partition2->datarefs, alias_ddrs_p);

	  /* Add an edge to the partition graph if there exists a dependence.
	     There are two types of edges.  One type is caused by a
	     compilation time known dependence; this type cannot be resolved
	     by a runtime alias check.  The other type can be resolved by a
	     runtime alias check.  */
	  if (dir == 1 || dir == 2
	      || alias_ddrs.length () > 0)
	    {
	      /* Attach data dependence relations to the edge that can be
		 resolved by a runtime alias check.  */
	      bool alias_edge_p = (dir != 1 && dir != 2);
	      add_partition_graph_edge (pg, i, j,
					(alias_edge_p) ? &alias_ddrs : NULL);
	    }
	  if (dir == -1 || dir == 2
	      || alias_ddrs.length () > 0)
	    {
	      /* Attach data dependence relations to the edge that can be
		 resolved by a runtime alias check.  */
	      bool alias_edge_p = (dir != -1 && dir != 2);
	      add_partition_graph_edge (pg, j, i,
					(alias_edge_p) ? &alias_ddrs : NULL);
	    }
	}
    }
  return pg;
}

/* Sort partitions in PG by post order and store them in PARTITIONS.  */

static void
sort_partitions_by_post_order (struct graph *pg,
			       vec<struct partition *> *partitions)
{
  int i;
  struct pg_vdata *data;

  /* Now order the remaining nodes in postorder.  */
  qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
  partitions->truncate (0);
  for (i = 0; i < pg->n_vertices; ++i)
    {
      data = (struct pg_vdata *)pg->vertices[i].data;
      if (data->partition)
	partitions->safe_push (data->partition);
    }
}
1964 /* Given reduced dependence graph RDG, merge strong connected components
1965 of PARTITIONS. If IGNORE_ALIAS_P is true, data dependences caused by
1966 possible aliasing between references are ignored, as if they don't
1967 exist at all; otherwise all dependences are considered. */
1969 static void
1970 merge_dep_scc_partitions (struct graph *rdg,
1971 vec<struct partition *> *partitions,
1972 bool ignore_alias_p)
1974 struct partition *partition1, *partition2;
1975 struct pg_vdata *data;
1976 graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
1977 int i, j, num_sccs = graphds_scc (pg, NULL);
1979 /* A strong connected component means a dependence cycle, so we cannot
1980 distribute its partitions. Fuse them together instead. */
1981 if ((unsigned) num_sccs < partitions->length ())
1983 for (i = 0; i < num_sccs; ++i)
1985 for (j = 0; partitions->iterate (j, &partition1); ++j)
1986 if (pg->vertices[j].component == i)
1987 break;
1988 for (j = j + 1; partitions->iterate (j, &partition2); ++j)
1989 if (pg->vertices[j].component == i)
1991 partition_merge_into (NULL, partition1,
1992 partition2, FUSE_SAME_SCC);
1993 partition1->type = PTYPE_SEQUENTIAL;
1994 (*partitions)[j] = NULL;
1995 partition_free (partition2);
1996 data = (struct pg_vdata *)pg->vertices[j].data;
1997 data->partition = NULL;
2000 sort_partitions_by_post_order (pg, partitions);
2002 gcc_assert (partitions->length () == (unsigned)num_sccs);
2003 free_partition_graph_vdata (pg);
2004 free_graph (pg);
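
/* As a sketch of the common case: if partitions P0 and P1 depend on
   each other through compilation time known dependences, graphds_scc
   puts them in one SCC; the loop above then merges P1 into P0, marks
   the merged partition PTYPE_SEQUENTIAL, and only P0 survives in
   PARTITIONS after sorting.  */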
2007 /* Callback function for traversing edge E in graph G. DATA is private
2008 callback data. */
2010 static void
2011 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2013 int i, j, component;
2014 struct pg_edge_callback_data *cbdata;
2015 struct pg_edata *edata = (struct pg_edata *) e->data;
2017 /* If the edge doesn't have attached data dependences, it represents
2018 compilation time known dependences. This type of dependence cannot
2019 be resolved by a runtime alias check. */
2020 if (edata == NULL || edata->alias_ddrs.length () == 0)
2021 return;
2023 cbdata = (struct pg_edge_callback_data *) data;
2024 i = e->src;
2025 j = e->dest;
2026 component = cbdata->vertices_component[i];
2027 /* Vertices are topologically sorted according to compilation time
2028 known dependences, so we can break strong connected components
2029 by removing edges of the opposite direction, i.e., edges pointing
2030 from a vertex with a smaller post number to a vertex with a bigger
2031 post number. */
2032 if (g->vertices[i].post < g->vertices[j].post
2033 /* We only need to remove edges connecting vertices in the same
2034 strong connected component to break it. */
2035 && component == cbdata->vertices_component[j]
2036 /* Check if we want to break the strong connected component or not. */
2037 && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2038 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2041 /* This is the main function breaking strong connected components in
2042 PARTITIONS, given reduced dependence graph RDG. Store data dependence
2043 relations for the runtime alias check in ALIAS_DDRS. */
2045 static void
2046 break_alias_scc_partitions (struct graph *rdg,
2047 vec<struct partition *> *partitions,
2048 vec<ddr_p> *alias_ddrs)
2050 int i, j, num_sccs, num_sccs_no_alias;
2051 /* Build partition dependence graph. */
2052 graph *pg = build_partition_graph (rdg, partitions, false);
2054 alias_ddrs->truncate (0);
2055 /* Find strong connected components in the graph, with all dependence edges
2056 considered. */
2057 num_sccs = graphds_scc (pg, NULL);
2058 /* All SCCs can now be broken by runtime alias checks because SCCs caused
2059 by compilation time known dependences were merged before this function. */
2060 if ((unsigned) num_sccs < partitions->length ())
2062 struct pg_edge_callback_data cbdata;
2063 auto_bitmap sccs_to_merge;
2064 auto_vec<enum partition_type> scc_types;
2065 struct partition *partition, *first;
2067 /* If all partitions in an SCC have the same type, we can simply merge
2068 the SCC. This loop finds such SCCs and records them in a bitmap. */
2069 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2070 for (i = 0; i < num_sccs; ++i)
2072 for (j = 0; partitions->iterate (j, &first); ++j)
2073 if (pg->vertices[j].component == i)
2074 break;
2075 for (++j; partitions->iterate (j, &partition); ++j)
2077 if (pg->vertices[j].component != i)
2078 continue;
2080 if (first->type != partition->type)
2082 bitmap_clear_bit (sccs_to_merge, i);
2083 break;
2088 /* Initialize callback data for traversing. */
2089 cbdata.sccs_to_merge = sccs_to_merge;
2090 cbdata.alias_ddrs = alias_ddrs;
2091 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2092 /* Record the component information, which will be corrupted by the
2093 next graph SCC finding call. */
2094 for (i = 0; i < pg->n_vertices; ++i)
2095 cbdata.vertices_component[i] = pg->vertices[i].component;
2097 /* Collect data dependences for runtime alias checks to break SCCs. */
2098 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2100 /* Run the SCC finding algorithm again, with alias dependence edges
2101 skipped. This is to topologically sort partitions according to
2102 compilation time known dependences. Note the topological order
2103 is stored in the form of pg's post order number. */
2104 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2105 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2106 /* With topological order, we can construct two subgraphs L and R.
2107 L contains edge <x, y> where x < y in terms of post order, while
2108 R contains edge <x, y> where x > y. Edges for compilation time
2109 known dependences all fall in R, so we break SCCs by removing all
2110 (alias) edges in subgraph L. */
2111 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2114 /* Merge each SCC that doesn't need to be broken. */
2115 for (i = 0; i < num_sccs; ++i)
2117 if (!bitmap_bit_p (sccs_to_merge, i))
2118 continue;
2120 for (j = 0; partitions->iterate (j, &first); ++j)
2121 if (cbdata.vertices_component[j] == i)
2122 break;
2123 for (++j; partitions->iterate (j, &partition); ++j)
2125 struct pg_vdata *data;
2127 if (cbdata.vertices_component[j] != i)
2128 continue;
2130 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2131 (*partitions)[j] = NULL;
2132 partition_free (partition);
2133 data = (struct pg_vdata *)pg->vertices[j].data;
2134 gcc_assert (data->id == j);
2135 data->partition = NULL;
2140 sort_partitions_by_post_order (pg, partitions);
2141 free_partition_graph_vdata (pg);
2142 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2143 free_graph (pg);
2145 if (dump_file && (dump_flags & TDF_DETAILS))
2147 fprintf (dump_file, "Possible alias data dependence to break:\n");
2148 dump_data_dependence_relations (dump_file, *alias_ddrs);
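
/* An illustrative sketch: suppose the only "true" edge is P0 -> P1
   while P1 -> P0 is an "alias" edge, so the two partitions form an SCC
   that was not fused earlier.  The second SCC pass (alias edges
   skipped) gives P1 a smaller post order number than P0, hence the
   edge P1 -> P0 lies in subgraph L; its data dependence relations are
   collected into ALIAS_DDRS and the SCC is broken, pending the runtime
   alias check.  */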
2152 /* Compute and return an expression whose value is the segment length which
2153 will be accessed by DR in NITERS iterations. */
2155 static tree
2156 data_ref_segment_size (struct data_reference *dr, tree niters)
2158 tree segment_length;
2160 if (integer_zerop (DR_STEP (dr)))
2161 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2162 else
2163 segment_length = size_binop (MULT_EXPR,
2164 fold_convert (sizetype, DR_STEP (dr)),
2165 fold_convert (sizetype, niters));
2167 return segment_length;
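
/* For example, for a data reference with DR_STEP 4 (say, a stride-one
   walk over an int array) and NITERS 100, the segment length is

   |(sizetype) 4 * (sizetype) 100 == 400 bytes

   while a loop-invariant reference (DR_STEP == 0) simply gets the size
   of the referenced type.  (Values are illustrative.)  */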
2170 /* Return true if LOOP's latch is dominated by the statement for data
2171 reference DR. */
2173 static inline bool
2174 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2176 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2177 gimple_bb (DR_STMT (dr)));
2180 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2181 data dependence relations ALIAS_DDRS. */
2183 static void
2184 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2185 vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2187 unsigned int i;
2188 unsigned HOST_WIDE_INT factor = 1;
2189 tree niters_plus_one, niters = number_of_latch_executions (loop);
2191 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2192 niters = fold_convert (sizetype, niters);
2193 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2195 if (dump_file && (dump_flags & TDF_DETAILS))
2196 fprintf (dump_file, "Creating alias check pairs:\n");
2198 /* Iterate all data dependence relations and compute alias check pairs. */
2199 for (i = 0; i < alias_ddrs->length (); i++)
2201 ddr_p ddr = (*alias_ddrs)[i];
2202 struct data_reference *dr_a = DDR_A (ddr);
2203 struct data_reference *dr_b = DDR_B (ddr);
2204 tree seg_length_a, seg_length_b;
2205 int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2206 DR_BASE_ADDRESS (dr_b));
2208 if (comp_res == 0)
2209 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2210 gcc_assert (comp_res != 0);
2212 if (latch_dominated_by_data_ref (loop, dr_a))
2213 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2214 else
2215 seg_length_a = data_ref_segment_size (dr_a, niters);
2217 if (latch_dominated_by_data_ref (loop, dr_b))
2218 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2219 else
2220 seg_length_b = data_ref_segment_size (dr_b, niters);
2222 dr_with_seg_len_pair_t dr_with_seg_len_pair
2223 (dr_with_seg_len (dr_a, seg_length_a),
2224 dr_with_seg_len (dr_b, seg_length_b));
2226 /* Canonicalize pairs by sorting the two DR members. */
2227 if (comp_res > 0)
2228 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2230 comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2233 if (tree_fits_uhwi_p (niters))
2234 factor = tree_to_uhwi (niters);
2236 /* Prune alias check pairs. */
2237 prune_runtime_alias_test_list (comp_alias_pairs, factor);
2238 if (dump_file && (dump_flags & TDF_DETAILS))
2239 fprintf (dump_file,
2240 "Improved number of alias checks from %d to %d\n",
2241 alias_ddrs->length (), comp_alias_pairs->length ());
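
/* As an illustration with hypothetical values: two references with
   DR_STEP 4 in a loop with NITERS latch executions get segments
   [base_a, base_a + 4 * NITERS) and [base_b, base_b + 4 * NITERS)
   (roughly, one extra iteration for a reference that also executes in
   the iteration that exits); the runtime check generated later simply
   verifies that the two segments do not overlap.  */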
2244 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2245 checks and version LOOP under condition of these runtime alias checks. */
2247 static void
2248 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2250 profile_probability prob;
2251 basic_block cond_bb;
2252 struct loop *nloop;
2253 tree lhs, arg0, cond_expr = NULL_TREE;
2254 gimple_seq cond_stmts = NULL;
2255 gimple *call_stmt = NULL;
2256 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2258 /* Generate code for runtime alias checks if necessary. */
2259 gcc_assert (alias_ddrs->length () > 0);
2261 if (dump_file && (dump_flags & TDF_DETAILS))
2262 fprintf (dump_file,
2263 "Version loop <%d> with runtime alias check\n", loop->num);
2265 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2266 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2267 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2268 is_gimple_condexpr, NULL_TREE);
2270 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2271 if (flag_tree_loop_vectorize)
2273 /* Generate internal function call for loop distribution alias check. */
2274 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2275 2, NULL_TREE, cond_expr);
2276 lhs = make_ssa_name (boolean_type_node);
2277 gimple_call_set_lhs (call_stmt, lhs);
2279 else
2280 lhs = cond_expr;
2282 prob = profile_probability::guessed_always ().apply_scale (9, 10);
2283 initialize_original_copy_tables ();
2284 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2285 prob, prob.invert (), true);
2286 free_original_copy_tables ();
2287 /* Record the original loop number in newly generated loops. In case of
2288 distribution, the original loop will be distributed and the new loop
2289 is kept. */
2290 loop->orig_loop_num = nloop->num;
2291 nloop->orig_loop_num = nloop->num;
2292 nloop->dont_vectorize = true;
2293 nloop->force_vectorize = false;
2295 if (call_stmt)
2297 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2298 loop could be destroyed. */
2299 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2300 gimple_call_set_arg (call_stmt, 0, arg0);
2301 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2304 if (cond_stmts)
2306 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2307 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2309 update_ssa (TODO_update_ssa);
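
/* A sketch of the versioned code, with COND standing for the generated
   alias-check condition (names illustrative; the internal function is
   later folded by the vectorizer):

   |if (.LOOP_DIST_ALIAS (nloop->num, COND))
   |  loop;   // the version that will be distributed
   |else
   |  nloop;  // fallback copy with dont_vectorize set

   Without -ftree-loop-vectorize, COND is used directly as the version
   condition instead of the internal function call.  */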
2312 /* Return true if loop versioning is needed to distribute PARTITIONS.
2313 ALIAS_DDRS are data dependence relations for the runtime alias check. */
2315 static inline bool
2316 version_for_distribution_p (vec<struct partition *> *partitions,
2317 vec<ddr_p> *alias_ddrs)
2319 /* No need to version loop if we have only one partition. */
2320 if (partitions->length () == 1)
2321 return false;
2323 /* Need to version loop if runtime alias check is necessary. */
2324 return (alias_ddrs->length () > 0);
2327 /* Fuse all partitions if necessary before finalizing distribution. */
2329 static void
2330 finalize_partitions (vec<struct partition *> *partitions,
2331 vec<ddr_p> *alias_ddrs)
2333 unsigned i;
2334 struct partition *a, *partition;
2336 if (partitions->length () == 1
2337 || alias_ddrs->length () > 0)
2338 return;
2340 a = (*partitions)[0];
2341 if (a->kind != PKIND_NORMAL)
2342 return;
2344 for (i = 1; partitions->iterate (i, &partition); ++i)
2346 /* Don't fuse if a partition has a different type or is a builtin. */
2347 if (partition->type != a->type
2348 || partition->kind != PKIND_NORMAL)
2349 return;
2352 /* Fuse all partitions. */
2353 for (i = 1; partitions->iterate (i, &partition); ++i)
2355 partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2356 partition_free (partition);
2358 partitions->truncate (1);
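
/* E.g., if classification leaves three PKIND_NORMAL partitions of the
   same type and no runtime alias checks are pending, they are all fused
   back into one partition here: distributing same-type, non-builtin
   partitions would neither expose builtins nor separate parallel from
   sequential code.  (Sketch of the intent.)  */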
2361 /* Distributes the code from LOOP in such a way that producer statements
2362 are placed before consumer statements. Tries to separate only the
2363 statements from STMTS into separate loops. Returns the number of
2364 distributed loops. Set *NB_CALLS to the number of generated builtin calls.
2365 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2367 static int
2368 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2369 control_dependences *cd, int *nb_calls, bool *destroy_p)
2371 struct graph *rdg;
2372 partition *partition;
2373 bool any_builtin;
2374 int i, nbp;
2376 *destroy_p = false;
2377 *nb_calls = 0;
2378 loop_nest.create (0);
2379 if (!find_loop_nest (loop, &loop_nest))
2381 loop_nest.release ();
2382 return 0;
2385 datarefs_vec.create (20);
2386 rdg = build_rdg (loop, cd);
2387 if (!rdg)
2389 if (dump_file && (dump_flags & TDF_DETAILS))
2390 fprintf (dump_file,
2391 "Loop %d not distributed: failed to build the RDG.\n",
2392 loop->num);
2394 loop_nest.release ();
2395 free_data_refs (datarefs_vec);
2396 return 0;
2399 if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2401 if (dump_file && (dump_flags & TDF_DETAILS))
2402 fprintf (dump_file,
2403 "Loop %d not distributed: too many memory references.\n",
2404 loop->num);
2406 free_rdg (rdg);
2407 loop_nest.release ();
2408 free_data_refs (datarefs_vec);
2409 return 0;
2412 data_reference_p dref;
2413 for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2414 dref->aux = (void *) (uintptr_t) i;
2416 if (dump_file && (dump_flags & TDF_DETAILS))
2417 dump_rdg (dump_file, rdg);
2419 auto_vec<struct partition *, 3> partitions;
2420 rdg_build_partitions (rdg, stmts, &partitions);
2422 /* Can't do runtime alias check if loop niter is unknown. */
2423 tree niters = number_of_latch_executions (loop);
2424 bool rt_alias_check_p = (niters != NULL_TREE && niters != chrec_dont_know);
2425 auto_vec<ddr_p> alias_ddrs;
2427 auto_bitmap stmt_in_all_partitions;
2428 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2429 for (i = 1; partitions.iterate (i, &partition); ++i)
2430 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2432 any_builtin = false;
2433 FOR_EACH_VEC_ELT (partitions, i, partition)
2435 classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2436 any_builtin |= partition_builtin_p (partition);
2439 /* If we are only distributing patterns but did not detect any,
2440 simply bail out. */
2441 if (!flag_tree_loop_distribution
2442 && !any_builtin)
2444 nbp = 0;
2445 goto ldist_done;
2448 /* If we are only distributing patterns, fuse all partitions that
2449 were not classified as builtins. This also avoids chopping
2450 a loop into pieces, separated by builtin calls. That is, we
2451 want at most a single loop body remaining. */
2452 struct partition *into;
2453 if (!flag_tree_loop_distribution)
2455 for (i = 0; partitions.iterate (i, &into); ++i)
2456 if (!partition_builtin_p (into))
2457 break;
2458 for (++i; partitions.iterate (i, &partition); ++i)
2459 if (!partition_builtin_p (partition))
2461 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2462 partitions.unordered_remove (i);
2463 partition_free (partition);
2464 i--;
2468 /* Due to limitations in the transform phase we have to fuse all
2469 reduction partitions into the last partition so the existing
2470 loop will contain all loop-closed PHI nodes. */
2471 for (i = 0; partitions.iterate (i, &into); ++i)
2472 if (partition_reduction_p (into))
2473 break;
2474 for (i = i + 1; partitions.iterate (i, &partition); ++i)
2475 if (partition_reduction_p (partition))
2477 partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2478 partitions.unordered_remove (i);
2479 partition_free (partition);
2480 i--;
2483 /* Apply our simple cost model - fuse partitions with similar
2484 memory accesses. */
2485 for (i = 0; partitions.iterate (i, &into); ++i)
2487 bool changed = false;
2488 if (partition_builtin_p (into))
2489 continue;
2490 for (int j = i + 1;
2491 partitions.iterate (j, &partition); ++j)
2493 if (share_memory_accesses (rdg, into, partition))
2495 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2496 partitions.unordered_remove (j);
2497 partition_free (partition);
2498 j--;
2499 changed = true;
2502 /* If in step 1 we fused 0 1 2 into 0,2 1 because 0 and 2 have similar
2503 accesses, and 1 and 2 have similar accesses but 0 and 1 do not,
2504 then in the next iteration we would fail to consider merging
2505 1 into 0,2. So try again if we did any merging into 0. */
2506 if (changed)
2507 i--;
2510 /* Build the partition dependence graph. */
2511 if (partitions.length () > 1)
2513 merge_dep_scc_partitions (rdg, &partitions, rt_alias_check_p);
2514 alias_ddrs.truncate (0);
2515 if (rt_alias_check_p && partitions.length () > 1)
2516 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2519 finalize_partitions (&partitions, &alias_ddrs);
2521 nbp = partitions.length ();
2522 if (nbp == 0
2523 || (nbp == 1 && !partition_builtin_p (partitions[0]))
2524 || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2526 nbp = 0;
2527 goto ldist_done;
2530 if (version_for_distribution_p (&partitions, &alias_ddrs))
2531 version_loop_by_alias_check (loop, &alias_ddrs);
2533 if (dump_file && (dump_flags & TDF_DETAILS))
2535 fprintf (dump_file,
2536 "distribute loop <%d> into partitions:\n", loop->num);
2537 dump_rdg_partitions (dump_file, partitions);
2540 FOR_EACH_VEC_ELT (partitions, i, partition)
2542 if (partition_builtin_p (partition))
2543 (*nb_calls)++;
2544 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2547 ldist_done:
2548 loop_nest.release ();
2549 free_data_refs (datarefs_vec);
2550 for (hash_table<ddr_hasher>::iterator iter = ddrs_table.begin ();
2551 iter != ddrs_table.end (); ++iter)
2553 free_dependence_relation (*iter);
2554 *iter = NULL;
2556 ddrs_table.empty ();
2558 FOR_EACH_VEC_ELT (partitions, i, partition)
2559 partition_free (partition);
2561 free_rdg (rdg);
2562 return nbp - *nb_calls;
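
/* For example, if the loop is split into three partitions of which one
   is generated as a memset call, nbp is 3, *NB_CALLS becomes 1, and the
   return value is 2 distributed loops.  (Numbers illustrative.)  */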
2565 /* Distribute all loops in the current function. */
2567 namespace {
2569 const pass_data pass_data_loop_distribution =
2571 GIMPLE_PASS, /* type */
2572 "ldist", /* name */
2573 OPTGROUP_LOOP, /* optinfo_flags */
2574 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2575 ( PROP_cfg | PROP_ssa ), /* properties_required */
2576 0, /* properties_provided */
2577 0, /* properties_destroyed */
2578 0, /* todo_flags_start */
2579 0, /* todo_flags_finish */
2582 class pass_loop_distribution : public gimple_opt_pass
2584 public:
2585 pass_loop_distribution (gcc::context *ctxt)
2586 : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2589 /* opt_pass methods: */
2590 virtual bool gate (function *)
2592 return flag_tree_loop_distribution
2593 || flag_tree_loop_distribute_patterns;
2596 virtual unsigned int execute (function *);
2598 }; // class pass_loop_distribution
2600 unsigned int
2601 pass_loop_distribution::execute (function *fun)
2603 struct loop *loop;
2604 bool changed = false;
2605 basic_block bb;
2606 control_dependences *cd = NULL;
2607 auto_vec<loop_p> loops_to_be_destroyed;
2609 if (number_of_loops (fun) <= 1)
2610 return 0;
2612 /* Compute topological order for basic blocks. Topological order is
2613 needed because data dependences are computed for data references in
2614 lexicographical order. */
2615 if (bb_top_order_index == NULL)
2617 int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
2619 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
2620 bb_top_order_index_size
2621 = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
2622 for (int i = 0; i < bb_top_order_index_size; i++)
2623 bb_top_order_index[rpo[i]] = i;
2625 free (rpo);
2628 FOR_ALL_BB_FN (bb, fun)
2630 gimple_stmt_iterator gsi;
2631 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2632 gimple_set_uid (gsi_stmt (gsi), -1);
2633 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2634 gimple_set_uid (gsi_stmt (gsi), -1);
2637 /* At the moment we can only distribute non-nested loops, so restrict
2638 the walk to innermost loops. */
2639 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
2641 auto_vec<gimple *> work_list;
2642 basic_block *bbs;
2643 int num = loop->num;
2644 unsigned int i;
2646 /* If the loop doesn't have a single exit we will fail anyway,
2647 so skip it early. */
2648 if (!single_exit (loop))
2649 continue;
2651 /* Only optimize hot loops. */
2652 if (!optimize_loop_for_speed_p (loop))
2653 continue;
2655 /* Initialize the worklist with stmts we seed the partitions with. */
2656 bbs = get_loop_body_in_dom_order (loop);
2657 for (i = 0; i < loop->num_nodes; ++i)
2659 for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2660 !gsi_end_p (gsi);
2661 gsi_next (&gsi))
2663 gphi *phi = gsi.phi ();
2664 if (virtual_operand_p (gimple_phi_result (phi)))
2665 continue;
2666 /* Distribute stmts which have defs that are used outside of
2667 the loop. */
2668 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
2669 continue;
2670 work_list.safe_push (phi);
2672 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2673 !gsi_end_p (gsi);
2674 gsi_next (&gsi))
2676 gimple *stmt = gsi_stmt (gsi);
2678 /* If there is a stmt with side-effects bail out - we
2679 cannot and should not distribute this loop. */
2680 if (gimple_has_side_effects (stmt))
2682 work_list.truncate (0);
2683 goto out;
2686 /* Distribute stmts which have defs that are used outside of
2687 the loop. */
2688 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
2690 /* Otherwise only distribute stores for now. */
2691 else if (!gimple_vdef (stmt))
2692 continue;
2694 work_list.safe_push (stmt);
2697 out:
2698 free (bbs);
2700 int nb_generated_loops = 0;
2701 int nb_generated_calls = 0;
2702 location_t loc = find_loop_location (loop);
2703 if (work_list.length () > 0)
2705 if (!cd)
2707 calculate_dominance_info (CDI_DOMINATORS);
2708 calculate_dominance_info (CDI_POST_DOMINATORS);
2709 cd = new control_dependences ();
2710 free_dominance_info (CDI_POST_DOMINATORS);
2712 bool destroy_p;
2713 nb_generated_loops = distribute_loop (loop, work_list, cd,
2714 &nb_generated_calls,
2715 &destroy_p);
2716 if (destroy_p)
2717 loops_to_be_destroyed.safe_push (loop);
2720 if (nb_generated_loops + nb_generated_calls > 0)
2722 changed = true;
2723 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
2724 loc, "Loop %d distributed: split to %d loops "
2725 "and %d library calls.\n",
2726 num, nb_generated_loops, nb_generated_calls);
2728 else if (dump_file && (dump_flags & TDF_DETAILS))
2729 fprintf (dump_file, "Loop %d is the same.\n", num);
2732 if (cd)
2733 delete cd;
2735 if (bb_top_order_index != NULL)
2737 free (bb_top_order_index);
2738 bb_top_order_index = NULL;
2739 bb_top_order_index_size = 0;
2742 if (changed)
2744 /* Destroy loop bodies that could not be reused. Do this late as we
2745 otherwise can end up referring to stale data in control dependences. */
2746 unsigned i;
2747 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
2748 destroy_loop (loop);
2750 /* Cached scalar evolutions may now refer to wrong or non-existing
2751 loops. */
2752 scev_reset_htab ();
2753 mark_virtual_operands_for_renaming (fun);
2754 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
2757 checking_verify_loop_structure ();
2759 return 0;
2762 } // anon namespace
2764 gimple_opt_pass *
2765 make_pass_loop_distribution (gcc::context *ctxt)
2767 return new pass_loop_distribution (ctxt);