contrib/gcc-8.0/gcc/tree-loop-distribution.c
1 /* Loop distribution.
2 Copyright (C) 2006-2018 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 /* This pass performs loop distribution: for example, the loop
24 |DO I = 2, N
25 | A(I) = B(I) + C
26 | D(I) = A(I-1)*E
27 |ENDDO
29 is transformed to
31 |DOALL I = 2, N
32 | A(I) = B(I) + C
33 |ENDDO
35 |DOALL I = 2, N
36 | D(I) = A(I-1)*E
37 |ENDDO
39 Loop distribution is the dual of loop fusion. It separates statements
40 of a loop (or loop nest) into multiple loops (or loop nests) with the
41 same loop header. The major goal is to separate statements which may
42 be vectorized from those that can't. This pass implements distribution
43 in the following steps:
45 1) Seed partitions with statements of specific types. For now we support
46 two types of seed statements: statements defining a variable used outside
47 of the loop, and statements storing to memory.
48 2) Build reduced dependence graph (RDG) for loop to be distributed.
49 The vertices (RDG:V) model all statements in the loop and the edges
50 (RDG:E) model flow and control dependencies between statements.
51 3) Apart from RDG, compute data dependencies between memory references.
52 4) Starting from a seed statement, build up a partition by adding dependent
53 statements according to the RDG's dependence information. A partition is
54 classified as parallel type if it can be executed in parallel, or as
55 sequential type if it can't. A parallel type partition is further
56 classified into different builtin kinds if it can be implemented as
57 builtin function calls.
58 5) Build partition dependence graph (PG) based on data dependencies.
59 The vertices (PG:V) model all partitions and the edges (PG:E) model
60 all data dependencies between every pair of partitions. In general,
61 a data dependence is either known or unknown at compilation time. In C
62 family languages, there are quite a few compilation time unknown
63 dependencies because of possible aliasing between data references.
64 We categorize PG's edges into two types: "true" edges, which represent
65 compilation time known data dependencies, and "alias" edges, for all
66 other data dependencies.
67 6) Traverse a subgraph of PG as if all "alias" edges didn't exist. Merge
68 the partitions in each strongly connected component (SCC) accordingly.
69 Build a new PG for the merged partitions.
70 7) Traverse PG again, this time with both "true" and "alias" edges
71 included. We try to break SCCs by removing some edges. Because
72 SCCs formed by "true" edges were all fused in step 6), we can break
73 the remaining SCCs by removing some "alias" edges. It's NP-hard to
74 choose an optimal edge set, but fortunately a simple approximation is
75 good enough for us given the small problem size.
76 8) Collect all data dependencies of the removed "alias" edges. Create
77 runtime alias checks for collected data dependencies.
78 9) Version the loop under the condition of the runtime alias checks. Given
79 that loop distribution generally introduces additional overhead, it is
80 only useful if vectorization is achieved in a distributed loop. We
81 version the loop with the internal function call IFN_LOOP_DIST_ALIAS. If
82 no distributed loop can be vectorized, we simply remove the distributed
83 loops and fall back to the original one.
85 TODO:
86 1) We only distribute the innermost two-level loop nest now. We should
87 extend it to arbitrary loop nests in the future.
88 2) We only fuse partitions in SCCs now. A better fusion algorithm is
89 desired to minimize loop overhead, maximize parallelism and maximize
90 data reuse. */
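/* As a minimal illustrative sketch (not part of the original sources), the
   C loop below mixes a zeroing store with an accumulation:

     int a[1000], b[1000], i, sum = 0;
     for (i = 0; i < 1000; i++)
       {
         a[i] = 0;
         sum += b[i];
       }

   Following steps 1)-4) above, the store to a[i] and the statement updating
   sum seed two separate partitions.  Assuming the dependence and cost checks
   succeed, the store partition can then be classified as a builtin
   (PKIND_MEMSET) and emitted as a single memset call, while the accumulation
   is kept in its own loop.  */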
92 #include "config.h"
93 #define INCLUDE_ALGORITHM /* stable_sort */
94 #include "system.h"
95 #include "coretypes.h"
96 #include "backend.h"
97 #include "tree.h"
98 #include "gimple.h"
99 #include "cfghooks.h"
100 #include "tree-pass.h"
101 #include "ssa.h"
102 #include "gimple-pretty-print.h"
103 #include "fold-const.h"
104 #include "cfganal.h"
105 #include "gimple-iterator.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-ssa-loop-manip.h"
110 #include "tree-ssa-loop-ivopts.h"
111 #include "tree-ssa-loop.h"
112 #include "tree-into-ssa.h"
113 #include "tree-ssa.h"
114 #include "cfgloop.h"
115 #include "tree-scalar-evolution.h"
116 #include "params.h"
117 #include "tree-vectorizer.h"
118 #include "tree-eh.h"
121 #define MAX_DATAREFS_NUM \
122 ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
124 /* Threshold controlling the number of distributed partitions. Given that it
125 may become unnecessary if a memory stream cost model is invented in the
126 future, we define it as a temporary macro rather than a parameter. */
127 #define NUM_PARTITION_THRESHOLD (4)
129 /* Hashtable helpers. */
131 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
133 static inline hashval_t hash (const data_dependence_relation *);
134 static inline bool equal (const data_dependence_relation *,
135 const data_dependence_relation *);
138 /* Hash function for data dependence. */
140 inline hashval_t
141 ddr_hasher::hash (const data_dependence_relation *ddr)
143 inchash::hash h;
144 h.add_ptr (DDR_A (ddr));
145 h.add_ptr (DDR_B (ddr));
146 return h.end ();
149 /* Hash table equality function for data dependence. */
151 inline bool
152 ddr_hasher::equal (const data_dependence_relation *ddr1,
153 const data_dependence_relation *ddr2)
155 return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
158 /* The loop (nest) to be distributed. */
159 static vec<loop_p> loop_nest;
161 /* Vector of data references in the loop to be distributed. */
162 static vec<data_reference_p> datarefs_vec;
164 /* Store index of data reference in aux field. */
165 #define DR_INDEX(dr) ((uintptr_t) (dr)->aux)
167 /* Hash table for data dependence relation in the loop to be distributed. */
168 static hash_table<ddr_hasher> *ddrs_table;
170 /* A Reduced Dependence Graph (RDG) vertex representing a statement. */
171 struct rdg_vertex
173 /* The statement represented by this vertex. */
174 gimple *stmt;
176 /* Vector of data-references in this statement. */
177 vec<data_reference_p> datarefs;
179 /* True when the statement contains a write to memory. */
180 bool has_mem_write;
182 /* True when the statement contains a read from memory. */
183 bool has_mem_reads;
186 #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
187 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
188 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
189 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
190 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
191 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
192 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
193 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
195 /* Data dependence type. */
197 enum rdg_dep_type
199 /* Read After Write (RAW). */
200 flow_dd = 'f',
202 /* Control dependence (execute conditional on). */
203 control_dd = 'c'
206 /* Dependence information attached to an edge of the RDG. */
208 struct rdg_edge
210 /* Type of the dependence. */
211 enum rdg_dep_type type;
214 #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type
216 /* Dump vertex I in RDG to FILE. */
218 static void
219 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
221 struct vertex *v = &(rdg->vertices[i]);
222 struct graph_edge *e;
224 fprintf (file, "(vertex %d: (%s%s) (in:", i,
225 RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
226 RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
228 if (v->pred)
229 for (e = v->pred; e; e = e->pred_next)
230 fprintf (file, " %d", e->src);
232 fprintf (file, ") (out:");
234 if (v->succ)
235 for (e = v->succ; e; e = e->succ_next)
236 fprintf (file, " %d", e->dest);
238 fprintf (file, ")\n");
239 print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
240 fprintf (file, ")\n");
243 /* Call dump_rdg_vertex on stderr. */
245 DEBUG_FUNCTION void
246 debug_rdg_vertex (struct graph *rdg, int i)
248 dump_rdg_vertex (stderr, rdg, i);
251 /* Dump the reduced dependence graph RDG to FILE. */
253 static void
254 dump_rdg (FILE *file, struct graph *rdg)
256 fprintf (file, "(rdg\n");
257 for (int i = 0; i < rdg->n_vertices; i++)
258 dump_rdg_vertex (file, rdg, i);
259 fprintf (file, ")\n");
262 /* Call dump_rdg on stderr. */
264 DEBUG_FUNCTION void
265 debug_rdg (struct graph *rdg)
267 dump_rdg (stderr, rdg);
270 static void
271 dot_rdg_1 (FILE *file, struct graph *rdg)
273 int i;
274 pretty_printer buffer;
275 pp_needs_newline (&buffer) = false;
276 buffer.buffer->stream = file;
278 fprintf (file, "digraph RDG {\n");
280 for (i = 0; i < rdg->n_vertices; i++)
282 struct vertex *v = &(rdg->vertices[i]);
283 struct graph_edge *e;
285 fprintf (file, "%d [label=\"[%d] ", i, i);
286 pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
287 pp_flush (&buffer);
288 fprintf (file, "\"]\n");
290 /* Highlight reads from memory. */
291 if (RDG_MEM_READS_STMT (rdg, i))
292 fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
294 /* Highlight stores to memory. */
295 if (RDG_MEM_WRITE_STMT (rdg, i))
296 fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
298 if (v->succ)
299 for (e = v->succ; e; e = e->succ_next)
300 switch (RDGE_TYPE (e))
302 case flow_dd:
303 /* These are the most common dependences: don't print these. */
304 fprintf (file, "%d -> %d \n", i, e->dest);
305 break;
307 case control_dd:
308 fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
309 break;
311 default:
312 gcc_unreachable ();
316 fprintf (file, "}\n\n");
319 /* Display the Reduced Dependence Graph using dotty. */
321 DEBUG_FUNCTION void
322 dot_rdg (struct graph *rdg)
324 /* When debugging, you may want to enable the following code. */
325 #ifdef HAVE_POPEN
326 FILE *file = popen ("dot -Tx11", "w");
327 if (!file)
328 return;
329 dot_rdg_1 (file, rdg);
330 fflush (file);
331 close (fileno (file));
332 pclose (file);
333 #else
334 dot_rdg_1 (stderr, rdg);
335 #endif
338 /* Returns the index of STMT in RDG. */
340 static int
341 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
343 int index = gimple_uid (stmt);
344 gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
345 return index;
348 /* Creates dependence edges in RDG for all the uses of DEF. IDEF is
349 the index of DEF in RDG. */
351 static void
352 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
354 use_operand_p imm_use_p;
355 imm_use_iterator iterator;
357 FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
359 struct graph_edge *e;
360 int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
362 if (use < 0)
363 continue;
365 e = add_edge (rdg, idef, use);
366 e->data = XNEW (struct rdg_edge);
367 RDGE_TYPE (e) = flow_dd;
371 /* Creates an edge for the control dependences of BB to the vertex V. */
373 static void
374 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
375 int v, control_dependences *cd)
377 bitmap_iterator bi;
378 unsigned edge_n;
379 EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
380 0, edge_n, bi)
382 basic_block cond_bb = cd->get_edge_src (edge_n);
383 gimple *stmt = last_stmt (cond_bb);
384 if (stmt && is_ctrl_stmt (stmt))
386 struct graph_edge *e;
387 int c = rdg_vertex_for_stmt (rdg, stmt);
388 if (c < 0)
389 continue;
391 e = add_edge (rdg, c, v);
392 e->data = XNEW (struct rdg_edge);
393 RDGE_TYPE (e) = control_dd;
398 /* Creates the edges of the reduced dependence graph RDG. */
400 static void
401 create_rdg_flow_edges (struct graph *rdg)
403 int i;
404 def_operand_p def_p;
405 ssa_op_iter iter;
407 for (i = 0; i < rdg->n_vertices; i++)
408 FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
409 iter, SSA_OP_DEF)
410 create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
413 /* Creates the edges of the reduced dependence graph RDG. */
415 static void
416 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
418 int i;
420 for (i = 0; i < rdg->n_vertices; i++)
422 gimple *stmt = RDG_STMT (rdg, i);
423 if (gimple_code (stmt) == GIMPLE_PHI)
425 edge_iterator ei;
426 edge e;
427 FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
428 if (flow_bb_inside_loop_p (loop, e->src))
429 create_edge_for_control_dependence (rdg, e->src, i, cd);
431 else
432 create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
436 /* Build the vertices of the reduced dependence graph RDG. Return false
437 if that failed. */
439 static bool
440 create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
442 int i;
443 gimple *stmt;
445 FOR_EACH_VEC_ELT (stmts, i, stmt)
447 struct vertex *v = &(rdg->vertices[i]);
449 /* Record statement to vertex mapping. */
450 gimple_set_uid (stmt, i);
452 v->data = XNEW (struct rdg_vertex);
453 RDGV_STMT (v) = stmt;
454 RDGV_DATAREFS (v).create (0);
455 RDGV_HAS_MEM_WRITE (v) = false;
456 RDGV_HAS_MEM_READS (v) = false;
457 if (gimple_code (stmt) == GIMPLE_PHI)
458 continue;
460 unsigned drp = datarefs_vec.length ();
461 if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
462 return false;
463 for (unsigned j = drp; j < datarefs_vec.length (); ++j)
465 data_reference_p dr = datarefs_vec[j];
466 if (DR_IS_READ (dr))
467 RDGV_HAS_MEM_READS (v) = true;
468 else
469 RDGV_HAS_MEM_WRITE (v) = true;
470 RDGV_DATAREFS (v).safe_push (dr);
473 return true;
476 /* Array mapping basic block's index to its topological order. */
477 static int *bb_top_order_index;
478 /* And size of the array. */
479 static int bb_top_order_index_size;
481 /* If X has a smaller topological sort number than Y, returns -1;
482 if greater, returns 1. */
484 static int
485 bb_top_order_cmp (const void *x, const void *y)
487 basic_block bb1 = *(const basic_block *) x;
488 basic_block bb2 = *(const basic_block *) y;
490 gcc_assert (bb1->index < bb_top_order_index_size
491 && bb2->index < bb_top_order_index_size);
492 gcc_assert (bb1 == bb2
493 || bb_top_order_index[bb1->index]
494 != bb_top_order_index[bb2->index]);
496 return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
499 /* Initialize STMTS with all the statements of LOOP. We use topological
500 order to discover all statements. The order is important because
501 generate_loops_for_partition is using the same traversal for identifying
502 statements in loop copies. */
504 static void
505 stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
507 unsigned int i;
508 basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);
510 for (i = 0; i < loop->num_nodes; i++)
512 basic_block bb = bbs[i];
514 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
515 gsi_next (&bsi))
516 if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
517 stmts->safe_push (bsi.phi ());
519 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
520 gsi_next (&bsi))
522 gimple *stmt = gsi_stmt (bsi);
523 if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
524 stmts->safe_push (stmt);
528 free (bbs);
531 /* Free the reduced dependence graph RDG. */
533 static void
534 free_rdg (struct graph *rdg)
536 int i;
538 for (i = 0; i < rdg->n_vertices; i++)
540 struct vertex *v = &(rdg->vertices[i]);
541 struct graph_edge *e;
543 for (e = v->succ; e; e = e->succ_next)
544 free (e->data);
546 if (v->data)
548 gimple_set_uid (RDGV_STMT (v), -1);
549 (RDGV_DATAREFS (v)).release ();
550 free (v->data);
554 free_graph (rdg);
557 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
558 LOOP, and one edge per flow dependence or control dependence from control
559 dependence CD. During visiting each statement, data references are also
560 collected and recorded in global data DATAREFS_VEC. */
562 static struct graph *
563 build_rdg (struct loop *loop, control_dependences *cd)
565 struct graph *rdg;
567 /* Create the RDG vertices from the stmts of the loop nest. */
568 auto_vec<gimple *, 10> stmts;
569 stmts_from_loop (loop, &stmts);
570 rdg = new_graph (stmts.length ());
571 if (!create_rdg_vertices (rdg, stmts, loop))
573 free_rdg (rdg);
574 return NULL;
576 stmts.release ();
578 create_rdg_flow_edges (rdg);
579 if (cd)
580 create_rdg_cd_edges (rdg, cd, loop);
582 return rdg;
586 /* Kind of distributed loop. */
587 enum partition_kind {
588 PKIND_NORMAL,
589 /* Partial memset means that a partition can be distributed into a loop
590 of memset calls, rather than a single memset call. It's handled just
591 like a normal partition, i.e., distributed as a separate loop; no memset
592 call is generated.
594 Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
595 loop nest as deeply as possible. As a result, parloop achieves better
596 parallelization by parallelizing a deeper loop nest. This hack should
597 be unnecessary and removed once distributed memset can be understood
598 and analyzed in data reference analysis. See PR82604 for more. */
599 PKIND_PARTIAL_MEMSET,
600 PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
603 /* Type of distributed loop. */
604 enum partition_type {
605 /* The distributed loop can be executed in parallel. */
606 PTYPE_PARALLEL = 0,
607 /* The distributed loop has to be executed sequentially. */
608 PTYPE_SEQUENTIAL
611 /* Builtin info for loop distribution. */
612 struct builtin_info
614 /* Data references that a kind != PKIND_NORMAL partition is about. */
615 data_reference_p dst_dr;
616 data_reference_p src_dr;
617 /* Base address and size of memory objects operated on by the builtin. Note
618 that both the destination and source memory objects must have the same size. */
619 tree dst_base;
620 tree src_base;
621 tree size;
622 /* Base and offset part of dst_base after stripping constant offset. This
623 is only used in memset builtin distribution for now. */
624 tree dst_base_base;
625 unsigned HOST_WIDE_INT dst_base_offset;
628 /* Partition for loop distribution. */
629 struct partition
631 /* Statements of the partition. */
632 bitmap stmts;
634 /* True if the partition defines a variable which is used outside of the loop. */
634 bool reduction_p;
635 enum partition_kind kind;
636 enum partition_type type;
637 /* Data references in the partition. */
638 bitmap datarefs;
639 /* Information about the builtin partition. */
640 struct builtin_info *builtin;
644 /* Allocate and initialize a new partition. */
646 static partition *
647 partition_alloc (void)
649 partition *partition = XCNEW (struct partition);
650 partition->stmts = BITMAP_ALLOC (NULL);
651 partition->reduction_p = false;
652 partition->kind = PKIND_NORMAL;
653 partition->datarefs = BITMAP_ALLOC (NULL);
654 return partition;
657 /* Free PARTITION. */
659 static void
660 partition_free (partition *partition)
662 BITMAP_FREE (partition->stmts);
663 BITMAP_FREE (partition->datarefs);
664 if (partition->builtin)
665 free (partition->builtin);
667 free (partition);
670 /* Returns true if the partition can be generated as a builtin. */
672 static bool
673 partition_builtin_p (partition *partition)
675 return partition->kind > PKIND_PARTIAL_MEMSET;
678 /* Returns true if the partition contains a reduction. */
680 static bool
681 partition_reduction_p (partition *partition)
683 return partition->reduction_p;
686 /* Partitions are fused for different reasons. */
687 enum fuse_type
689 FUSE_NON_BUILTIN = 0,
690 FUSE_REDUCTION = 1,
691 FUSE_SHARE_REF = 2,
692 FUSE_SAME_SCC = 3,
693 FUSE_FINALIZE = 4
696 /* Descriptions of the different fusing reasons. */
697 static const char *fuse_message[] = {
698 "they are non-builtins",
699 "they have reductions",
700 "they have shared memory refs",
701 "they are in the same dependence scc",
702 "there is no point to distribute loop"};
704 static void
705 update_type_for_merge (struct graph *, partition *, partition *);
707 /* Merge PARTITION into the partition DEST. RDG is the reduced dependence
708 graph, and we update the type of the result partition if RDG is non-NULL. */
710 static void
711 partition_merge_into (struct graph *rdg, partition *dest,
712 partition *partition, enum fuse_type ft)
714 if (dump_file && (dump_flags & TDF_DETAILS))
716 fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
717 fprintf (dump_file, " Part 1: ");
718 dump_bitmap (dump_file, dest->stmts);
719 fprintf (dump_file, " Part 2: ");
720 dump_bitmap (dump_file, partition->stmts);
723 dest->kind = PKIND_NORMAL;
724 if (dest->type == PTYPE_PARALLEL)
725 dest->type = partition->type;
727 bitmap_ior_into (dest->stmts, partition->stmts);
728 if (partition_reduction_p (partition))
729 dest->reduction_p = true;
731 /* Further check if any data dependence prevents us from executing the
732 new partition in parallel. */
733 if (dest->type == PTYPE_PARALLEL && rdg != NULL)
734 update_type_for_merge (rdg, dest, partition);
736 bitmap_ior_into (dest->datarefs, partition->datarefs);
740 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
741 the LOOP. */
743 static bool
744 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
746 imm_use_iterator imm_iter;
747 use_operand_p use_p;
749 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
751 if (is_gimple_debug (USE_STMT (use_p)))
752 continue;
754 basic_block use_bb = gimple_bb (USE_STMT (use_p));
755 if (!flow_bb_inside_loop_p (loop, use_bb))
756 return true;
759 return false;
762 /* Returns true when STMT defines a scalar variable used after the
763 loop LOOP. */
765 static bool
766 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
768 def_operand_p def_p;
769 ssa_op_iter op_iter;
771 if (gimple_code (stmt) == GIMPLE_PHI)
772 return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
774 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
775 if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
776 return true;
778 return false;
781 /* Return a copy of LOOP placed before LOOP. */
783 static struct loop *
784 copy_loop_before (struct loop *loop)
786 struct loop *res;
787 edge preheader = loop_preheader_edge (loop);
789 initialize_original_copy_tables ();
790 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
791 gcc_assert (res != NULL);
792 free_original_copy_tables ();
793 delete_update_ssa ();
795 return res;
798 /* Creates an empty basic block after LOOP. */
800 static void
801 create_bb_after_loop (struct loop *loop)
803 edge exit = single_exit (loop);
805 if (!exit)
806 return;
808 split_edge (exit);
811 /* Generate code for PARTITION from the code in LOOP. The loop is
812 copied when COPY_P is true. All the statements not flagged in the
813 PARTITION bitmap are removed from the loop or from its copy. The
814 statements are indexed in sequence inside a basic block, and the
815 basic blocks of a loop are taken in dom order. */
817 static void
818 generate_loops_for_partition (struct loop *loop, partition *partition,
819 bool copy_p)
821 unsigned i;
822 basic_block *bbs;
824 if (copy_p)
826 int orig_loop_num = loop->orig_loop_num;
827 loop = copy_loop_before (loop);
828 gcc_assert (loop != NULL);
829 loop->orig_loop_num = orig_loop_num;
830 create_preheader (loop, CP_SIMPLE_PREHEADERS);
831 create_bb_after_loop (loop);
833 else
835 /* Origin number is set to the new versioned loop's num. */
836 gcc_assert (loop->orig_loop_num != loop->num);
839 /* Remove stmts not in the PARTITION bitmap. */
840 bbs = get_loop_body_in_dom_order (loop);
842 if (MAY_HAVE_DEBUG_BIND_STMTS)
843 for (i = 0; i < loop->num_nodes; i++)
845 basic_block bb = bbs[i];
847 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
848 gsi_next (&bsi))
850 gphi *phi = bsi.phi ();
851 if (!virtual_operand_p (gimple_phi_result (phi))
852 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
853 reset_debug_uses (phi);
856 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
858 gimple *stmt = gsi_stmt (bsi);
859 if (gimple_code (stmt) != GIMPLE_LABEL
860 && !is_gimple_debug (stmt)
861 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
862 reset_debug_uses (stmt);
866 for (i = 0; i < loop->num_nodes; i++)
868 basic_block bb = bbs[i];
869 edge inner_exit = NULL;
871 if (loop != bb->loop_father)
872 inner_exit = single_exit (bb->loop_father);
874 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
876 gphi *phi = bsi.phi ();
877 if (!virtual_operand_p (gimple_phi_result (phi))
878 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
879 remove_phi_node (&bsi, true);
880 else
881 gsi_next (&bsi);
884 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
886 gimple *stmt = gsi_stmt (bsi);
887 if (gimple_code (stmt) != GIMPLE_LABEL
888 && !is_gimple_debug (stmt)
889 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
891 /* In distribution of loop nest, if bb is inner loop's exit_bb,
892 we choose its exit edge/path in order to avoid generating
893 infinite loop. For all other cases, we choose an arbitrary
894 path through the empty CFG part that this unnecessary
895 control stmt controls. */
896 if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
898 if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
899 gimple_cond_make_true (cond_stmt);
900 else
901 gimple_cond_make_false (cond_stmt);
902 update_stmt (stmt);
904 else if (gimple_code (stmt) == GIMPLE_SWITCH)
906 gswitch *switch_stmt = as_a <gswitch *> (stmt);
907 gimple_switch_set_index
908 (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
909 update_stmt (stmt);
911 else
913 unlink_stmt_vdef (stmt);
914 gsi_remove (&bsi, true);
915 release_defs (stmt);
916 continue;
919 gsi_next (&bsi);
923 free (bbs);
926 /* If VAL memory representation contains the same value in all bytes,
927 return that value, otherwise return -1.
928 E.g. for 0x24242424 return 0x24, for IEEE double
929 747708026454360457216.0 return 0x44, etc. */
931 static int
932 const_with_all_bytes_same (tree val)
934 unsigned char buf[64];
935 int i, len;
937 if (integer_zerop (val)
938 || (TREE_CODE (val) == CONSTRUCTOR
939 && !TREE_CLOBBER_P (val)
940 && CONSTRUCTOR_NELTS (val) == 0))
941 return 0;
943 if (real_zerop (val))
945 /* Only return 0 for +0.0, not for -0.0, which doesn't have
946 an all bytes same memory representation. Don't transform
947 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */
948 switch (TREE_CODE (val))
950 case REAL_CST:
951 if (!real_isneg (TREE_REAL_CST_PTR (val)))
952 return 0;
953 break;
954 case COMPLEX_CST:
955 if (!const_with_all_bytes_same (TREE_REALPART (val))
956 && !const_with_all_bytes_same (TREE_IMAGPART (val)))
957 return 0;
958 break;
959 case VECTOR_CST:
961 unsigned int count = vector_cst_encoded_nelts (val);
962 unsigned int j;
963 for (j = 0; j < count; ++j)
964 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
965 break;
966 if (j == count)
967 return 0;
968 break;
970 default:
971 break;
975 if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
976 return -1;
978 len = native_encode_expr (val, buf, sizeof (buf));
979 if (len == 0)
980 return -1;
981 for (i = 1; i < len; i++)
982 if (buf[i] != buf[0])
983 return -1;
984 return buf[0];
987 /* Generate a call to memset for PARTITION in LOOP. */
989 static void
990 generate_memset_builtin (struct loop *loop, partition *partition)
992 gimple_stmt_iterator gsi;
993 tree mem, fn, nb_bytes;
994 tree val;
995 struct builtin_info *builtin = partition->builtin;
996 gimple *fn_call;
998 /* The new statements will be placed before LOOP. */
999 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1001 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1002 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1003 false, GSI_CONTINUE_LINKING);
1004 mem = builtin->dst_base;
1005 mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1006 false, GSI_CONTINUE_LINKING);
1008 /* This exactly matches the pattern recognition in classify_partition. */
1009 val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1010 /* Handle constants like 0x15151515 and similarly
1011 floating point constants etc. where all bytes are the same. */
1012 int bytev = const_with_all_bytes_same (val);
1013 if (bytev != -1)
1014 val = build_int_cst (integer_type_node, bytev);
1015 else if (TREE_CODE (val) == INTEGER_CST)
1016 val = fold_convert (integer_type_node, val);
1017 else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1019 tree tem = make_ssa_name (integer_type_node);
1020 gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1021 gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1022 val = tem;
1025 fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1026 fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1027 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1029 if (dump_file && (dump_flags & TDF_DETAILS))
1031 fprintf (dump_file, "generated memset");
1032 if (bytev == 0)
1033 fprintf (dump_file, " zero\n");
1034 else
1035 fprintf (dump_file, "\n");
1039 /* Generate a call to memcpy for PARTITION in LOOP. */
1041 static void
1042 generate_memcpy_builtin (struct loop *loop, partition *partition)
1044 gimple_stmt_iterator gsi;
1045 gimple *fn_call;
1046 tree dest, src, fn, nb_bytes;
1047 enum built_in_function kind;
1048 struct builtin_info *builtin = partition->builtin;
1050 /* The new statements will be placed before LOOP. */
1051 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1053 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1054 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1055 false, GSI_CONTINUE_LINKING);
1056 dest = builtin->dst_base;
1057 src = builtin->src_base;
1058 if (partition->kind == PKIND_MEMCPY
1059 || ! ptr_derefs_may_alias_p (dest, src))
1060 kind = BUILT_IN_MEMCPY;
1061 else
1062 kind = BUILT_IN_MEMMOVE;
1064 dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1065 false, GSI_CONTINUE_LINKING);
1066 src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1067 false, GSI_CONTINUE_LINKING);
1068 fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1069 fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1070 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1072 if (dump_file && (dump_flags & TDF_DETAILS))
1074 if (kind == BUILT_IN_MEMCPY)
1075 fprintf (dump_file, "generated memcpy\n");
1076 else
1077 fprintf (dump_file, "generated memmove\n");
1081 /* Remove and destroy the loop LOOP. */
1083 static void
1084 destroy_loop (struct loop *loop)
1086 unsigned nbbs = loop->num_nodes;
1087 edge exit = single_exit (loop);
1088 basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1089 basic_block *bbs;
1090 unsigned i;
1092 bbs = get_loop_body_in_dom_order (loop);
1094 redirect_edge_pred (exit, src);
1095 exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1096 exit->flags |= EDGE_FALLTHRU;
1097 cancel_loop_tree (loop);
1098 rescan_loop_exit (exit, false, true);
1100 i = nbbs;
1103 /* We have made sure to not leave any dangling uses of SSA
1104 names defined in the loop. With the exception of virtuals.
1105 Make sure we replace all uses of virtual defs that will remain
1106 outside of the loop with the bare symbol as delete_basic_block
1107 will release them. */
1108 --i;
1109 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1110 gsi_next (&gsi))
1112 gphi *phi = gsi.phi ();
1113 if (virtual_operand_p (gimple_phi_result (phi)))
1114 mark_virtual_phi_result_for_renaming (phi);
1116 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
1117 gsi_next (&gsi))
1119 gimple *stmt = gsi_stmt (gsi);
1120 tree vdef = gimple_vdef (stmt);
1121 if (vdef && TREE_CODE (vdef) == SSA_NAME)
1122 mark_virtual_operand_for_renaming (vdef);
1124 delete_basic_block (bbs[i]);
1126 while (i != 0);
1128 free (bbs);
1130 set_immediate_dominator (CDI_DOMINATORS, dest,
1131 recompute_dominator (CDI_DOMINATORS, dest));
1134 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
1136 static bool
1137 generate_code_for_partition (struct loop *loop,
1138 partition *partition, bool copy_p)
1140 switch (partition->kind)
1142 case PKIND_NORMAL:
1143 case PKIND_PARTIAL_MEMSET:
1144 /* Reductions all have to be in the last partition. */
1145 gcc_assert (!partition_reduction_p (partition)
1146 || !copy_p);
1147 generate_loops_for_partition (loop, partition, copy_p);
1148 return false;
1150 case PKIND_MEMSET:
1151 generate_memset_builtin (loop, partition);
1152 break;
1154 case PKIND_MEMCPY:
1155 case PKIND_MEMMOVE:
1156 generate_memcpy_builtin (loop, partition);
1157 break;
1159 default:
1160 gcc_unreachable ();
1163 /* Common tail for partitions we turn into a call. If this was the last
1164 partition for which we generate code, we have to destroy the loop. */
1165 if (!copy_p)
1166 return true;
1167 return false;
1170 /* Return data dependence relation for data references A and B. The two
1171 data references must be in lexicographic order with respect to the reduced
1172 dependence graph RDG. We first try to find the ddr in the global ddr hash
1173 table. If it doesn't exist, compute the ddr and cache it. */
1175 static data_dependence_relation *
1176 get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
1178 struct data_dependence_relation ent, **slot;
1179 struct data_dependence_relation *ddr;
1181 gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1182 gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1183 <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1184 ent.a = a;
1185 ent.b = b;
1186 slot = ddrs_table->find_slot (&ent, INSERT);
1187 if (*slot == NULL)
1189 ddr = initialize_data_dependence_relation (a, b, loop_nest);
1190 compute_affine_dependence (ddr, loop_nest[0]);
1191 *slot = ddr;
1194 return *slot;
1197 /* In reduced dependence graph RDG for loop distribution, return true if
1198 dependence between references DR1 and DR2 leads to a dependence cycle
1199 and such dependence cycle can't be resolved by runtime alias check. */
1201 static bool
1202 data_dep_in_cycle_p (struct graph *rdg,
1203 data_reference_p dr1, data_reference_p dr2)
1205 struct data_dependence_relation *ddr;
1207 /* Re-shuffle data-refs to be in topological order. */
1208 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1209 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1210 std::swap (dr1, dr2);
1212 ddr = get_data_dependence (rdg, dr1, dr2);
1214 /* In case of no data dependence. */
1215 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1216 return false;
1217 /* For an unknown data dependence, or a known data dependence which can't be
1218 expressed as a classic distance vector, we check if it can be resolved
1219 by a runtime alias check. If yes, we still consider the data dependence
1220 as one that won't introduce a data dependence cycle. */
1221 else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1222 || DDR_NUM_DIST_VECTS (ddr) == 0)
1223 return !runtime_alias_check_p (ddr, NULL, true);
1224 else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1225 return true;
1226 else if (DDR_REVERSED_P (ddr)
1227 || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1228 return false;
1230 return true;
1233 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
1234 PARTITION1's type after merging PARTITION2 into PARTITION1. */
1236 static void
1237 update_type_for_merge (struct graph *rdg,
1238 partition *partition1, partition *partition2)
1240 unsigned i, j;
1241 bitmap_iterator bi, bj;
1242 data_reference_p dr1, dr2;
1244 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1246 unsigned start = (partition1 == partition2) ? i + 1 : 0;
1248 dr1 = datarefs_vec[i];
1249 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1251 dr2 = datarefs_vec[j];
1252 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1253 continue;
1255 /* Partition can only be executed sequentially if there is any
1256 data dependence cycle. */
1257 if (data_dep_in_cycle_p (rdg, dr1, dr2))
1259 partition1->type = PTYPE_SEQUENTIAL;
1260 return;
1266 /* Returns a partition with all the statements needed for computing
1267 the vertex V of the RDG, also including the loop exit conditions. */
1269 static partition *
1270 build_rdg_partition_for_vertex (struct graph *rdg, int v)
1272 partition *partition = partition_alloc ();
1273 auto_vec<int, 3> nodes;
1274 unsigned i, j;
1275 int x;
1276 data_reference_p dr;
1278 graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1280 FOR_EACH_VEC_ELT (nodes, i, x)
1282 bitmap_set_bit (partition->stmts, x);
1284 for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1286 unsigned idx = (unsigned) DR_INDEX (dr);
1287 gcc_assert (idx < datarefs_vec.length ());
1289 /* Partition can only be executed sequentially if there is any
1290 unknown data reference. */
1291 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1292 || !DR_INIT (dr) || !DR_STEP (dr))
1293 partition->type = PTYPE_SEQUENTIAL;
1295 bitmap_set_bit (partition->datarefs, idx);
1299 if (partition->type == PTYPE_SEQUENTIAL)
1300 return partition;
1302 /* Further check if any data dependence prevents us from executing the
1303 partition in parallel. */
1304 update_type_for_merge (rdg, partition, partition);
1306 return partition;
1309 /* Given PARTITION of LOOP and RDG, record single load/store data references
1310 for a builtin partition in SRC_DR/DST_DR; return false if there are no such
1311 data references. */
1313 static bool
1314 find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
1315 data_reference_p *dst_dr, data_reference_p *src_dr)
1317 unsigned i;
1318 data_reference_p single_ld = NULL, single_st = NULL;
1319 bitmap_iterator bi;
1321 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1323 gimple *stmt = RDG_STMT (rdg, i);
1324 data_reference_p dr;
1326 if (gimple_code (stmt) == GIMPLE_PHI)
1327 continue;
1329 /* Any scalar stmts are ok. */
1330 if (!gimple_vuse (stmt))
1331 continue;
1333 /* Otherwise just regular loads/stores. */
1334 if (!gimple_assign_single_p (stmt))
1335 return false;
1337 /* But exactly one store and/or load. */
1338 for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1340 tree type = TREE_TYPE (DR_REF (dr));
1342 /* The memset, memcpy and memmove library calls are only
1343 able to deal with generic address space. */
1344 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1345 return false;
1347 if (DR_IS_READ (dr))
1349 if (single_ld != NULL)
1350 return false;
1351 single_ld = dr;
1353 else
1355 if (single_st != NULL)
1356 return false;
1357 single_st = dr;
1362 if (!single_st)
1363 return false;
1365 /* Bail out if this is a bitfield memory reference. */
1366 if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1367 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1368 return false;
1370 /* Data reference must be executed exactly once per iteration of each
1371 loop in the loop nest. We only need to check dominance information
1372 against the outermost one in a perfect loop nest because a bb can't
1373 dominate outermost loop's latch without dominating inner loop's. */
1374 basic_block bb_st = gimple_bb (DR_STMT (single_st));
1375 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1376 return false;
1378 if (single_ld)
1380 gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1381 /* Direct aggregate copy or via an SSA name temporary. */
1382 if (load != store
1383 && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1384 return false;
1386 /* Bail out if this is a bitfield memory reference. */
1387 if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1388 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1389 return false;
1391 /* Load and store must be in the same loop nest. */
1392 basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1393 if (bb_st->loop_father != bb_ld->loop_father)
1394 return false;
1396 /* Data reference must be executed exactly once per iteration.
1397 Same as single_st, we only need to check against the outermost
1398 loop. */
1399 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1400 return false;
1402 edge e = single_exit (bb_st->loop_father);
1403 bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1404 bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1405 if (dom_ld != dom_st)
1406 return false;
1409 *src_dr = single_ld;
1410 *dst_dr = single_st;
1411 return true;
1414 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1415 loops from inner to outer to see if the loop's step equals the access size
1416 at each loop level. Return 2 if we can prove this at all loop levels;
1417 record the access base and size in BASE and SIZE; save the loop's step at
1418 each loop level in STEPS if it is not null. For example:
1420 int arr[100][100][100];
1421 for (i = 0; i < 100; i++) ;steps[2] = 40000
1422 for (j = 100; j > 0; j--) ;steps[1] = -400
1423 for (k = 0; k < 100; k++) ;steps[0] = 4
1424 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1426 Return 1 if we can prove the equality at the innermost loop, but not at
1427 all loop levels. In this case, no information is recorded.
1429 Return 0 if no equality can be proven at any loop level. */
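/* As an assumed illustration of the partial case (return value 1), not part
   of the original comment: for

     int a[100][100], i, j;
     for (i = 0; i < 100; i++)
       for (j = 0; j < 50; j++)
         a[i][j] = 0;

   the innermost loop's step (4 bytes) equals the access size, but at the
   outer level the step (400 bytes, one full row) is larger than the extent
   the inner loop actually writes, so the equality only holds at the
   innermost level and 1 is returned.  This is the situation the
   PKIND_PARTIAL_MEMSET kind above is meant to capture.  */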
1431 static int
1432 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1433 tree *size, vec<tree> *steps = NULL)
1435 location_t loc = gimple_location (DR_STMT (dr));
1436 basic_block bb = gimple_bb (DR_STMT (dr));
1437 struct loop *loop = bb->loop_father;
1438 tree ref = DR_REF (dr);
1439 tree access_base = build_fold_addr_expr (ref);
1440 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1441 int res = 0;
1443 do {
1444 tree scev_fn = analyze_scalar_evolution (loop, access_base);
1445 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1446 return res;
1448 access_base = CHREC_LEFT (scev_fn);
1449 if (tree_contains_chrecs (access_base, NULL))
1450 return res;
1452 tree scev_step = CHREC_RIGHT (scev_fn);
1453 /* Only support constant steps. */
1454 if (TREE_CODE (scev_step) != INTEGER_CST)
1455 return res;
1457 enum ev_direction access_dir = scev_direction (scev_fn);
1458 if (access_dir == EV_DIR_UNKNOWN)
1459 return res;
1461 if (steps != NULL)
1462 steps->safe_push (scev_step);
1464 scev_step = fold_convert_loc (loc, sizetype, scev_step);
1465 /* Compute absolute value of scev step. */
1466 if (access_dir == EV_DIR_DECREASES)
1467 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1469 /* At each loop level, the scev step must equal the access size. In other
1470 words, DR must access consecutive memory between loop iterations. */
1471 if (!operand_equal_p (scev_step, access_size, 0))
1472 return res;
1474 /* Access stride can be computed for data reference at least for the
1475 innermost loop. */
1476 res = 1;
1478 /* Compute DR's execution times in loop. */
1479 tree niters = number_of_latch_executions (loop);
1480 niters = fold_convert_loc (loc, sizetype, niters);
1481 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1482 niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1484 /* Compute DR's overall access size in loop. */
1485 access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1486 niters, scev_step);
1487 /* Adjust base address in case of negative step. */
1488 if (access_dir == EV_DIR_DECREASES)
1490 tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1491 scev_step, access_size);
1492 access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1494 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1496 *base = access_base;
1497 *size = access_size;
1498 /* The access stride can be computed for the data reference at each loop level. */
1499 return 2;
1502 /* Allocate and return builtin struct. Record information like DST_DR,
1503 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1505 static struct builtin_info *
1506 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1507 tree dst_base, tree src_base, tree size)
1509 struct builtin_info *builtin = XNEW (struct builtin_info);
1510 builtin->dst_dr = dst_dr;
1511 builtin->src_dr = src_dr;
1512 builtin->dst_base = dst_base;
1513 builtin->src_base = src_base;
1514 builtin->size = size;
1515 return builtin;
1518 /* Given data reference DR in loop nest LOOP, classify if it forms builtin
1519 memset call. */
1521 static void
1522 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1524 gimple *stmt = DR_STMT (dr);
1525 tree base, size, rhs = gimple_assign_rhs1 (stmt);
1527 if (const_with_all_bytes_same (rhs) == -1
1528 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1529 || (TYPE_MODE (TREE_TYPE (rhs))
1530 != TYPE_MODE (unsigned_char_type_node))))
1531 return;
1533 if (TREE_CODE (rhs) == SSA_NAME
1534 && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1535 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1536 return;
1538 int res = compute_access_range (loop, dr, &base, &size);
1539 if (res == 0)
1540 return;
1541 if (res == 1)
1543 partition->kind = PKIND_PARTIAL_MEMSET;
1544 return;
1547 poly_uint64 base_offset;
1548 unsigned HOST_WIDE_INT const_base_offset;
1549 tree base_base = strip_offset (base, &base_offset);
1550 if (!base_offset.is_constant (&const_base_offset))
1551 return;
1553 struct builtin_info *builtin;
1554 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1555 builtin->dst_base_base = base_base;
1556 builtin->dst_base_offset = const_base_offset;
1557 partition->builtin = builtin;
1558 partition->kind = PKIND_MEMSET;
1561 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1562 if it forms builtin memcpy or memmove call. */
1564 static void
1565 classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
1566 data_reference_p dst_dr, data_reference_p src_dr)
1568 tree base, size, src_base, src_size;
1569 auto_vec<tree> dst_steps, src_steps;
1571 /* Compute access range of both load and store. */
1572 int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1573 if (res != 2)
1574 return;
1575 res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1576 if (res != 2)
1577 return;
1579 /* They must have the same access size. */
1580 if (!operand_equal_p (size, src_size, 0))
1581 return;
1583 /* Load and store in the loop nest must access memory in the same way, i.e.,
1584 they must have the same steps in each loop of the nest. */
1585 if (dst_steps.length () != src_steps.length ())
1586 return;
1587 for (unsigned i = 0; i < dst_steps.length (); ++i)
1588 if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1589 return;
1591 /* Now check if there is a dependence. */
1592 ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1594 /* Classify as memcpy if no dependence between load and store. */
1595 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1597 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1598 partition->kind = PKIND_MEMCPY;
1599 return;
1602 /* Can't do memmove in case of unknown dependence or dependence without
1603 classical distance vector. */
1604 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1605 || DDR_NUM_DIST_VECTS (ddr) == 0)
1606 return;
1608 unsigned i;
1609 lambda_vector dist_v;
1610 int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1611 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1613 unsigned dep_lev = dependence_level (dist_v, num_lev);
1614 /* Can't do memmove if load depends on store. */
1615 if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1616 return;
1619 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1620 partition->kind = PKIND_MEMMOVE;
1621 return;
1624 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1625 For the moment we detect memset, memcpy and memmove patterns. Bitmap
1626 STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */
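/* For instance (an assumed example, not from the original comment), in

     int sum = 0;
     for (i = 0; i < n; i++)
       sum += a[i];
     use (sum);

   the statement accumulating into sum has a scalar use outside of the loop,
   so its partition is marked as a reduction (unless the statement belongs to
   every partition), and in either case the partition is not considered for
   the builtin classification below.  */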
1628 static void
1629 classify_partition (loop_p loop, struct graph *rdg, partition *partition,
1630 bitmap stmt_in_all_partitions)
1632 bitmap_iterator bi;
1633 unsigned i;
1634 data_reference_p single_ld = NULL, single_st = NULL;
1635 bool volatiles_p = false, has_reduction = false;
1637 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1639 gimple *stmt = RDG_STMT (rdg, i);
1641 if (gimple_has_volatile_ops (stmt))
1642 volatiles_p = true;
1644 /* If the stmt is not included in all partitions and there are uses
1645 outside of the loop, then mark the partition as a reduction. */
1646 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1648 /* Due to a limitation in the transform phase we have to fuse all
1649 reduction partitions. As a result, this could cancel valid
1650 loop distribution, especially for loops whose induction variable
1651 is used outside of the loop. To work around this issue, we skip
1652 marking the partition as a reduction if the reduction stmt belongs
1653 to all partitions. In such a case, the reduction will be computed
1654 correctly no matter how partitions are fused/distributed. */
1655 if (!bitmap_bit_p (stmt_in_all_partitions, i))
1657 partition->reduction_p = true;
1658 return;
1660 has_reduction = true;
1664 /* Perform general partition disqualification for builtins. */
1665 if (volatiles_p
1666 /* Simple workaround to prevent classifying the partition as builtin
1667 if it contains any use outside of loop. */
1668 || has_reduction
1669 || !flag_tree_loop_distribute_patterns)
1670 return;
1672 /* Find single load/store data references for builtin partition. */
1673 if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1674 return;
1676 /* Classify the builtin kind. */
1677 if (single_ld == NULL)
1678 classify_builtin_st (loop, partition, single_st);
1679 else
1680 classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1683 /* Returns true when PARTITION1 and PARTITION2 access the same memory
1684 object in RDG. */
1686 static bool
1687 share_memory_accesses (struct graph *rdg,
1688 partition *partition1, partition *partition2)
1690 unsigned i, j;
1691 bitmap_iterator bi, bj;
1692 data_reference_p dr1, dr2;
1694 /* First check whether the intersection of the two partitions contains
1695 any loads or stores. Common loads are the situation that happens
1696 most often. */
1697 EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1698 if (RDG_MEM_WRITE_STMT (rdg, i)
1699 || RDG_MEM_READS_STMT (rdg, i))
1700 return true;
1702 /* Then check whether the two partitions access the same memory object. */
1703 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1705 dr1 = datarefs_vec[i];
1707 if (!DR_BASE_ADDRESS (dr1)
1708 || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1709 continue;
1711 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1713 dr2 = datarefs_vec[j];
1715 if (!DR_BASE_ADDRESS (dr2)
1716 || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1717 continue;
1719 if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1720 && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1721 && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1722 && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1723 return true;
1727 return false;
1730 /* For each seed statement in STARTING_STMTS, this function builds
1731 a partition for it by adding dependent statements according to the RDG.
1732 All partitions are recorded in PARTITIONS. */
1734 static void
1735 rdg_build_partitions (struct graph *rdg,
1736 vec<gimple *> starting_stmts,
1737 vec<partition *> *partitions)
1739 auto_bitmap processed;
1740 int i;
1741 gimple *stmt;
1743 FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1745 int v = rdg_vertex_for_stmt (rdg, stmt);
1747 if (dump_file && (dump_flags & TDF_DETAILS))
1748 fprintf (dump_file,
1749 "ldist asked to generate code for vertex %d\n", v);
1751 /* If the vertex is already contained in another partition so
1752 is the partition rooted at it. */
1753 if (bitmap_bit_p (processed, v))
1754 continue;
1756 partition *partition = build_rdg_partition_for_vertex (rdg, v);
1757 bitmap_ior_into (processed, partition->stmts);
1759 if (dump_file && (dump_flags & TDF_DETAILS))
1761 fprintf (dump_file, "ldist creates useful %s partition:\n",
1762 partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1763 bitmap_print (dump_file, partition->stmts, " ", "\n");
1766 partitions->safe_push (partition);
1769 /* All vertices should have been assigned to at least one partition now,
1770 other than vertices belonging to dead code. */
1773 /* Dump to FILE the PARTITIONS. */
1775 static void
1776 dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1778 int i;
1779 partition *partition;
1781 FOR_EACH_VEC_ELT (partitions, i, partition)
1782 debug_bitmap_file (file, partition->stmts);
1785 /* Debug PARTITIONS. */
1786 extern void debug_rdg_partitions (vec<partition *> );
1788 DEBUG_FUNCTION void
1789 debug_rdg_partitions (vec<partition *> partitions)
1791 dump_rdg_partitions (stderr, partitions);
1794 /* Returns the number of read and write operations in the RDG. */
1796 static int
1797 number_of_rw_in_rdg (struct graph *rdg)
1799 int i, res = 0;
1801 for (i = 0; i < rdg->n_vertices; i++)
1803 if (RDG_MEM_WRITE_STMT (rdg, i))
1804 ++res;
1806 if (RDG_MEM_READS_STMT (rdg, i))
1807 ++res;
1810 return res;
1813 /* Returns the number of read and write operations in a PARTITION of
1814 the RDG. */
1816 static int
1817 number_of_rw_in_partition (struct graph *rdg, partition *partition)
1819 int res = 0;
1820 unsigned i;
1821 bitmap_iterator ii;
1823 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
1825 if (RDG_MEM_WRITE_STMT (rdg, i))
1826 ++res;
1828 if (RDG_MEM_READS_STMT (rdg, i))
1829 ++res;
1832 return res;
1835 /* Returns true when one of the PARTITIONS contains all the read or
1836 write operations of RDG. */
1838 static bool
1839 partition_contains_all_rw (struct graph *rdg,
1840 vec<partition *> partitions)
1842 int i;
1843 partition *partition;
1844 int nrw = number_of_rw_in_rdg (rdg);
1846 FOR_EACH_VEC_ELT (partitions, i, partition)
1847 if (nrw == number_of_rw_in_partition (rdg, partition))
1848 return true;
1850 return false;
1853 /* Compute partition dependence created by the data references in DRS1
1854 and DRS2, modify and return DIR according to that. If ALIAS_DDRS is
1855 not NULL, we record dependences introduced by possible aliasing between
1856 two data references in ALIAS_DDRS; otherwise, we simply ignore such
1857 dependence as if it doesn't exist at all. */
1859 static int
1860 pg_add_dependence_edges (struct graph *rdg, int dir,
1861 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
1863 unsigned i, j;
1864 bitmap_iterator bi, bj;
1865 data_reference_p dr1, dr2, saved_dr1;
1867 /* dependence direction - 0 is no dependence, -1 is back,
1868 1 is forth, 2 is both (we can stop then, merging will occur). */
1869 EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
1871 dr1 = datarefs_vec[i];
1873 EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
1875 int res, this_dir = 1;
1876 ddr_p ddr;
1878 dr2 = datarefs_vec[j];
1880 /* Skip all <read, read> data dependence. */
1881 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1882 continue;
1884 saved_dr1 = dr1;
1885 /* Re-shuffle data-refs to be in topological order. */
1886 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1887 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1889 std::swap (dr1, dr2);
1890 this_dir = -this_dir;
1892 ddr = get_data_dependence (rdg, dr1, dr2);
1893 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1895 this_dir = 0;
1896 res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
1897 DR_BASE_ADDRESS (dr2));
1898 /* Be conservative. If data references are not well analyzed,
1899 or the two data references have the same base address and
1900 offset, add dependence and consider it alias to each other.
1901 In other words, the dependence can not be resolved by
1902 runtime alias check. */
1903 if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
1904 || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
1905 || !DR_INIT (dr1) || !DR_INIT (dr2)
1906 || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
1907 || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
1908 || res == 0)
1909 this_dir = 2;
1910 /* The data dependence could be resolved by a runtime alias check;
1911 record it in ALIAS_DDRS. */
1912 else if (alias_ddrs != NULL)
1913 alias_ddrs->safe_push (ddr);
1914 /* Or simply ignore it. */
1916 else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
1918 if (DDR_REVERSED_P (ddr))
1919 this_dir = -this_dir;
1921 /* Known dependences can still be unordered throughout the
1922 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */
1923 if (DDR_NUM_DIST_VECTS (ddr) != 1)
1924 this_dir = 2;
1925 /* If the overlap is exact, preserve stmt order. */
1926 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
1927 DDR_NB_LOOPS (ddr)))
1929 /* Else, as the distance vector is lexicographically positive, swap
1930 the dependence direction. */
1931 else
1932 this_dir = -this_dir;
1934 else
1935 this_dir = 0;
1936 if (this_dir == 2)
1937 return 2;
1938 else if (dir == 0)
1939 dir = this_dir;
1940 else if (this_dir != 0 && dir != this_dir)
1941 return 2;
1942 /* Shuffle "back" dr1. */
1943 dr1 = saved_dr1;
1946 return dir;
1949 /* Compare postorder number of the partition graph vertices V1 and V2. */
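/* Subtracting V2->post - V1->post makes qsort produce a descending post
   order, which is the topological order used when re-emitting partitions.  */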
1951 static int
1952 pgcmp (const void *v1_, const void *v2_)
1954 const vertex *v1 = (const vertex *)v1_;
1955 const vertex *v2 = (const vertex *)v2_;
1956 return v2->post - v1->post;
1959 /* Data attached to vertices of partition dependence graph. */
1960 struct pg_vdata
1962 /* ID of the corresponding partition. */
1963 int id;
1964 /* The partition. */
1965 struct partition *partition;
1968 /* Data attached to edges of partition dependence graph. */
1969 struct pg_edata
1971 /* If the dependence edge can be resolved by runtime alias check,
1972 this vector contains data dependence relations for runtime alias
1973 check. On the other hand, if the dependence edge is introduced
1974 because of compilation time known data dependence, this vector
1975 contains nothing. */
1976 vec<ddr_p> alias_ddrs;
1979 /* Callback data for traversing edges in graph. */
1980 struct pg_edge_callback_data
1982 /* Bitmap of strongly connected components that should be merged. */
1983 bitmap sccs_to_merge;
1984 /* Array containing component information for all vertices. */
1985 int *vertices_component;
1986 /* Vector to record all data dependence relations which are needed
1987 to break strongly connected components by runtime alias checks. */
1988 vec<ddr_p> *alias_ddrs;
1991 /* Initialize the vertices' data for partition dependence graph PG with
1992 PARTITIONS. */
1994 static void
1995 init_partition_graph_vertices (struct graph *pg,
1996 vec<struct partition *> *partitions)
1998 int i;
1999 partition *partition;
2000 struct pg_vdata *data;
2002 for (i = 0; partitions->iterate (i, &partition); ++i)
2004 data = new pg_vdata;
2005 pg->vertices[i].data = data;
2006 data->id = i;
2007 data->partition = partition;
2011 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
2012 dependence relations to the EDGE if DDRS isn't NULL. */
2014 static void
2015 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2017 struct graph_edge *e = add_edge (pg, i, j);
2019 /* If the edge has data dependence relations attached, it means this
2020 dependence edge can be resolved by runtime alias checks. */
2021 if (ddrs != NULL)
2023 struct pg_edata *data = new pg_edata;
2025 gcc_assert (ddrs->length () > 0);
2026 e->data = data;
2027 data->alias_ddrs = vNULL;
2028 data->alias_ddrs.safe_splice (*ddrs);
2032 /* Callback function for the graph traversal algorithm. It returns true
2033 if edge E should be skipped when traversing the graph. */
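/* Only edges carrying alias ddrs, i.e. dependences resolvable by a runtime
   alias check, are skipped; edges for compilation time known dependences are
   always traversed.  */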
2035 static bool
2036 pg_skip_alias_edge (struct graph_edge *e)
2038 struct pg_edata *data = (struct pg_edata *)e->data;
2039 return (data != NULL && data->alias_ddrs.length () > 0);
2042 /* Callback function freeing data attached to edge E of graph. */
2044 static void
2045 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2047 if (e->data != NULL)
2049 struct pg_edata *data = (struct pg_edata *)e->data;
2050 data->alias_ddrs.release ();
2051 delete data;
2055 /* Free data attached to the vertices of partition dependence graph PG. */
2057 static void
2058 free_partition_graph_vdata (struct graph *pg)
2060 int i;
2061 struct pg_vdata *data;
2063 for (i = 0; i < pg->n_vertices; ++i)
2065 data = (struct pg_vdata *)pg->vertices[i].data;
2066 delete data;
2070 /* Build and return the partition dependence graph for PARTITIONS. RDG is
2071 the reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P
2072 is true, data dependence caused by possible aliasing between references
2073 is ignored, as if it doesn't exist at all; otherwise all dependences
2074 are considered. */
2076 static struct graph *
2077 build_partition_graph (struct graph *rdg,
2078 vec<struct partition *> *partitions,
2079 bool ignore_alias_p)
2081 int i, j;
2082 struct partition *partition1, *partition2;
2083 graph *pg = new_graph (partitions->length ());
2084 auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2086 alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2088 init_partition_graph_vertices (pg, partitions);
2090 for (i = 0; partitions->iterate (i, &partition1); ++i)
2092 for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2094 /* dependence direction - 0 is no dependence, -1 is back,
2095 1 is forth, 2 is both (we can stop then, merging will occur). */
2096 int dir = 0;
2098 /* If the first partition has a reduction, add a back edge; if the
2099 second partition has a reduction, add a forth edge. This makes
2100 sure that the reduction partition will be sorted as the last one. */
2101 if (partition_reduction_p (partition1))
2102 dir = -1;
2103 else if (partition_reduction_p (partition2))
2104 dir = 1;
2106 /* Clean up the temporary vector. */
2107 alias_ddrs.truncate (0);
2109 dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2110 partition2->datarefs, alias_ddrs_p);
2112 /* Add an edge to the partition graph if a dependence exists. There
2113 are two types of edges. One type is caused by compilation
2114 time known dependences; this type cannot be resolved by runtime
2115 alias checks. The other type can be resolved by runtime alias
2116 checks. */
2117 if (dir == 1 || dir == 2
2118 || alias_ddrs.length () > 0)
2120 /* Attach data dependence relations to edge that can be resolved
2121 by runtime alias check. */
2122 bool alias_edge_p = (dir != 1 && dir != 2);
2123 add_partition_graph_edge (pg, i, j,
2124 (alias_edge_p) ? &alias_ddrs : NULL);
2126 if (dir == -1 || dir == 2
2127 || alias_ddrs.length () > 0)
2129 /* Attach data dependence relations to edge that can be resolved
2130 by runtime alias check. */
2131 bool alias_edge_p = (dir != -1 && dir != 2);
2132 add_partition_graph_edge (pg, j, i,
2133 (alias_edge_p) ? &alias_ddrs : NULL);
2137 return pg;
2140 /* Sort partitions in PG in descending post order and store them in
2141 PARTITIONS. */
2143 static void
2144 sort_partitions_by_post_order (struct graph *pg,
2145 vec<struct partition *> *partitions)
2147 int i;
2148 struct pg_vdata *data;
2150 /* Now order the remaining nodes in descending postorder. */
2151 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2152 partitions->truncate (0);
2153 for (i = 0; i < pg->n_vertices; ++i)
2155 data = (struct pg_vdata *)pg->vertices[i].data;
2156 if (data->partition)
2157 partitions->safe_push (data->partition);
2161 /* Given the reduced dependence graph RDG, merge the strongly connected
2162 components of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused
2163 by possible aliasing between references is ignored, as if it doesn't exist
2164 at all; otherwise all dependences are considered. */
2166 static void
2167 merge_dep_scc_partitions (struct graph *rdg,
2168 vec<struct partition *> *partitions,
2169 bool ignore_alias_p)
2171 struct partition *partition1, *partition2;
2172 struct pg_vdata *data;
2173 graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2174 int i, j, num_sccs = graphds_scc (pg, NULL);
2176 /* A strongly connected component means a dependence cycle; we cannot
2177 distribute its partitions, so fuse them together. */
2178 if ((unsigned) num_sccs < partitions->length ())
2180 for (i = 0; i < num_sccs; ++i)
2182 for (j = 0; partitions->iterate (j, &partition1); ++j)
2183 if (pg->vertices[j].component == i)
2184 break;
2185 for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2186 if (pg->vertices[j].component == i)
2188 partition_merge_into (NULL, partition1,
2189 partition2, FUSE_SAME_SCC);
2190 partition1->type = PTYPE_SEQUENTIAL;
2191 (*partitions)[j] = NULL;
2192 partition_free (partition2);
2193 data = (struct pg_vdata *)pg->vertices[j].data;
2194 data->partition = NULL;
2199 sort_partitions_by_post_order (pg, partitions);
2200 gcc_assert (partitions->length () == (unsigned)num_sccs);
2201 free_partition_graph_vdata (pg);
2202 free_graph (pg);
2205 /* Callback function for traversing edge E in graph G. DATA is private
2206 callback data. */
2208 static void
2209 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2211 int i, j, component;
2212 struct pg_edge_callback_data *cbdata;
2213 struct pg_edata *edata = (struct pg_edata *) e->data;
2215 /* If the edge doesn't have attached data dependences, it represents
2216 compilation time known dependences. This type of dependence cannot
2217 be resolved by a runtime alias check. */
2218 if (edata == NULL || edata->alias_ddrs.length () == 0)
2219 return;
2221 cbdata = (struct pg_edge_callback_data *) data;
2222 i = e->src;
2223 j = e->dest;
2224 component = cbdata->vertices_component[i];
2225 /* Vertices are topologically sorted according to compilation time
2226 known dependences, so we can break strongly connected components
2227 by removing edges of the opposite direction, i.e., edges pointing
2228 from the vertex with the smaller post number to the vertex with the
2229 bigger post number. */
2230 if (g->vertices[i].post < g->vertices[j].post
2231 /* We only need to remove edges connecting vertices in the same
2232 strongly connected component to break it. */
2233 && component == cbdata->vertices_component[j]
2234 /* Check if we want to break the strong connected component or not. */
2235 && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2236 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2239 /* This is the main function breaking strongly connected components in
2240 PARTITIONS, given the reduced dependence graph RDG. Store data dependence
2241 relations for the runtime alias check in ALIAS_DDRS. */
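/* Outline: the partition graph is built with alias edges included and its
   SCCs are computed.  SCCs whose partitions all have the same type are simply
   merged; for the remaining SCCs, the alias ddrs whose removal breaks the SCC
   are collected into ALIAS_DDRS for runtime checking.  Finally partitions are
   re-sorted in descending post order.  */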
2243 static void
2244 break_alias_scc_partitions (struct graph *rdg,
2245 vec<struct partition *> *partitions,
2246 vec<ddr_p> *alias_ddrs)
2248 int i, j, k, num_sccs, num_sccs_no_alias;
2249 /* Build partition dependence graph. */
2250 graph *pg = build_partition_graph (rdg, partitions, false);
2252 alias_ddrs->truncate (0);
2253 /* Find strongly connected components in the graph, with all dependence edges
2254 considered. */
2255 num_sccs = graphds_scc (pg, NULL);
2256 /* All SCCs can now be broken by runtime alias checks because SCCs caused by
2257 compilation time known dependences were merged before this function. */
2258 if ((unsigned) num_sccs < partitions->length ())
2260 struct pg_edge_callback_data cbdata;
2261 auto_bitmap sccs_to_merge;
2262 auto_vec<enum partition_type> scc_types;
2263 struct partition *partition, *first;
2265 /* If all partitions in an SCC have the same type, we can simply merge the
2266 SCC. This loop finds such SCCs and records them in the bitmap. */
2267 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2268 for (i = 0; i < num_sccs; ++i)
2270 for (j = 0; partitions->iterate (j, &first); ++j)
2271 if (pg->vertices[j].component == i)
2272 break;
2273 for (++j; partitions->iterate (j, &partition); ++j)
2275 if (pg->vertices[j].component != i)
2276 continue;
2278 /* Note we merge partitions of parallel type on purpose, even though
2279 the resulting partition is sequential. The reason is that the vectorizer
2280 can do a more accurate runtime alias check in this case. It also
2281 results in a more conservative distribution. */
2282 if (first->type != partition->type)
2284 bitmap_clear_bit (sccs_to_merge, i);
2285 break;
2290 /* Initialize callback data for traversing. */
2291 cbdata.sccs_to_merge = sccs_to_merge;
2292 cbdata.alias_ddrs = alias_ddrs;
2293 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2294 /* Record the component information, which will be corrupted by the next
2295 SCC finding call on the graph. */
2296 for (i = 0; i < pg->n_vertices; ++i)
2297 cbdata.vertices_component[i] = pg->vertices[i].component;
2299 /* Collect data dependences for runtime alias checks to break SCCs. */
2300 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2302 /* Run the SCC finding algorithm again, with alias dependence edges
2303 skipped. This topologically sorts partitions according to
2304 compilation time known dependences. Note the topological order
2305 is stored in the form of PG's post order numbers. */
2306 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2307 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2308 /* With the topological order, we can construct two subgraphs L and R.
2309 L contains edge <x, y> where x < y in terms of post order, while
2310 R contains edge <x, y> where x > y. Edges for compilation time
2311 known dependences all fall in R, so we break SCCs by removing all
2312 (alias) edges in subgraph L. */
2313 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2316 /* For each SCC that doesn't need to be broken, merge it. */
2317 for (i = 0; i < num_sccs; ++i)
2319 if (!bitmap_bit_p (sccs_to_merge, i))
2320 continue;
2322 for (j = 0; partitions->iterate (j, &first); ++j)
2323 if (cbdata.vertices_component[j] == i)
2324 break;
2325 for (k = j + 1; partitions->iterate (k, &partition); ++k)
2327 struct pg_vdata *data;
2329 if (cbdata.vertices_component[k] != i)
2330 continue;
2332 /* Update postorder number so that merged reduction partition is
2333 sorted after other partitions. */
2334 if (!partition_reduction_p (first)
2335 && partition_reduction_p (partition))
2337 gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
2338 pg->vertices[j].post = pg->vertices[k].post;
2340 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2341 (*partitions)[k] = NULL;
2342 partition_free (partition);
2343 data = (struct pg_vdata *)pg->vertices[k].data;
2344 gcc_assert (data->id == k);
2345 data->partition = NULL;
2346 /* The result partition of merged SCC must be sequential. */
2347 first->type = PTYPE_SEQUENTIAL;
2352 sort_partitions_by_post_order (pg, partitions);
2353 free_partition_graph_vdata (pg);
2354 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2355 free_graph (pg);
2357 if (dump_file && (dump_flags & TDF_DETAILS))
2359 fprintf (dump_file, "Possible alias data dependence to break:\n");
2360 dump_data_dependence_relations (dump_file, *alias_ddrs);
2364 /* Compute and return an expression whose value is the segment length which
2365 will be accessed by DR in NITERS iterations. */
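/* The segment length is simply (NITERS - 1) * DR_STEP; the size of the
   access itself is accounted for separately when the runtime alias check
   pairs are built.  */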
2367 static tree
2368 data_ref_segment_size (struct data_reference *dr, tree niters)
2370 niters = size_binop (MINUS_EXPR,
2371 fold_convert (sizetype, niters),
2372 size_one_node);
2373 return size_binop (MULT_EXPR,
2374 fold_convert (sizetype, DR_STEP (dr)),
2375 fold_convert (sizetype, niters));
2378 /* Return true if LOOP's latch is dominated by statement for data reference
2379 DR. */
2381 static inline bool
2382 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2384 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2385 gimple_bb (DR_STMT (dr)));
2388 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2389 data dependence relations ALIAS_DDRS. */
2391 static void
2392 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2393 vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2395 unsigned int i;
2396 unsigned HOST_WIDE_INT factor = 1;
2397 tree niters_plus_one, niters = number_of_latch_executions (loop);
2399 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2400 niters = fold_convert (sizetype, niters);
2401 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2403 if (dump_file && (dump_flags & TDF_DETAILS))
2404 fprintf (dump_file, "Creating alias check pairs:\n");
2406 /* Iterate all data dependence relations and compute alias check pairs. */
2407 for (i = 0; i < alias_ddrs->length (); i++)
2409 ddr_p ddr = (*alias_ddrs)[i];
2410 struct data_reference *dr_a = DDR_A (ddr);
2411 struct data_reference *dr_b = DDR_B (ddr);
2412 tree seg_length_a, seg_length_b;
2413 int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2414 DR_BASE_ADDRESS (dr_b));
2416 if (comp_res == 0)
2417 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2418 gcc_assert (comp_res != 0);
2420 if (latch_dominated_by_data_ref (loop, dr_a))
2421 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2422 else
2423 seg_length_a = data_ref_segment_size (dr_a, niters);
2425 if (latch_dominated_by_data_ref (loop, dr_b))
2426 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2427 else
2428 seg_length_b = data_ref_segment_size (dr_b, niters);
2430 unsigned HOST_WIDE_INT access_size_a
2431 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2432 unsigned HOST_WIDE_INT access_size_b
2433 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2434 unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2435 unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2437 dr_with_seg_len_pair_t dr_with_seg_len_pair
2438 (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2439 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b));
2441 /* Canonicalize pairs by sorting the two DR members. */
2442 if (comp_res > 0)
2443 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2445 comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2448 if (tree_fits_uhwi_p (niters))
2449 factor = tree_to_uhwi (niters);
2451 /* Prune alias check pairs. */
2452 prune_runtime_alias_test_list (comp_alias_pairs, factor);
2453 if (dump_file && (dump_flags & TDF_DETAILS))
2454 fprintf (dump_file,
2455 "Improved number of alias checks from %d to %d\n",
2456 alias_ddrs->length (), comp_alias_pairs->length ());
2459 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2460 checks and version LOOP under condition of these runtime alias checks. */
2462 static void
2463 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2465 profile_probability prob;
2466 basic_block cond_bb;
2467 struct loop *nloop;
2468 tree lhs, arg0, cond_expr = NULL_TREE;
2469 gimple_seq cond_stmts = NULL;
2470 gimple *call_stmt = NULL;
2471 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2473 /* Generate code for runtime alias checks if necessary. */
2474 gcc_assert (alias_ddrs->length () > 0);
2476 if (dump_file && (dump_flags & TDF_DETAILS))
2477 fprintf (dump_file,
2478 "Version loop <%d> with runtime alias check\n", loop->num);
2480 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2481 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2482 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2483 is_gimple_val, NULL_TREE);
2485 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2486 if (flag_tree_loop_vectorize)
2488 /* Generate internal function call for loop distribution alias check. */
2489 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2490 2, NULL_TREE, cond_expr);
2491 lhs = make_ssa_name (boolean_type_node);
2492 gimple_call_set_lhs (call_stmt, lhs);
2494 else
2495 lhs = cond_expr;
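/* Use a guessed probability of 90% for the versioning condition.  */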
2497 prob = profile_probability::guessed_always ().apply_scale (9, 10);
2498 initialize_original_copy_tables ();
2499 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2500 prob, prob.invert (), true);
2501 free_original_copy_tables ();
2502 /* Record the original loop number in newly generated loops. In case of
2503 distribution, the original loop will be distributed and the new loop
2504 is kept. */
2505 loop->orig_loop_num = nloop->num;
2506 nloop->orig_loop_num = nloop->num;
2507 nloop->dont_vectorize = true;
2508 nloop->force_vectorize = false;
2510 if (call_stmt)
2512 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2513 loop could be destroyed. */
2514 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2515 gimple_call_set_arg (call_stmt, 0, arg0);
2516 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2519 if (cond_stmts)
2521 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2522 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2524 update_ssa (TODO_update_ssa);
2527 /* Return true if loop versioning is needed to distribute PARTITIONS.
2528 ALIAS_DDRS are data dependence relations for the runtime alias check. */
2530 static inline bool
2531 version_for_distribution_p (vec<struct partition *> *partitions,
2532 vec<ddr_p> *alias_ddrs)
2534 /* No need to version loop if we have only one partition. */
2535 if (partitions->length () == 1)
2536 return false;
2538 /* Need to version loop if runtime alias check is necessary. */
2539 return (alias_ddrs->length () > 0);
2542 /* Compare base offset of builtin mem* partitions P1 and P2. */
2544 static bool
2545 offset_cmp (struct partition *p1, struct partition *p2)
2547 gcc_assert (p1 != NULL && p1->builtin != NULL);
2548 gcc_assert (p2 != NULL && p2->builtin != NULL);
2549 return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset;
2552 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special
2553 case optimization transforming the code below:
2555 __builtin_memset (&obj, 0, 100);
2556 _1 = &obj + 100;
2557 __builtin_memset (_1, 0, 200);
2558 _2 = &obj + 300;
2559 __builtin_memset (_2, 0, 100);
2561 into:
2563 __builtin_memset (&obj, 0, 400);
2565 Note we don't have dependence information between different partitions
2566 at this point; as a result, we can't handle non-adjacent memset builtin
2567 partitions since dependences might be broken. */
2569 static void
2570 fuse_memset_builtins (vec<struct partition *> *partitions)
2572 unsigned i, j;
2573 struct partition *part1, *part2;
2574 tree rhs1, rhs2;
2576 for (i = 0; partitions->iterate (i, &part1);)
2578 if (part1->kind != PKIND_MEMSET)
2580 i++;
2581 continue;
2584 /* Find sub-array of memset builtins of the same base. Index range
2585 of the sub-array is [i, j) with "j > i". */
2586 for (j = i + 1; partitions->iterate (j, &part2); ++j)
2588 if (part2->kind != PKIND_MEMSET
2589 || !operand_equal_p (part1->builtin->dst_base_base,
2590 part2->builtin->dst_base_base, 0))
2591 break;
2593 /* Memset calls setting different values can't be merged. */
2594 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2595 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2596 if (!operand_equal_p (rhs1, rhs2, 0))
2597 break;
2600 /* Stable sort is required in order to avoid breaking dependence. */
2601 std::stable_sort (&(*partitions)[i],
2602 &(*partitions)[i] + j - i, offset_cmp);
2603 /* Continue with next partition. */
2604 i = j;
2607 /* Merge all consecutive memset builtin partitions. */
2608 for (i = 0; i < partitions->length () - 1;)
2610 part1 = (*partitions)[i];
2611 if (part1->kind != PKIND_MEMSET)
2613 i++;
2614 continue;
2617 part2 = (*partitions)[i + 1];
2618 /* Only merge memset partitions of the same base and with constant
2619 access sizes. */
2620 if (part2->kind != PKIND_MEMSET
2621 || TREE_CODE (part1->builtin->size) != INTEGER_CST
2622 || TREE_CODE (part2->builtin->size) != INTEGER_CST
2623 || !operand_equal_p (part1->builtin->dst_base_base,
2624 part2->builtin->dst_base_base, 0))
2626 i++;
2627 continue;
2629 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2630 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2631 int bytev1 = const_with_all_bytes_same (rhs1);
2632 int bytev2 = const_with_all_bytes_same (rhs2);
2633 /* Only merge memset partitions of the same value. */
2634 if (bytev1 != bytev2 || bytev1 == -1)
2636 i++;
2637 continue;
2639 wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2640 wi::to_wide (part1->builtin->size));
2641 /* Only merge adjacent memset partitions. */
2642 if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2644 i++;
2645 continue;
2647 /* Merge partitions[i] and partitions[i+1]. */
2648 part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2649 part1->builtin->size,
2650 part2->builtin->size);
2651 partition_free (part2);
2652 partitions->ordered_remove (i + 1);
2656 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
2657 ALIAS_DDRS contains ddrs which need runtime alias check. */
2659 static void
2660 finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
2661 vec<ddr_p> *alias_ddrs)
2663 unsigned i;
2664 struct partition *partition, *a;
2666 if (partitions->length () == 1
2667 || alias_ddrs->length () > 0)
2668 return;
2670 unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2671 bool same_type_p = true;
2672 enum partition_type type = ((*partitions)[0])->type;
2673 for (i = 0; partitions->iterate (i, &partition); ++i)
2675 same_type_p &= (type == partition->type);
2676 if (partition_builtin_p (partition))
2678 num_builtin++;
2679 continue;
2681 num_normal++;
2682 if (partition->kind == PKIND_PARTIAL_MEMSET)
2683 num_partial_memset++;
2686 /* Don't distribute the current loop into too many loops given that we
2687 don't have a memory stream cost model. Be even more conservative in
2688 the case of loop nest distribution. */
2689 if ((same_type_p && num_builtin == 0
2690 && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2691 || (loop->inner != NULL
2692 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2693 || (loop->inner == NULL
2694 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
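/* Fuse all partitions into the first one.  */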
2696 a = (*partitions)[0];
2697 for (i = 1; partitions->iterate (i, &partition); ++i)
2699 partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2700 partition_free (partition);
2702 partitions->truncate (1);
2705 /* Fuse memset builtins if possible. */
2706 if (partitions->length () > 1)
2707 fuse_memset_builtins (partitions);
2710 /* Distributes the code from LOOP in such a way that producer statements
2711 are placed before consumer statements. Tries to separate only the
2712 statements from STMTS into separate loops. Returns the number of
2713 distributed loops. Set NB_CALLS to number of generated builtin calls.
2714 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2716 static int
2717 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2718 control_dependences *cd, int *nb_calls, bool *destroy_p)
2720 ddrs_table = new hash_table<ddr_hasher> (389);
2721 struct graph *rdg;
2722 partition *partition;
2723 bool any_builtin;
2724 int i, nbp;
2726 *destroy_p = false;
2727 *nb_calls = 0;
2728 loop_nest.create (0);
2729 if (!find_loop_nest (loop, &loop_nest))
2731 loop_nest.release ();
2732 delete ddrs_table;
2733 return 0;
2736 datarefs_vec.create (20);
2737 rdg = build_rdg (loop, cd);
2738 if (!rdg)
2740 if (dump_file && (dump_flags & TDF_DETAILS))
2741 fprintf (dump_file,
2742 "Loop %d not distributed: failed to build the RDG.\n",
2743 loop->num);
2745 loop_nest.release ();
2746 free_data_refs (datarefs_vec);
2747 delete ddrs_table;
2748 return 0;
2751 if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2753 if (dump_file && (dump_flags & TDF_DETAILS))
2754 fprintf (dump_file,
2755 "Loop %d not distributed: too many memory references.\n",
2756 loop->num);
2758 free_rdg (rdg);
2759 loop_nest.release ();
2760 free_data_refs (datarefs_vec);
2761 delete ddrs_table;
2762 return 0;
2765 data_reference_p dref;
2766 for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2767 dref->aux = (void *) (uintptr_t) i;
2769 if (dump_file && (dump_flags & TDF_DETAILS))
2770 dump_rdg (dump_file, rdg);
2772 auto_vec<struct partition *, 3> partitions;
2773 rdg_build_partitions (rdg, stmts, &partitions);
2775 auto_vec<ddr_p> alias_ddrs;
2777 auto_bitmap stmt_in_all_partitions;
2778 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2779 for (i = 1; partitions.iterate (i, &partition); ++i)
2780 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2782 any_builtin = false;
2783 FOR_EACH_VEC_ELT (partitions, i, partition)
2785 classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2786 any_builtin |= partition_builtin_p (partition);
2789 /* If we are only distributing patterns but did not detect any,
2790 simply bail out. */
2791 if (!flag_tree_loop_distribution
2792 && !any_builtin)
2794 nbp = 0;
2795 goto ldist_done;
2798 /* If we are only distributing patterns, fuse all partitions that
2799 were not classified as builtins. This also avoids chopping
2800 a loop into pieces separated by builtin calls. That is, we
2801 want either no loop body or only a single loop body remaining. */
2802 struct partition *into;
2803 if (!flag_tree_loop_distribution)
2805 for (i = 0; partitions.iterate (i, &into); ++i)
2806 if (!partition_builtin_p (into))
2807 break;
2808 for (++i; partitions.iterate (i, &partition); ++i)
2809 if (!partition_builtin_p (partition))
2811 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2812 partitions.unordered_remove (i);
2813 partition_free (partition);
2814 i--;
2818 /* Due to limitations in the transform phase we have to fuse all
2819 reduction partitions into the last partition so the existing
2820 loop will contain all loop-closed PHI nodes. */
2821 for (i = 0; partitions.iterate (i, &into); ++i)
2822 if (partition_reduction_p (into))
2823 break;
2824 for (i = i + 1; partitions.iterate (i, &partition); ++i)
2825 if (partition_reduction_p (partition))
2827 partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2828 partitions.unordered_remove (i);
2829 partition_free (partition);
2830 i--;
2833 /* Apply our simple cost model - fuse partitions with similar
2834 memory accesses. */
2835 for (i = 0; partitions.iterate (i, &into); ++i)
2837 bool changed = false;
2838 if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
2839 continue;
2840 for (int j = i + 1;
2841 partitions.iterate (j, &partition); ++j)
2843 if (share_memory_accesses (rdg, into, partition))
2845 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2846 partitions.unordered_remove (j);
2847 partition_free (partition);
2848 j--;
2849 changed = true;
2852 /* If in step 1 we fused 0 1 2 into 0,2 and 1 because 0 and 2 have
2853 similar accesses, while 1 and 2 have similar accesses but 0 and 1
2854 do not, then in the next iteration we would fail to consider merging
2855 1 into 0,2. So try again if we did any merging into 0. */
2856 if (changed)
2857 i--;
2860 /* Build the partition dependence graph and fuse partitions in strongly
2861 connected components. */
2862 if (partitions.length () > 1)
2864 /* Loop nest distribution is not supported under runtime alias checks
2865 since it's not likely to enable many vectorization opportunities. */
2866 if (loop->inner)
2867 merge_dep_scc_partitions (rdg, &partitions, false);
2868 else
2870 merge_dep_scc_partitions (rdg, &partitions, true);
2871 if (partitions.length () > 1)
2872 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2876 finalize_partitions (loop, &partitions, &alias_ddrs);
2878 nbp = partitions.length ();
2879 if (nbp == 0
2880 || (nbp == 1 && !partition_builtin_p (partitions[0]))
2881 || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2883 nbp = 0;
2884 goto ldist_done;
2887 if (version_for_distribution_p (&partitions, &alias_ddrs))
2888 version_loop_by_alias_check (loop, &alias_ddrs);
2890 if (dump_file && (dump_flags & TDF_DETAILS))
2892 fprintf (dump_file,
2893 "distribute loop <%d> into partitions:\n", loop->num);
2894 dump_rdg_partitions (dump_file, partitions);
2897 FOR_EACH_VEC_ELT (partitions, i, partition)
2899 if (partition_builtin_p (partition))
2900 (*nb_calls)++;
2901 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2904 ldist_done:
2905 loop_nest.release ();
2906 free_data_refs (datarefs_vec);
2907 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2908 iter != ddrs_table->end (); ++iter)
2910 free_dependence_relation (*iter);
2911 *iter = NULL;
2913 delete ddrs_table;
2915 FOR_EACH_VEC_ELT (partitions, i, partition)
2916 partition_free (partition);
2918 free_rdg (rdg);
2919 return nbp - *nb_calls;
2922 /* Distribute all loops in the current function. */
2924 namespace {
2926 const pass_data pass_data_loop_distribution =
2928 GIMPLE_PASS, /* type */
2929 "ldist", /* name */
2930 OPTGROUP_LOOP, /* optinfo_flags */
2931 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2932 ( PROP_cfg | PROP_ssa ), /* properties_required */
2933 0, /* properties_provided */
2934 0, /* properties_destroyed */
2935 0, /* todo_flags_start */
2936 0, /* todo_flags_finish */
2939 class pass_loop_distribution : public gimple_opt_pass
2941 public:
2942 pass_loop_distribution (gcc::context *ctxt)
2943 : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2946 /* opt_pass methods: */
2947 virtual bool gate (function *)
2949 return flag_tree_loop_distribution
2950 || flag_tree_loop_distribute_patterns;
2953 virtual unsigned int execute (function *);
2955 }; // class pass_loop_distribution
2958 /* Given LOOP, this function records seed statements for distribution in
2959 WORK_LIST. Return false if there is nothing for distribution. */
2961 static bool
2962 find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
2964 basic_block *bbs = get_loop_body_in_dom_order (loop);
2966 /* Initialize the worklist with stmts we seed the partitions with. */
2967 for (unsigned i = 0; i < loop->num_nodes; ++i)
2969 for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2970 !gsi_end_p (gsi); gsi_next (&gsi))
2972 gphi *phi = gsi.phi ();
2973 if (virtual_operand_p (gimple_phi_result (phi)))
2974 continue;
2975 /* Distribute stmts which have defs that are used outside of
2976 the loop. */
2977 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
2978 continue;
2979 work_list->safe_push (phi);
2981 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2982 !gsi_end_p (gsi); gsi_next (&gsi))
2984 gimple *stmt = gsi_stmt (gsi);
2986 /* If there is a stmt with side-effects bail out - we
2987 cannot and should not distribute this loop. */
2988 if (gimple_has_side_effects (stmt))
2990 free (bbs);
2991 return false;
2994 /* Distribute stmts which have defs that are used outside of
2995 the loop. */
2996 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
2998 /* Otherwise only distribute stores for now. */
2999 else if (!gimple_vdef (stmt))
3000 continue;
3002 work_list->safe_push (stmt);
3005 free (bbs);
3006 return work_list->length () > 0;
3009 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3010 perfect loop nest. */
3012 static struct loop *
3013 prepare_perfect_loop_nest (struct loop *loop)
3015 struct loop *outer = loop_outer (loop);
3016 tree niters = number_of_latch_executions (loop);
3018 /* TODO: We only support the innermost 3-level loop nest distribution
3019 because of compilation time issues for now. This should be relaxed
3020 in the future. Note we only allow 3-level loop nest distribution
3021 when parallelizing loops. */
3022 while ((loop->inner == NULL
3023 || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3024 && loop_outer (outer)
3025 && outer->inner == loop && loop->next == NULL
3026 && single_exit (outer)
3027 && optimize_loop_for_speed_p (outer)
3028 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3029 && (niters = number_of_latch_executions (outer)) != NULL_TREE
3030 && niters != chrec_dont_know)
3032 loop = outer;
3033 outer = loop_outer (loop);
3036 return loop;
3039 unsigned int
3040 pass_loop_distribution::execute (function *fun)
3042 struct loop *loop;
3043 bool changed = false;
3044 basic_block bb;
3045 control_dependences *cd = NULL;
3046 auto_vec<loop_p> loops_to_be_destroyed;
3048 if (number_of_loops (fun) <= 1)
3049 return 0;
3051 /* Compute topological order for basic blocks. Topological order is
3052 needed because data dependence is computed for data references in
3053 lexicographical order. */
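/* bb_top_order_index maps a basic block index to its position in the
   reverse post order computed below.  */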
3054 if (bb_top_order_index == NULL)
3056 int rpo_num;
3057 int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
3059 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3060 bb_top_order_index_size = last_basic_block_for_fn (cfun);
3061 rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
3062 for (int i = 0; i < rpo_num; i++)
3063 bb_top_order_index[rpo[i]] = i;
3065 free (rpo);
3068 FOR_ALL_BB_FN (bb, fun)
3070 gimple_stmt_iterator gsi;
3071 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3072 gimple_set_uid (gsi_stmt (gsi), -1);
3073 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3074 gimple_set_uid (gsi_stmt (gsi), -1);
3077 /* We can at the moment only distribute non-nested loops, thus restrict
3078 walking to innermost loops. */
3079 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
3081 /* Don't distribute loops with multiple exit edges, or cold loops. */
3082 if (!single_exit (loop)
3083 || !optimize_loop_for_speed_p (loop))
3084 continue;
3086 /* Don't distribute loop if niters is unknown. */
3087 tree niters = number_of_latch_executions (loop);
3088 if (niters == NULL_TREE || niters == chrec_dont_know)
3089 continue;
3091 /* Get the perfect loop nest for distribution. */
3092 loop = prepare_perfect_loop_nest (loop);
3093 for (; loop; loop = loop->inner)
3095 auto_vec<gimple *> work_list;
3096 if (!find_seed_stmts_for_distribution (loop, &work_list))
3097 break;
3099 const char *str = loop->inner ? " nest" : "";
3100 location_t loc = find_loop_location (loop);
3101 if (!cd)
3103 calculate_dominance_info (CDI_DOMINATORS);
3104 calculate_dominance_info (CDI_POST_DOMINATORS);
3105 cd = new control_dependences ();
3106 free_dominance_info (CDI_POST_DOMINATORS);
3109 bool destroy_p;
3110 int nb_generated_loops, nb_generated_calls;
3111 nb_generated_loops = distribute_loop (loop, work_list, cd,
3112 &nb_generated_calls,
3113 &destroy_p);
3114 if (destroy_p)
3115 loops_to_be_destroyed.safe_push (loop);
3117 if (nb_generated_loops + nb_generated_calls > 0)
3119 changed = true;
3120 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3121 loc, "Loop%s %d distributed: split to %d loops "
3122 "and %d library calls.\n", str, loop->num,
3123 nb_generated_loops, nb_generated_calls);
3125 break;
3128 if (dump_file && (dump_flags & TDF_DETAILS))
3129 fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3133 if (cd)
3134 delete cd;
3136 if (bb_top_order_index != NULL)
3138 free (bb_top_order_index);
3139 bb_top_order_index = NULL;
3140 bb_top_order_index_size = 0;
3143 if (changed)
3145 /* Destroy loop bodies that could not be reused. Do this late as we
3146 otherwise can end up referring to stale data in control dependences. */
3147 unsigned i;
3148 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3149 destroy_loop (loop);
3151 /* Cached scalar evolutions may now refer to wrong or non-existent
3152 loops. */
3153 scev_reset_htab ();
3154 mark_virtual_operands_for_renaming (fun);
3155 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3158 checking_verify_loop_structure ();
3160 return changed ? TODO_cleanup_cfg : 0;
3163 } // anon namespace
3165 gimple_opt_pass *
3166 make_pass_loop_distribution (gcc::context *ctxt)
3168 return new pass_loop_distribution (ctxt);