gcc/tree-loop-distribution.c
1 /* Loop distribution.
2 Copyright (C) 2006-2018 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 /* This pass performs loop distribution: for example, the loop
24 |DO I = 2, N
25 | A(I) = B(I) + C
26 | D(I) = A(I-1)*E
27 |ENDDO
29 is transformed to
31 |DOALL I = 2, N
32 | A(I) = B(I) + C
33 |ENDDO
35 |DOALL I = 2, N
36 | D(I) = A(I-1)*E
37 |ENDDO
39 Loop distribution is the dual of loop fusion. It separates statements
40 of a loop (or loop nest) into multiple loops (or loop nests) with the
41 same loop header. The major goal is to separate statements which may
42 be vectorized from those that can't. This pass implements distribution
43 in the following steps:
45    1) Seed partitions with statements of specific types.  For now we support
46       two types of seed statements: statements defining a variable used
47       outside of the loop, and statements storing to memory.
48 2) Build reduced dependence graph (RDG) for loop to be distributed.
49 The vertices (RDG:V) model all statements in the loop and the edges
50 (RDG:E) model flow and control dependencies between statements.
51 3) Apart from RDG, compute data dependencies between memory references.
52    4) Starting from a seed statement, build up the partition by adding
53       dependent statements according to the RDG's dependence information.
54       A partition is classified as parallel type if it can be executed in
55       parallel, or as sequential type if it can't.  A parallel type partition
56       is further classified into different builtin kinds if it can be
57       implemented as builtin function calls.
58 5) Build partition dependence graph (PG) based on data dependencies.
59 The vertices (PG:V) model all partitions and the edges (PG:E) model
60       all data dependencies between every pair of partitions.  In general,
61       a data dependence is either known or unknown at compilation time.  In
62       C family languages, many dependencies are unknown at compilation time
63       because of possible alias relations between data references.
64       We categorize PG's edges into two types: "true" edges that represent
65       compilation time known data dependencies; "alias" edges for all other
66       data dependencies.
67    6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
68       Merge partitions in each strongly connected component (SCC) accordingly.
69 Build new PG for merged partitions.
70 7) Traverse PG again and this time with both "true" and "alias" edges
71       included.  We try to break SCCs by removing some edges.  Because
72       SCCs formed by "true" edges are all fused in step 6), we can break
73       SCCs by removing some "alias" edges.  It's NP-hard to choose an
74       optimal edge set, but fortunately a simple approximation is good
75       enough for us given the small problem scale.
76 8) Collect all data dependencies of the removed "alias" edges. Create
77 runtime alias checks for collected data dependencies.
78    9) Version the loop under the condition of runtime alias checks.  Since
79       loop distribution generally introduces additional overhead, it is
80       only useful if vectorization is achieved in the distributed loop.  We
81       version the loop with the internal function call IFN_LOOP_DIST_ALIAS.
82       If no distributed loop can be vectorized, we simply remove the
83       distributed loops and revert to the original one.
85 TODO:
86 1) We only distribute innermost two-level loop nest now. We should
87 extend it for arbitrary loop nests in the future.
88 2) We only fuse partitions in SCC now. A better fusion algorithm is
89 desired to minimize loop overhead, maximize parallelism and maximize
90 data reuse. */
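/* An illustrative sketch (added for exposition; the array names A, C, D and
   the bound N are hypothetical and not taken from this file): in C, a loop
   such as

     for (i = 0; i < n; i++)
       {
         a[i] = 0;
         d[i] = c[i] * c[i];
       }

   may be split into a partition containing only the zeroing store, which can
   be classified as a memset builtin and emitted as a single call
   memset (a, 0, n * sizeof (a[0])), and a second loop computing D that is a
   better candidate for vectorization.  */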
92 #include "config.h"
93 #define INCLUDE_ALGORITHM /* stable_sort */
94 #include "system.h"
95 #include "coretypes.h"
96 #include "backend.h"
97 #include "tree.h"
98 #include "gimple.h"
99 #include "cfghooks.h"
100 #include "tree-pass.h"
101 #include "ssa.h"
102 #include "gimple-pretty-print.h"
103 #include "fold-const.h"
104 #include "cfganal.h"
105 #include "gimple-iterator.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-ssa-loop-manip.h"
110 #include "tree-ssa-loop-ivopts.h"
111 #include "tree-ssa-loop.h"
112 #include "tree-into-ssa.h"
113 #include "tree-ssa.h"
114 #include "cfgloop.h"
115 #include "tree-scalar-evolution.h"
116 #include "params.h"
117 #include "tree-vectorizer.h"
120 #define MAX_DATAREFS_NUM \
121 ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
123 /* Threshold controlling number of distributed partitions. Given it may
124 be unnecessary if a memory stream cost model is invented in the future,
125 we define it as a temporary macro, rather than a parameter. */
126 #define NUM_PARTITION_THRESHOLD (4)
128 /* Hashtable helpers. */
130 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
132 static inline hashval_t hash (const data_dependence_relation *);
133 static inline bool equal (const data_dependence_relation *,
134 const data_dependence_relation *);
137 /* Hash function for data dependence. */
139 inline hashval_t
140 ddr_hasher::hash (const data_dependence_relation *ddr)
142 inchash::hash h;
143 h.add_ptr (DDR_A (ddr));
144 h.add_ptr (DDR_B (ddr));
145 return h.end ();
148 /* Hash table equality function for data dependence. */
150 inline bool
151 ddr_hasher::equal (const data_dependence_relation *ddr1,
152 const data_dependence_relation *ddr2)
154 return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
157 /* The loop (nest) to be distributed. */
158 static vec<loop_p> loop_nest;
160 /* Vector of data references in the loop to be distributed. */
161 static vec<data_reference_p> datarefs_vec;
163 /* Store index of data reference in aux field. */
164 #define DR_INDEX(dr) ((uintptr_t) (dr)->aux)
166 /* Hash table for data dependence relation in the loop to be distributed. */
167 static hash_table<ddr_hasher> *ddrs_table;
169 /* A Reduced Dependence Graph (RDG) vertex representing a statement. */
170 struct rdg_vertex
172 /* The statement represented by this vertex. */
173 gimple *stmt;
175 /* Vector of data-references in this statement. */
176 vec<data_reference_p> datarefs;
178 /* True when the statement contains a write to memory. */
179 bool has_mem_write;
181 /* True when the statement contains a read from memory. */
182 bool has_mem_reads;
185 #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
186 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
187 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
188 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
189 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
190 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
191 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
192 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
194 /* Data dependence type. */
196 enum rdg_dep_type
198 /* Read After Write (RAW). */
199 flow_dd = 'f',
201 /* Control dependence (execute conditional on). */
202 control_dd = 'c'
205 /* Dependence information attached to an edge of the RDG. */
207 struct rdg_edge
209 /* Type of the dependence. */
210 enum rdg_dep_type type;
213 #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type
215 /* Dump vertex I in RDG to FILE. */
217 static void
218 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
220 struct vertex *v = &(rdg->vertices[i]);
221 struct graph_edge *e;
223 fprintf (file, "(vertex %d: (%s%s) (in:", i,
224 RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
225 RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
227 if (v->pred)
228 for (e = v->pred; e; e = e->pred_next)
229 fprintf (file, " %d", e->src);
231 fprintf (file, ") (out:");
233 if (v->succ)
234 for (e = v->succ; e; e = e->succ_next)
235 fprintf (file, " %d", e->dest);
237 fprintf (file, ")\n");
238 print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
239 fprintf (file, ")\n");
242 /* Call dump_rdg_vertex on stderr. */
244 DEBUG_FUNCTION void
245 debug_rdg_vertex (struct graph *rdg, int i)
247 dump_rdg_vertex (stderr, rdg, i);
250 /* Dump the reduced dependence graph RDG to FILE. */
252 static void
253 dump_rdg (FILE *file, struct graph *rdg)
255 fprintf (file, "(rdg\n");
256 for (int i = 0; i < rdg->n_vertices; i++)
257 dump_rdg_vertex (file, rdg, i);
258 fprintf (file, ")\n");
261 /* Call dump_rdg on stderr. */
263 DEBUG_FUNCTION void
264 debug_rdg (struct graph *rdg)
266 dump_rdg (stderr, rdg);
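/* Write the reduced dependence graph RDG to FILE in dot format, printing one
   node per statement and highlighting memory reads and writes.  */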
269 static void
270 dot_rdg_1 (FILE *file, struct graph *rdg)
272 int i;
273 pretty_printer buffer;
274 pp_needs_newline (&buffer) = false;
275 buffer.buffer->stream = file;
277 fprintf (file, "digraph RDG {\n");
279 for (i = 0; i < rdg->n_vertices; i++)
281 struct vertex *v = &(rdg->vertices[i]);
282 struct graph_edge *e;
284 fprintf (file, "%d [label=\"[%d] ", i, i);
285 pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
286 pp_flush (&buffer);
287 fprintf (file, "\"]\n");
289 /* Highlight reads from memory. */
290 if (RDG_MEM_READS_STMT (rdg, i))
291 fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
293 /* Highlight stores to memory. */
294 if (RDG_MEM_WRITE_STMT (rdg, i))
295 fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
297 if (v->succ)
298 for (e = v->succ; e; e = e->succ_next)
299 switch (RDGE_TYPE (e))
301 case flow_dd:
302 /* These are the most common dependences: don't print these. */
303 fprintf (file, "%d -> %d \n", i, e->dest);
304 break;
306 case control_dd:
307 fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
308 break;
310 default:
311 gcc_unreachable ();
315 fprintf (file, "}\n\n");
318 /* Display the Reduced Dependence Graph using dotty. */
320 DEBUG_FUNCTION void
321 dot_rdg (struct graph *rdg)
323 /* When debugging, you may want to enable the following code. */
324 #ifdef HAVE_POPEN
325 FILE *file = popen ("dot -Tx11", "w");
326 if (!file)
327 return;
328 dot_rdg_1 (file, rdg);
329 fflush (file);
330 close (fileno (file));
331 pclose (file);
332 #else
333 dot_rdg_1 (stderr, rdg);
334 #endif
337 /* Returns the index of STMT in RDG. */
339 static int
340 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
342 int index = gimple_uid (stmt);
343 gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
344 return index;
347 /* Creates dependence edges in RDG for all the uses of DEF. IDEF is
348 the index of DEF in RDG. */
350 static void
351 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
353 use_operand_p imm_use_p;
354 imm_use_iterator iterator;
356 FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
358 struct graph_edge *e;
359 int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
361 if (use < 0)
362 continue;
364 e = add_edge (rdg, idef, use);
365 e->data = XNEW (struct rdg_edge);
366 RDGE_TYPE (e) = flow_dd;
370 /* Creates an edge for the control dependences of BB to the vertex V. */
372 static void
373 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
374 int v, control_dependences *cd)
376 bitmap_iterator bi;
377 unsigned edge_n;
378 EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
379 0, edge_n, bi)
381 basic_block cond_bb = cd->get_edge_src (edge_n);
382 gimple *stmt = last_stmt (cond_bb);
383 if (stmt && is_ctrl_stmt (stmt))
385 struct graph_edge *e;
386 int c = rdg_vertex_for_stmt (rdg, stmt);
387 if (c < 0)
388 continue;
390 e = add_edge (rdg, c, v);
391 e->data = XNEW (struct rdg_edge);
392 RDGE_TYPE (e) = control_dd;
397 /* Creates the edges of the reduced dependence graph RDG. */
399 static void
400 create_rdg_flow_edges (struct graph *rdg)
402 int i;
403 def_operand_p def_p;
404 ssa_op_iter iter;
406 for (i = 0; i < rdg->n_vertices; i++)
407 FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
408 iter, SSA_OP_DEF)
409 create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
412 /* Creates the control dependence edges of the reduced dependence graph RDG. */
414 static void
415 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
417 int i;
419 for (i = 0; i < rdg->n_vertices; i++)
421 gimple *stmt = RDG_STMT (rdg, i);
422 if (gimple_code (stmt) == GIMPLE_PHI)
424 edge_iterator ei;
425 edge e;
426 FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
427 if (flow_bb_inside_loop_p (loop, e->src))
428 create_edge_for_control_dependence (rdg, e->src, i, cd);
430 else
431 create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
435 /* Build the vertices of the reduced dependence graph RDG. Return false
436 if that failed. */
438 static bool
439 create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
441 int i;
442 gimple *stmt;
444 FOR_EACH_VEC_ELT (stmts, i, stmt)
446 struct vertex *v = &(rdg->vertices[i]);
448 /* Record statement to vertex mapping. */
449 gimple_set_uid (stmt, i);
451 v->data = XNEW (struct rdg_vertex);
452 RDGV_STMT (v) = stmt;
453 RDGV_DATAREFS (v).create (0);
454 RDGV_HAS_MEM_WRITE (v) = false;
455 RDGV_HAS_MEM_READS (v) = false;
456 if (gimple_code (stmt) == GIMPLE_PHI)
457 continue;
459 unsigned drp = datarefs_vec.length ();
460 if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
461 return false;
462 for (unsigned j = drp; j < datarefs_vec.length (); ++j)
464 data_reference_p dr = datarefs_vec[j];
465 if (DR_IS_READ (dr))
466 RDGV_HAS_MEM_READS (v) = true;
467 else
468 RDGV_HAS_MEM_WRITE (v) = true;
469 RDGV_DATAREFS (v).safe_push (dr);
472 return true;
475 /* Array mapping basic block's index to its topological order. */
476 static int *bb_top_order_index;
477 /* And size of the array. */
478 static int bb_top_order_index_size;
480 /* If X has a smaller topological sort number than Y, returns -1;
481 if greater, returns 1. */
483 static int
484 bb_top_order_cmp (const void *x, const void *y)
486 basic_block bb1 = *(const basic_block *) x;
487 basic_block bb2 = *(const basic_block *) y;
489 gcc_assert (bb1->index < bb_top_order_index_size
490 && bb2->index < bb_top_order_index_size);
491 gcc_assert (bb1 == bb2
492 || bb_top_order_index[bb1->index]
493 != bb_top_order_index[bb2->index]);
495 return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
498 /* Initialize STMTS with all the statements of LOOP. We use topological
499 order to discover all statements. The order is important because
500 generate_loops_for_partition is using the same traversal for identifying
501 statements in loop copies. */
503 static void
504 stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
506 unsigned int i;
507 basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);
509 for (i = 0; i < loop->num_nodes; i++)
511 basic_block bb = bbs[i];
513 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
514 gsi_next (&bsi))
515 if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
516 stmts->safe_push (bsi.phi ());
518 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
519 gsi_next (&bsi))
521 gimple *stmt = gsi_stmt (bsi);
522 if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
523 stmts->safe_push (stmt);
527 free (bbs);
530 /* Free the reduced dependence graph RDG. */
532 static void
533 free_rdg (struct graph *rdg)
535 int i;
537 for (i = 0; i < rdg->n_vertices; i++)
539 struct vertex *v = &(rdg->vertices[i]);
540 struct graph_edge *e;
542 for (e = v->succ; e; e = e->succ_next)
543 free (e->data);
545 if (v->data)
547 gimple_set_uid (RDGV_STMT (v), -1);
548 (RDGV_DATAREFS (v)).release ();
549 free (v->data);
553 free_graph (rdg);
556 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
557 LOOP, and one edge per flow dependence or control dependence from control
558 dependence CD. During visiting each statement, data references are also
559 collected and recorded in global data DATAREFS_VEC. */
561 static struct graph *
562 build_rdg (struct loop *loop, control_dependences *cd)
564 struct graph *rdg;
566 /* Create the RDG vertices from the stmts of the loop nest. */
567 auto_vec<gimple *, 10> stmts;
568 stmts_from_loop (loop, &stmts);
569 rdg = new_graph (stmts.length ());
570 if (!create_rdg_vertices (rdg, stmts, loop))
572 free_rdg (rdg);
573 return NULL;
575 stmts.release ();
577 create_rdg_flow_edges (rdg);
578 if (cd)
579 create_rdg_cd_edges (rdg, cd, loop);
581 return rdg;
585 /* Kind of distributed loop. */
586 enum partition_kind {
587 PKIND_NORMAL,
588   /* Partial memset means a partition can be distributed into a loop
589      of memset calls, rather than a single memset call.  It's handled just
590      like a normal partition, i.e., distributed as a separate loop; no
591      memset call is generated.
593      Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
594      loop nest as deep as possible.  As a result, parloop achieves better
595      parallelization by parallelizing a deeper loop nest.  This hack should
596      be unnecessary and removed once distributed memset can be understood
597      and analyzed in data reference analysis.  See PR82604 for more. */
598 PKIND_PARTIAL_MEMSET,
599 PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
602 /* Type of distributed loop. */
603 enum partition_type {
604   /* The distributed loop can be executed in parallel.  */
605 PTYPE_PARALLEL = 0,
606 /* The distributed loop has to be executed sequentially. */
607 PTYPE_SEQUENTIAL
610 /* Builtin info for loop distribution. */
611 struct builtin_info
613   /* Data references that a kind != PKIND_NORMAL partition is about.  */
614 data_reference_p dst_dr;
615 data_reference_p src_dr;
616   /* Base address and size of memory objects operated on by the builtin.  Note
617 both dest and source memory objects must have the same size. */
618 tree dst_base;
619 tree src_base;
620 tree size;
621 /* Base and offset part of dst_base after stripping constant offset. This
622 is only used in memset builtin distribution for now. */
623 tree dst_base_base;
624 unsigned HOST_WIDE_INT dst_base_offset;
627 /* Partition for loop distribution. */
628 struct partition
630 /* Statements of the partition. */
631 bitmap stmts;
632   /* True if the partition defines a variable which is used outside of the loop.  */
633 bool reduction_p;
634 enum partition_kind kind;
635 enum partition_type type;
636 /* Data references in the partition. */
637 bitmap datarefs;
638   /* Information of builtin partition.  */
639 struct builtin_info *builtin;
643 /* Allocate and initialize a partition.  */
645 static partition *
646 partition_alloc (void)
648 partition *partition = XCNEW (struct partition);
649 partition->stmts = BITMAP_ALLOC (NULL);
650 partition->reduction_p = false;
651 partition->kind = PKIND_NORMAL;
652 partition->datarefs = BITMAP_ALLOC (NULL);
653 return partition;
656 /* Free PARTITION. */
658 static void
659 partition_free (partition *partition)
661 BITMAP_FREE (partition->stmts);
662 BITMAP_FREE (partition->datarefs);
663 if (partition->builtin)
664 free (partition->builtin);
666 free (partition);
669 /* Returns true if the partition can be generated as a builtin. */
671 static bool
672 partition_builtin_p (partition *partition)
674 return partition->kind > PKIND_PARTIAL_MEMSET;
677 /* Returns true if the partition contains a reduction. */
679 static bool
680 partition_reduction_p (partition *partition)
682 return partition->reduction_p;
685 /* Partitions are fused because of different reasons. */
686 enum fuse_type
688 FUSE_NON_BUILTIN = 0,
689 FUSE_REDUCTION = 1,
690 FUSE_SHARE_REF = 2,
691 FUSE_SAME_SCC = 3,
692 FUSE_FINALIZE = 4
695 /* Descriptions of the different fusing reasons.  */
696 static const char *fuse_message[] = {
697 "they are non-builtins",
698 "they have reductions",
699 "they have shared memory refs",
700 "they are in the same dependence scc",
701 "there is no point to distribute loop"};
703 static void
704 update_type_for_merge (struct graph *, partition *, partition *);
706 /* Merge PARTITION into the partition DEST. RDG is the reduced dependence
707 graph and we update type for result partition if it is non-NULL. */
709 static void
710 partition_merge_into (struct graph *rdg, partition *dest,
711 partition *partition, enum fuse_type ft)
713 if (dump_file && (dump_flags & TDF_DETAILS))
715 fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
716 fprintf (dump_file, " Part 1: ");
717 dump_bitmap (dump_file, dest->stmts);
718 fprintf (dump_file, " Part 2: ");
719 dump_bitmap (dump_file, partition->stmts);
722 dest->kind = PKIND_NORMAL;
723 if (dest->type == PTYPE_PARALLEL)
724 dest->type = partition->type;
726 bitmap_ior_into (dest->stmts, partition->stmts);
727 if (partition_reduction_p (partition))
728 dest->reduction_p = true;
730 /* Further check if any data dependence prevents us from executing the
731      new partition in parallel.  */
732 if (dest->type == PTYPE_PARALLEL && rdg != NULL)
733 update_type_for_merge (rdg, dest, partition);
735 bitmap_ior_into (dest->datarefs, partition->datarefs);
739 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
740 the LOOP. */
742 static bool
743 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
745 imm_use_iterator imm_iter;
746 use_operand_p use_p;
748 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
750 if (is_gimple_debug (USE_STMT (use_p)))
751 continue;
753 basic_block use_bb = gimple_bb (USE_STMT (use_p));
754 if (!flow_bb_inside_loop_p (loop, use_bb))
755 return true;
758 return false;
761 /* Returns true when STMT defines a scalar variable used after the
762 loop LOOP. */
764 static bool
765 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
767 def_operand_p def_p;
768 ssa_op_iter op_iter;
770 if (gimple_code (stmt) == GIMPLE_PHI)
771 return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
773 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
774 if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
775 return true;
777 return false;
780 /* Return a copy of LOOP placed before LOOP. */
782 static struct loop *
783 copy_loop_before (struct loop *loop)
785 struct loop *res;
786 edge preheader = loop_preheader_edge (loop);
788 initialize_original_copy_tables ();
789 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
790 gcc_assert (res != NULL);
791 free_original_copy_tables ();
792 delete_update_ssa ();
794 return res;
797 /* Creates an empty basic block after LOOP. */
799 static void
800 create_bb_after_loop (struct loop *loop)
802 edge exit = single_exit (loop);
804 if (!exit)
805 return;
807 split_edge (exit);
810 /* Generate code for PARTITION from the code in LOOP. The loop is
811 copied when COPY_P is true. All the statements not flagged in the
812 PARTITION bitmap are removed from the loop or from its copy. The
813 statements are indexed in sequence inside a basic block, and the
814 basic blocks of a loop are taken in dom order. */
816 static void
817 generate_loops_for_partition (struct loop *loop, partition *partition,
818 bool copy_p)
820 unsigned i;
821 basic_block *bbs;
823 if (copy_p)
825 int orig_loop_num = loop->orig_loop_num;
826 loop = copy_loop_before (loop);
827 gcc_assert (loop != NULL);
828 loop->orig_loop_num = orig_loop_num;
829 create_preheader (loop, CP_SIMPLE_PREHEADERS);
830 create_bb_after_loop (loop);
832 else
834 /* Origin number is set to the new versioned loop's num. */
835 gcc_assert (loop->orig_loop_num != loop->num);
838 /* Remove stmts not in the PARTITION bitmap. */
839 bbs = get_loop_body_in_dom_order (loop);
841 if (MAY_HAVE_DEBUG_BIND_STMTS)
842 for (i = 0; i < loop->num_nodes; i++)
844 basic_block bb = bbs[i];
846 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
847 gsi_next (&bsi))
849 gphi *phi = bsi.phi ();
850 if (!virtual_operand_p (gimple_phi_result (phi))
851 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
852 reset_debug_uses (phi);
855 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
857 gimple *stmt = gsi_stmt (bsi);
858 if (gimple_code (stmt) != GIMPLE_LABEL
859 && !is_gimple_debug (stmt)
860 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
861 reset_debug_uses (stmt);
865 for (i = 0; i < loop->num_nodes; i++)
867 basic_block bb = bbs[i];
868 edge inner_exit = NULL;
870 if (loop != bb->loop_father)
871 inner_exit = single_exit (bb->loop_father);
873 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
875 gphi *phi = bsi.phi ();
876 if (!virtual_operand_p (gimple_phi_result (phi))
877 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
878 remove_phi_node (&bsi, true);
879 else
880 gsi_next (&bsi);
883 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
885 gimple *stmt = gsi_stmt (bsi);
886 if (gimple_code (stmt) != GIMPLE_LABEL
887 && !is_gimple_debug (stmt)
888 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
890 /* In distribution of loop nest, if bb is inner loop's exit_bb,
891 we choose its exit edge/path in order to avoid generating
892 infinite loop. For all other cases, we choose an arbitrary
893 path through the empty CFG part that this unnecessary
894 control stmt controls. */
895 if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
897 if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
898 gimple_cond_make_true (cond_stmt);
899 else
900 gimple_cond_make_false (cond_stmt);
901 update_stmt (stmt);
903 else if (gimple_code (stmt) == GIMPLE_SWITCH)
905 gswitch *switch_stmt = as_a <gswitch *> (stmt);
906 gimple_switch_set_index
907 (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
908 update_stmt (stmt);
910 else
912 unlink_stmt_vdef (stmt);
913 gsi_remove (&bsi, true);
914 release_defs (stmt);
915 continue;
918 gsi_next (&bsi);
922 free (bbs);
925 /* If VAL memory representation contains the same value in all bytes,
926 return that value, otherwise return -1.
927 E.g. for 0x24242424 return 0x24, for IEEE double
928 747708026454360457216.0 return 0x44, etc. */
930 static int
931 const_with_all_bytes_same (tree val)
933 unsigned char buf[64];
934 int i, len;
936 if (integer_zerop (val)
937 || (TREE_CODE (val) == CONSTRUCTOR
938 && !TREE_CLOBBER_P (val)
939 && CONSTRUCTOR_NELTS (val) == 0))
940 return 0;
942 if (real_zerop (val))
944 /* Only return 0 for +0.0, not for -0.0, which doesn't have
945 an all bytes same memory representation. Don't transform
946 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */
947 switch (TREE_CODE (val))
949 case REAL_CST:
950 if (!real_isneg (TREE_REAL_CST_PTR (val)))
951 return 0;
952 break;
953 case COMPLEX_CST:
954 if (!const_with_all_bytes_same (TREE_REALPART (val))
955 && !const_with_all_bytes_same (TREE_IMAGPART (val)))
956 return 0;
957 break;
958 case VECTOR_CST:
960 unsigned int count = vector_cst_encoded_nelts (val);
961 unsigned int j;
962 for (j = 0; j < count; ++j)
963 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
964 break;
965 if (j == count)
966 return 0;
967 break;
969 default:
970 break;
974 if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
975 return -1;
977 len = native_encode_expr (val, buf, sizeof (buf));
978 if (len == 0)
979 return -1;
980 for (i = 1; i < len; i++)
981 if (buf[i] != buf[0])
982 return -1;
983 return buf[0];
986 /* Generate a call to memset for PARTITION in LOOP. */
988 static void
989 generate_memset_builtin (struct loop *loop, partition *partition)
991 gimple_stmt_iterator gsi;
992 tree mem, fn, nb_bytes;
993 tree val;
994 struct builtin_info *builtin = partition->builtin;
995 gimple *fn_call;
997 /* The new statements will be placed before LOOP. */
998 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1000 nb_bytes = builtin->size;
1001 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1002 false, GSI_CONTINUE_LINKING);
1003 mem = builtin->dst_base;
1004 mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1005 false, GSI_CONTINUE_LINKING);
1007 /* This exactly matches the pattern recognition in classify_partition. */
1008 val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1009 /* Handle constants like 0x15151515 and similarly
1010 floating point constants etc. where all bytes are the same. */
1011 int bytev = const_with_all_bytes_same (val);
1012 if (bytev != -1)
1013 val = build_int_cst (integer_type_node, bytev);
1014 else if (TREE_CODE (val) == INTEGER_CST)
1015 val = fold_convert (integer_type_node, val);
1016 else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1018 tree tem = make_ssa_name (integer_type_node);
1019 gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1020 gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1021 val = tem;
1024 fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1025 fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1026 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1028 if (dump_file && (dump_flags & TDF_DETAILS))
1030 fprintf (dump_file, "generated memset");
1031 if (bytev == 0)
1032 fprintf (dump_file, " zero\n");
1033 else
1034 fprintf (dump_file, "\n");
1038 /* Generate a call to memcpy for PARTITION in LOOP. */
1040 static void
1041 generate_memcpy_builtin (struct loop *loop, partition *partition)
1043 gimple_stmt_iterator gsi;
1044 gimple *fn_call;
1045 tree dest, src, fn, nb_bytes;
1046 enum built_in_function kind;
1047 struct builtin_info *builtin = partition->builtin;
1049 /* The new statements will be placed before LOOP. */
1050 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1052 nb_bytes = builtin->size;
1053 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1054 false, GSI_CONTINUE_LINKING);
1055 dest = builtin->dst_base;
1056 src = builtin->src_base;
1057 if (partition->kind == PKIND_MEMCPY
1058 || ! ptr_derefs_may_alias_p (dest, src))
1059 kind = BUILT_IN_MEMCPY;
1060 else
1061 kind = BUILT_IN_MEMMOVE;
1063 dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1064 false, GSI_CONTINUE_LINKING);
1065 src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1066 false, GSI_CONTINUE_LINKING);
1067 fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1068 fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1069 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1071 if (dump_file && (dump_flags & TDF_DETAILS))
1073 if (kind == BUILT_IN_MEMCPY)
1074 fprintf (dump_file, "generated memcpy\n");
1075 else
1076 fprintf (dump_file, "generated memmove\n");
1080 /* Remove and destroy the loop LOOP. */
1082 static void
1083 destroy_loop (struct loop *loop)
1085 unsigned nbbs = loop->num_nodes;
1086 edge exit = single_exit (loop);
1087 basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1088 basic_block *bbs;
1089 unsigned i;
1091 bbs = get_loop_body_in_dom_order (loop);
1093 redirect_edge_pred (exit, src);
1094 exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1095 exit->flags |= EDGE_FALLTHRU;
1096 cancel_loop_tree (loop);
1097 rescan_loop_exit (exit, false, true);
1099 i = nbbs;
1102 /* We have made sure to not leave any dangling uses of SSA
1103 names defined in the loop. With the exception of virtuals.
1104 Make sure we replace all uses of virtual defs that will remain
1105 outside of the loop with the bare symbol as delete_basic_block
1106 will release them. */
1107 --i;
1108 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1109 gsi_next (&gsi))
1111 gphi *phi = gsi.phi ();
1112 if (virtual_operand_p (gimple_phi_result (phi)))
1113 mark_virtual_phi_result_for_renaming (phi);
1115 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
1116 gsi_next (&gsi))
1118 gimple *stmt = gsi_stmt (gsi);
1119 tree vdef = gimple_vdef (stmt);
1120 if (vdef && TREE_CODE (vdef) == SSA_NAME)
1121 mark_virtual_operand_for_renaming (vdef);
1123 delete_basic_block (bbs[i]);
1125 while (i != 0);
1127 free (bbs);
1129 set_immediate_dominator (CDI_DOMINATORS, dest,
1130 recompute_dominator (CDI_DOMINATORS, dest));
1133 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
1135 static bool
1136 generate_code_for_partition (struct loop *loop,
1137 partition *partition, bool copy_p)
1139 switch (partition->kind)
1141 case PKIND_NORMAL:
1142 case PKIND_PARTIAL_MEMSET:
1143 /* Reductions all have to be in the last partition. */
1144 gcc_assert (!partition_reduction_p (partition)
1145 || !copy_p);
1146 generate_loops_for_partition (loop, partition, copy_p);
1147 return false;
1149 case PKIND_MEMSET:
1150 generate_memset_builtin (loop, partition);
1151 break;
1153 case PKIND_MEMCPY:
1154 case PKIND_MEMMOVE:
1155 generate_memcpy_builtin (loop, partition);
1156 break;
1158 default:
1159 gcc_unreachable ();
1162 /* Common tail for partitions we turn into a call. If this was the last
1163 partition for which we generate code, we have to destroy the loop. */
1164 if (!copy_p)
1165 return true;
1166 return false;
1169 /* Return the data dependence relation for data references A and B.  The
1170    two data references must be in lexicographic order with respect to the
1171    reduced dependence graph RDG.  We first try to find the ddr in the global
1172    ddr hash table; if it doesn't exist, compute the ddr and cache it.  */
1174 static data_dependence_relation *
1175 get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
1177 struct data_dependence_relation ent, **slot;
1178 struct data_dependence_relation *ddr;
1180 gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1181 gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1182 <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1183 ent.a = a;
1184 ent.b = b;
1185 slot = ddrs_table->find_slot (&ent, INSERT);
1186 if (*slot == NULL)
1188 ddr = initialize_data_dependence_relation (a, b, loop_nest);
1189 compute_affine_dependence (ddr, loop_nest[0]);
1190 *slot = ddr;
1193 return *slot;
1196 /* In reduced dependence graph RDG for loop distribution, return true if
1197 dependence between references DR1 and DR2 leads to a dependence cycle
1198 and such dependence cycle can't be resolved by runtime alias check. */
1200 static bool
1201 data_dep_in_cycle_p (struct graph *rdg,
1202 data_reference_p dr1, data_reference_p dr2)
1204 struct data_dependence_relation *ddr;
1206 /* Re-shuffle data-refs to be in topological order. */
1207 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1208 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1209 std::swap (dr1, dr2);
1211 ddr = get_data_dependence (rdg, dr1, dr2);
1213 /* In case of no data dependence. */
1214 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1215 return false;
1216   /* For an unknown data dependence, or a known data dependence which can't
1217      be expressed in a classic distance vector, we check if it can be resolved
1218      by a runtime alias check.  If yes, we still consider the dependence as
1219      not introducing a data dependence cycle.  */
1220 else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1221 || DDR_NUM_DIST_VECTS (ddr) == 0)
1222 return !runtime_alias_check_p (ddr, NULL, true);
1223 else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1224 return true;
1225 else if (DDR_REVERSED_P (ddr)
1226 || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1227 return false;
1229 return true;
1232 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
1233 PARTITION1's type after merging PARTITION2 into PARTITION1. */
1235 static void
1236 update_type_for_merge (struct graph *rdg,
1237 partition *partition1, partition *partition2)
1239 unsigned i, j;
1240 bitmap_iterator bi, bj;
1241 data_reference_p dr1, dr2;
1243 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1245 unsigned start = (partition1 == partition2) ? i + 1 : 0;
1247 dr1 = datarefs_vec[i];
1248 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1250 dr2 = datarefs_vec[j];
1251 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1252 continue;
1254 /* Partition can only be executed sequentially if there is any
1255 data dependence cycle. */
1256 if (data_dep_in_cycle_p (rdg, dr1, dr2))
1258 partition1->type = PTYPE_SEQUENTIAL;
1259 return;
1265 /* Returns a partition with all the statements needed for computing
1266 the vertex V of the RDG, also including the loop exit conditions. */
1268 static partition *
1269 build_rdg_partition_for_vertex (struct graph *rdg, int v)
1271 partition *partition = partition_alloc ();
1272 auto_vec<int, 3> nodes;
1273 unsigned i, j;
1274 int x;
1275 data_reference_p dr;
1277 graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1279 FOR_EACH_VEC_ELT (nodes, i, x)
1281 bitmap_set_bit (partition->stmts, x);
1283 for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1285 unsigned idx = (unsigned) DR_INDEX (dr);
1286 gcc_assert (idx < datarefs_vec.length ());
1288 /* Partition can only be executed sequentially if there is any
1289 unknown data reference. */
1290 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1291 || !DR_INIT (dr) || !DR_STEP (dr))
1292 partition->type = PTYPE_SEQUENTIAL;
1294 bitmap_set_bit (partition->datarefs, idx);
1298 if (partition->type == PTYPE_SEQUENTIAL)
1299 return partition;
1301 /* Further check if any data dependence prevents us from executing the
1302      partition in parallel.  */
1303 update_type_for_merge (rdg, partition, partition);
1305 return partition;
1308 /* Given PARTITION of LOOP and RDG, record the single load/store data
1309    references for a builtin partition in SRC_DR/DST_DR; return false if
1310    there are no such data references.  */
1312 static bool
1313 find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
1314 data_reference_p *dst_dr, data_reference_p *src_dr)
1316 unsigned i;
1317 data_reference_p single_ld = NULL, single_st = NULL;
1318 bitmap_iterator bi;
1320 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1322 gimple *stmt = RDG_STMT (rdg, i);
1323 data_reference_p dr;
1325 if (gimple_code (stmt) == GIMPLE_PHI)
1326 continue;
1328 /* Any scalar stmts are ok. */
1329 if (!gimple_vuse (stmt))
1330 continue;
1332 /* Otherwise just regular loads/stores. */
1333 if (!gimple_assign_single_p (stmt))
1334 return false;
1336 /* But exactly one store and/or load. */
1337 for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1339 tree type = TREE_TYPE (DR_REF (dr));
1341 /* The memset, memcpy and memmove library calls are only
1342 able to deal with generic address space. */
1343 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1344 return false;
1346 if (DR_IS_READ (dr))
1348 if (single_ld != NULL)
1349 return false;
1350 single_ld = dr;
1352 else
1354 if (single_st != NULL)
1355 return false;
1356 single_st = dr;
1361 if (!single_st)
1362 return false;
1364 /* Bail out if this is a bitfield memory reference. */
1365 if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1366 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1367 return false;
1369 /* Data reference must be executed exactly once per iteration of each
1370 loop in the loop nest. We only need to check dominance information
1371 against the outermost one in a perfect loop nest because a bb can't
1372 dominate outermost loop's latch without dominating inner loop's. */
1373 basic_block bb_st = gimple_bb (DR_STMT (single_st));
1374 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1375 return false;
1377 if (single_ld)
1379 gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1380 /* Direct aggregate copy or via an SSA name temporary. */
1381 if (load != store
1382 && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1383 return false;
1385 /* Bail out if this is a bitfield memory reference. */
1386 if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1387 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1388 return false;
1390 /* Load and store must be in the same loop nest. */
1391 basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1392 if (bb_st->loop_father != bb_ld->loop_father)
1393 return false;
1395 /* Data reference must be executed exactly once per iteration.
1396 Same as single_st, we only need to check against the outermost
1397 loop. */
1398 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1399 return false;
1401 edge e = single_exit (bb_st->loop_father);
1402 bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1403 bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1404 if (dom_ld != dom_st)
1405 return false;
1408 *src_dr = single_ld;
1409 *dst_dr = single_st;
1410 return true;
1413 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1414    loops from inner to outer to see if the loop's step equals the access size
1415    at each level of the loop nest.  Return 2 if we can prove this at all loop
1416    levels; record access base and size in BASE and SIZE; save the loop's step
1417    at each level in STEPS if it is not null.  For example:
1419 int arr[100][100][100];
1420 for (i = 0; i < 100; i++) ;steps[2] = 40000
1421 for (j = 100; j > 0; j--) ;steps[1] = -400
1422 for (k = 0; k < 100; k++) ;steps[0] = 4
1423 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1425    Return 1 if we can prove the equality at the innermost loop, but not at
1426    all loop levels.  In this case, no information is recorded.
1428    Return 0 if the equality can't be proven at any loop level.  */
1430 static int
1431 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1432 tree *size, vec<tree> *steps = NULL)
1434 location_t loc = gimple_location (DR_STMT (dr));
1435 basic_block bb = gimple_bb (DR_STMT (dr));
1436 struct loop *loop = bb->loop_father;
1437 tree ref = DR_REF (dr);
1438 tree access_base = build_fold_addr_expr (ref);
1439 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1440 int res = 0;
1442 do {
1443 tree scev_fn = analyze_scalar_evolution (loop, access_base);
1444 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1445 return res;
1447 access_base = CHREC_LEFT (scev_fn);
1448 if (tree_contains_chrecs (access_base, NULL))
1449 return res;
1451 tree scev_step = CHREC_RIGHT (scev_fn);
1452 /* Only support constant steps. */
1453 if (TREE_CODE (scev_step) != INTEGER_CST)
1454 return res;
1456 enum ev_direction access_dir = scev_direction (scev_fn);
1457 if (access_dir == EV_DIR_UNKNOWN)
1458 return res;
1460 if (steps != NULL)
1461 steps->safe_push (scev_step);
1463 scev_step = fold_convert_loc (loc, sizetype, scev_step);
1464 /* Compute absolute value of scev step. */
1465 if (access_dir == EV_DIR_DECREASES)
1466 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1468     /* At each loop level, the scev step must equal the access size.  In other
1469 words, DR must access consecutive memory between loop iterations. */
1470 if (!operand_equal_p (scev_step, access_size, 0))
1471 return res;
1473 /* Access stride can be computed for data reference at least for the
1474 innermost loop. */
1475 res = 1;
1477 /* Compute DR's execution times in loop. */
1478 tree niters = number_of_latch_executions (loop);
1479 niters = fold_convert_loc (loc, sizetype, niters);
1480 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1481 niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1483 /* Compute DR's overall access size in loop. */
1484 access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1485 niters, scev_step);
1486 /* Adjust base address in case of negative step. */
1487 if (access_dir == EV_DIR_DECREASES)
1489 tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1490 scev_step, access_size);
1491 access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1493 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1495 *base = access_base;
1496 *size = access_size;
1497   /* Access stride can be computed for the data reference at each loop level.  */
1498 return 2;
1501 /* Allocate and return builtin struct. Record information like DST_DR,
1502 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1504 static struct builtin_info *
1505 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1506 tree dst_base, tree src_base, tree size)
1508 struct builtin_info *builtin = XNEW (struct builtin_info);
1509 builtin->dst_dr = dst_dr;
1510 builtin->src_dr = src_dr;
1511 builtin->dst_base = dst_base;
1512 builtin->src_base = src_base;
1513 builtin->size = size;
1514 return builtin;
1517 /* Given data reference DR in loop nest LOOP, classify whether it forms a
1518    builtin memset call.  */
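/* For illustration (hypothetical names, not taken from this file): a store
   like a[i] = 0 executed once per iteration for i in [0, n) can be classified
   here as PKIND_MEMSET and later emitted by generate_memset_builtin as a
   single memset (a, 0, n * sizeof (a[0])) call.  */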
1520 static void
1521 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1523 gimple *stmt = DR_STMT (dr);
1524 tree base, size, rhs = gimple_assign_rhs1 (stmt);
1526 if (const_with_all_bytes_same (rhs) == -1
1527 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1528 || (TYPE_MODE (TREE_TYPE (rhs))
1529 != TYPE_MODE (unsigned_char_type_node))))
1530 return;
1532 if (TREE_CODE (rhs) == SSA_NAME
1533 && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1534 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1535 return;
1537 int res = compute_access_range (loop, dr, &base, &size);
1538 if (res == 0)
1539 return;
1540 if (res == 1)
1542 partition->kind = PKIND_PARTIAL_MEMSET;
1543 return;
1546 poly_uint64 base_offset;
1547 unsigned HOST_WIDE_INT const_base_offset;
1548 tree base_base = strip_offset (base, &base_offset);
1549 if (!base_offset.is_constant (&const_base_offset))
1550 return;
1552 struct builtin_info *builtin;
1553 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1554 builtin->dst_base_base = base_base;
1555 builtin->dst_base_offset = const_base_offset;
1556 partition->builtin = builtin;
1557 partition->kind = PKIND_MEMSET;
1560 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
1561    classify whether they form a builtin memcpy or memmove call.  */
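/* For illustration (hypothetical names, not taken from this file): a copy
   loop a[i] = b[i] for i in [0, n) is classified as PKIND_MEMCPY when no
   dependence exists between the load and the store, and as PKIND_MEMMOVE
   when a dependence exists but the load does not depend on the store at any
   loop level.  */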
1563 static void
1564 classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
1565 data_reference_p dst_dr, data_reference_p src_dr)
1567 tree base, size, src_base, src_size;
1568 auto_vec<tree> dst_steps, src_steps;
1570 /* Compute access range of both load and store. */
1571 int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1572 if (res != 2)
1573 return;
1574 res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1575 if (res != 2)
1576 return;
1578   /* They must have the same access size.  */
1579 if (!operand_equal_p (size, src_size, 0))
1580 return;
1582   /* Load and store in the loop nest must access memory in the same way, i.e.,
1583      they must have the same steps in each loop of the nest.  */
1584 if (dst_steps.length () != src_steps.length ())
1585 return;
1586 for (unsigned i = 0; i < dst_steps.length (); ++i)
1587 if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1588 return;
1590   /* Now check whether there is a dependence.  */
1591 ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1593 /* Classify as memcpy if no dependence between load and store. */
1594 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1596 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1597 partition->kind = PKIND_MEMCPY;
1598 return;
1601 /* Can't do memmove in case of unknown dependence or dependence without
1602 classical distance vector. */
1603 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1604 || DDR_NUM_DIST_VECTS (ddr) == 0)
1605 return;
1607 unsigned i;
1608 lambda_vector dist_v;
1609 int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1610 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1612 unsigned dep_lev = dependence_level (dist_v, num_lev);
1613 /* Can't do memmove if load depends on store. */
1614 if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1615 return;
1618 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1619 partition->kind = PKIND_MEMMOVE;
1620 return;
1623 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1624 For the moment we detect memset, memcpy and memmove patterns. Bitmap
1625 STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */
1627 static void
1628 classify_partition (loop_p loop, struct graph *rdg, partition *partition,
1629 bitmap stmt_in_all_partitions)
1631 bitmap_iterator bi;
1632 unsigned i;
1633 data_reference_p single_ld = NULL, single_st = NULL;
1634 bool volatiles_p = false, has_reduction = false;
1636 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1638 gimple *stmt = RDG_STMT (rdg, i);
1640 if (gimple_has_volatile_ops (stmt))
1641 volatiles_p = true;
1643       /* If the stmt is not included by all partitions and there are uses
1644          outside of the loop, then mark the partition as a reduction.  */
1645 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1647           /* Due to a limitation in the transform phase we have to fuse all
1648              reduction partitions.  As a result, this could cancel valid
1649              loop distribution, especially for a loop whose induction variable
1650              is used outside of the loop.  To work around this issue, we skip
1651              marking the partition as a reduction if the reduction stmt belongs
1652              to all partitions.  In such a case, the reduction will be computed
1653              correctly no matter how partitions are fused/distributed.  */
1654 if (!bitmap_bit_p (stmt_in_all_partitions, i))
1656 partition->reduction_p = true;
1657 return;
1659 has_reduction = true;
1663 /* Perform general partition disqualification for builtins. */
1664 if (volatiles_p
1665 /* Simple workaround to prevent classifying the partition as builtin
1666 if it contains any use outside of loop. */
1667 || has_reduction
1668 || !flag_tree_loop_distribute_patterns)
1669 return;
1671 /* Find single load/store data references for builtin partition. */
1672 if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1673 return;
1675 /* Classify the builtin kind. */
1676 if (single_ld == NULL)
1677 classify_builtin_st (loop, partition, single_st);
1678 else
1679 classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1682 /* Returns true when PARTITION1 and PARTITION2 access the same memory
1683 object in RDG. */
1685 static bool
1686 share_memory_accesses (struct graph *rdg,
1687 partition *partition1, partition *partition2)
1689 unsigned i, j;
1690 bitmap_iterator bi, bj;
1691 data_reference_p dr1, dr2;
1693 /* First check whether in the intersection of the two partitions are
1694 any loads or stores. Common loads are the situation that happens
1695 most often. */
1696 EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1697 if (RDG_MEM_WRITE_STMT (rdg, i)
1698 || RDG_MEM_READS_STMT (rdg, i))
1699 return true;
1701 /* Then check whether the two partitions access the same memory object. */
1702 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1704 dr1 = datarefs_vec[i];
1706 if (!DR_BASE_ADDRESS (dr1)
1707 || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1708 continue;
1710 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1712 dr2 = datarefs_vec[j];
1714 if (!DR_BASE_ADDRESS (dr2)
1715 || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1716 continue;
1718 if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1719 && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1720 && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1721 && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1722 return true;
1726 return false;
1729 /* For each seed statement in STARTING_STMTS, this function builds a
1730    partition for it by adding dependent statements according to the RDG.
1731    All partitions are recorded in PARTITIONS.  */
1733 static void
1734 rdg_build_partitions (struct graph *rdg,
1735 vec<gimple *> starting_stmts,
1736 vec<partition *> *partitions)
1738 auto_bitmap processed;
1739 int i;
1740 gimple *stmt;
1742 FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1744 int v = rdg_vertex_for_stmt (rdg, stmt);
1746 if (dump_file && (dump_flags & TDF_DETAILS))
1747 fprintf (dump_file,
1748 "ldist asked to generate code for vertex %d\n", v);
1750 /* If the vertex is already contained in another partition so
1751 is the partition rooted at it. */
1752 if (bitmap_bit_p (processed, v))
1753 continue;
1755 partition *partition = build_rdg_partition_for_vertex (rdg, v);
1756 bitmap_ior_into (processed, partition->stmts);
1758 if (dump_file && (dump_flags & TDF_DETAILS))
1760 fprintf (dump_file, "ldist creates useful %s partition:\n",
1761 partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1762 bitmap_print (dump_file, partition->stmts, " ", "\n");
1765 partitions->safe_push (partition);
1768 /* All vertices should have been assigned to at least one partition now,
1769 other than vertices belonging to dead code. */
1772 /* Dump to FILE the PARTITIONS. */
1774 static void
1775 dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1777 int i;
1778 partition *partition;
1780 FOR_EACH_VEC_ELT (partitions, i, partition)
1781 debug_bitmap_file (file, partition->stmts);
1784 /* Debug PARTITIONS. */
1785 extern void debug_rdg_partitions (vec<partition *> );
1787 DEBUG_FUNCTION void
1788 debug_rdg_partitions (vec<partition *> partitions)
1790 dump_rdg_partitions (stderr, partitions);
1793 /* Returns the number of read and write operations in the RDG. */
1795 static int
1796 number_of_rw_in_rdg (struct graph *rdg)
1798 int i, res = 0;
1800 for (i = 0; i < rdg->n_vertices; i++)
1802 if (RDG_MEM_WRITE_STMT (rdg, i))
1803 ++res;
1805 if (RDG_MEM_READS_STMT (rdg, i))
1806 ++res;
1809 return res;
1812 /* Returns the number of read and write operations in a PARTITION of
1813 the RDG. */
1815 static int
1816 number_of_rw_in_partition (struct graph *rdg, partition *partition)
1818 int res = 0;
1819 unsigned i;
1820 bitmap_iterator ii;
1822 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
1824 if (RDG_MEM_WRITE_STMT (rdg, i))
1825 ++res;
1827 if (RDG_MEM_READS_STMT (rdg, i))
1828 ++res;
1831 return res;
1834 /* Returns true when one of the PARTITIONS contains all the read or
1835 write operations of RDG. */
1837 static bool
1838 partition_contains_all_rw (struct graph *rdg,
1839 vec<partition *> partitions)
1841 int i;
1842 partition *partition;
1843 int nrw = number_of_rw_in_rdg (rdg);
1845 FOR_EACH_VEC_ELT (partitions, i, partition)
1846 if (nrw == number_of_rw_in_partition (rdg, partition))
1847 return true;
1849 return false;
1852 /* Compute the partition dependence created by the data references in DRS1
1853    and DRS2, and modify and return DIR according to that.  If ALIAS_DDRS is
1854    not NULL, we record dependences introduced by possible aliasing between
1855    two data references in ALIAS_DDRS; otherwise, we simply ignore such
1856    dependences as if they didn't exist at all.  */
1858 static int
1859 pg_add_dependence_edges (struct graph *rdg, int dir,
1860 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
1862 unsigned i, j;
1863 bitmap_iterator bi, bj;
1864 data_reference_p dr1, dr2, saved_dr1;
1866 /* dependence direction - 0 is no dependence, -1 is back,
1867 1 is forth, 2 is both (we can stop then, merging will occur). */
1868 EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
1870 dr1 = datarefs_vec[i];
1872 EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
1874 int res, this_dir = 1;
1875 ddr_p ddr;
1877 dr2 = datarefs_vec[j];
1879 /* Skip all <read, read> data dependence. */
1880 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1881 continue;
1883 saved_dr1 = dr1;
1884 /* Re-shuffle data-refs to be in topological order. */
1885 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1886 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1888 std::swap (dr1, dr2);
1889 this_dir = -this_dir;
1891 ddr = get_data_dependence (rdg, dr1, dr2);
1892 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1894 this_dir = 0;
1895 res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
1896 DR_BASE_ADDRESS (dr2));
1897           /* Be conservative.  If the data references are not well analyzed,
1898              or the two data references have the same base address and
1899              offset, add the dependence and consider them aliases of each
1900              other.  In other words, the dependence cannot be resolved by
1901              a runtime alias check.  */
1902 if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
1903 || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
1904 || !DR_INIT (dr1) || !DR_INIT (dr2)
1905 || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
1906 || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
1907 || res == 0)
1908 this_dir = 2;
1909 /* The data dependence could be resolved by a runtime alias check;
1910 record it in ALIAS_DDRS. */
1911 else if (alias_ddrs != NULL)
1912 alias_ddrs->safe_push (ddr);
1913 /* Or simply ignore it. */
1915 else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
1917 if (DDR_REVERSED_P (ddr))
1918 this_dir = -this_dir;
1920 /* Known dependences can still be unordered throughout the
1921 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */
1922 if (DDR_NUM_DIST_VECTS (ddr) != 1)
1923 this_dir = 2;
1924 /* If the overlap is exact, preserve stmt order. */
1925 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1927 /* Else, as the distance vector is lexicographically positive, swap
1928 the dependence direction. */
1929 else
1930 this_dir = -this_dir;
1932 else
1933 this_dir = 0;
1934 if (this_dir == 2)
1935 return 2;
1936 else if (dir == 0)
1937 dir = this_dir;
1938 else if (this_dir != 0 && dir != this_dir)
1939 return 2;
1940 /* Shuffle "back" dr1. */
1941 dr1 = saved_dr1;
1944 return dir;
1947 /* Compare postorder number of the partition graph vertices V1 and V2. */
1949 static int
1950 pgcmp (const void *v1_, const void *v2_)
1952 const vertex *v1 = (const vertex *)v1_;
1953 const vertex *v2 = (const vertex *)v2_;
1954 return v2->post - v1->post;
1957 /* Data attached to vertices of partition dependence graph. */
1958 struct pg_vdata
1960 /* ID of the corresponding partition. */
1961 int id;
1962 /* The partition. */
1963 struct partition *partition;
1966 /* Data attached to edges of partition dependence graph. */
1967 struct pg_edata
1969 /* If the dependence edge can be resolved by runtime alias check,
1970 this vector contains data dependence relations for runtime alias
1971 check. On the other hand, if the dependence edge is introduced
1972 because of compilation time known data dependence, this vector
1973 contains nothing. */
1974 vec<ddr_p> alias_ddrs;
1977 /* Callback data for traversing edges in graph. */
1978 struct pg_edge_callback_data
1980 /* Bitmap of strongly connected components that should be merged. */
1981 bitmap sccs_to_merge;
1982 /* Array containing component information for all vertices. */
1983 int *vertices_component;
1984 /* Vector to record all data dependence relations which are needed
1985 to break strongly connected components by runtime alias checks. */
1986 vec<ddr_p> *alias_ddrs;
1989 /* Initialize the vertex data of partition dependence graph PG with
1990 PARTITIONS. */
1992 static void
1993 init_partition_graph_vertices (struct graph *pg,
1994 vec<struct partition *> *partitions)
1996 int i;
1997 partition *partition;
1998 struct pg_vdata *data;
2000 for (i = 0; partitions->iterate (i, &partition); ++i)
2002 data = new pg_vdata;
2003 pg->vertices[i].data = data;
2004 data->id = i;
2005 data->partition = partition;
2009 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
2010 dependence relations to the EDGE if DDRS isn't NULL. */
2012 static void
2013 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2015 struct graph_edge *e = add_edge (pg, i, j);
2017 /* If data dependence relations are attached to the edge, it means this
2018 dependence edge can be resolved by runtime alias checks. */
2019 if (ddrs != NULL)
2021 struct pg_edata *data = new pg_edata;
2023 gcc_assert (ddrs->length () > 0);
2024 e->data = data;
2025 data->alias_ddrs = vNULL;
2026 data->alias_ddrs.safe_splice (*ddrs);
2030 /* Callback function for the graph traversal algorithm. It returns true
2031 if edge E should be skipped when traversing the graph. */
2033 static bool
2034 pg_skip_alias_edge (struct graph_edge *e)
2036 struct pg_edata *data = (struct pg_edata *)e->data;
2037 return (data != NULL && data->alias_ddrs.length () > 0);
2040 /* Callback function freeing data attached to edge E of graph. */
2042 static void
2043 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2045 if (e->data != NULL)
2047 struct pg_edata *data = (struct pg_edata *)e->data;
2048 data->alias_ddrs.release ();
2049 delete data;
2053 /* Free data attached to the vertices of partition dependence graph PG. */
2055 static void
2056 free_partition_graph_vdata (struct graph *pg)
2058 int i;
2059 struct pg_vdata *data;
2061 for (i = 0; i < pg->n_vertices; ++i)
2063 data = (struct pg_vdata *)pg->vertices[i].data;
2064 delete data;
2068 /* Build and return the partition dependence graph for PARTITIONS. RDG is
2069 the reduced dependence graph for the loop to be distributed. If
2070 IGNORE_ALIAS_P is true, data dependences caused by possible aliasing
2071 between references are ignored, as if they don't exist at all;
2072 otherwise all dependences are considered. */
2074 static struct graph *
2075 build_partition_graph (struct graph *rdg,
2076 vec<struct partition *> *partitions,
2077 bool ignore_alias_p)
2079 int i, j;
2080 struct partition *partition1, *partition2;
2081 graph *pg = new_graph (partitions->length ());
2082 auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2084 alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2086 init_partition_graph_vertices (pg, partitions);
2088 for (i = 0; partitions->iterate (i, &partition1); ++i)
2090 for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2092 /* dependence direction - 0 is no dependence, -1 is back,
2093 1 is forth, 2 is both (we can stop then, merging will occur). */
2094 int dir = 0;
2096 /* If the first partition has a reduction, add a back edge; if the
2097 second partition has a reduction, add a forth edge. This makes
2098 sure that the reduction partition will be sorted as the last one. */
2099 if (partition_reduction_p (partition1))
2100 dir = -1;
2101 else if (partition_reduction_p (partition2))
2102 dir = 1;
2104 /* Cleanup the temporary vector. */
2105 alias_ddrs.truncate (0);
2107 dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2108 partition2->datarefs, alias_ddrs_p);
2110 /* Add an edge to the partition graph if there exists a dependence.
2111 There are two types of edges. One type of edge is caused by a
2112 compilation time known dependence; this type cannot be resolved
2113 by a runtime alias check. The other type can be resolved by a
2114 runtime alias check. */
2115 if (dir == 1 || dir == 2
2116 || alias_ddrs.length () > 0)
2118 /* Attach data dependence relations to edge that can be resolved
2119 by runtime alias check. */
2120 bool alias_edge_p = (dir != 1 && dir != 2);
2121 add_partition_graph_edge (pg, i, j,
2122 (alias_edge_p) ? &alias_ddrs : NULL);
2124 if (dir == -1 || dir == 2
2125 || alias_ddrs.length () > 0)
2127 /* Attach data dependence relations to edge that can be resolved
2128 by runtime alias check. */
2129 bool alias_edge_p = (dir != -1 && dir != 2);
2130 add_partition_graph_edge (pg, j, i,
2131 (alias_edge_p) ? &alias_ddrs : NULL);
2135 return pg;
2138 /* Sort partitions in PG in descending post order and store them in
2139 PARTITIONS. */
2141 static void
2142 sort_partitions_by_post_order (struct graph *pg,
2143 vec<struct partition *> *partitions)
2145 int i;
2146 struct pg_vdata *data;
2148 /* Now order the remaining nodes in descending postorder. */
2149 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2150 partitions->truncate (0);
2151 for (i = 0; i < pg->n_vertices; ++i)
2153 data = (struct pg_vdata *)pg->vertices[i].data;
2154 if (data->partition)
2155 partitions->safe_push (data->partition);
2159 /* Given reduced dependence graph RDG, merge the strongly connected
2160 components of PARTITIONS. If IGNORE_ALIAS_P is true, data dependences
2161 caused by possible aliasing between references are ignored, as if they
2162 don't exist at all; otherwise all dependences are considered. */
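/* A hedged illustration (made up, not from the sources) of a dependence
   cycle that ends up fused here:

     for (i = 1; i < n; i++)
       {
         a[i] = b[i - 1] + 1;   // seed of one partition
         b[i] = a[i - 1] + 2;   // seed of another partition
       }

   Each statement reads a value written by the other in an earlier
   iteration, so the two partitions depend on each other in both
   directions.  They form a strongly connected component of the
   partition graph, cannot be split into separate loops, and are
   therefore fused into a single sequential partition.  */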
2164 static void
2165 merge_dep_scc_partitions (struct graph *rdg,
2166 vec<struct partition *> *partitions,
2167 bool ignore_alias_p)
2169 struct partition *partition1, *partition2;
2170 struct pg_vdata *data;
2171 graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2172 int i, j, num_sccs = graphds_scc (pg, NULL);
2174 /* A strongly connected component means a dependence cycle, which we
2175 cannot distribute. So fuse its partitions together. */
2176 if ((unsigned) num_sccs < partitions->length ())
2178 for (i = 0; i < num_sccs; ++i)
2180 for (j = 0; partitions->iterate (j, &partition1); ++j)
2181 if (pg->vertices[j].component == i)
2182 break;
2183 for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2184 if (pg->vertices[j].component == i)
2186 partition_merge_into (NULL, partition1,
2187 partition2, FUSE_SAME_SCC);
2188 partition1->type = PTYPE_SEQUENTIAL;
2189 (*partitions)[j] = NULL;
2190 partition_free (partition2);
2191 data = (struct pg_vdata *)pg->vertices[j].data;
2192 data->partition = NULL;
2197 sort_partitions_by_post_order (pg, partitions);
2198 gcc_assert (partitions->length () == (unsigned)num_sccs);
2199 free_partition_graph_vdata (pg);
2200 free_graph (pg);
2203 /* Callback function for traversing edge E in graph G. DATA is private
2204 callback data. */
2206 static void
2207 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2209 int i, j, component;
2210 struct pg_edge_callback_data *cbdata;
2211 struct pg_edata *edata = (struct pg_edata *) e->data;
2213 /* If the edge doesn't have attached data dependences, it represents
2214 compilation time known dependences. This type of dependence cannot
2215 be resolved by a runtime alias check. */
2216 if (edata == NULL || edata->alias_ddrs.length () == 0)
2217 return;
2219 cbdata = (struct pg_edge_callback_data *) data;
2220 i = e->src;
2221 j = e->dest;
2222 component = cbdata->vertices_component[i];
2223 /* Vertices are topologically sorted according to compilation time
2224 known dependences, so we can break strongly connected components
2225 by removing edges of the opposite direction, i.e., edges pointing
2226 from a vertex with a smaller post number to a vertex with a bigger
2227 post number. */
2228 if (g->vertices[i].post < g->vertices[j].post
2229 /* We only need to remove edges connecting vertices in the same
2230 strongly connected component to break it. */
2231 && component == cbdata->vertices_component[j]
2232 /* Check if we want to break the strong connected component or not. */
2233 && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2234 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2237 /* This is the main function breaking strongly connected components in
2238 PARTITIONS, given reduced dependence graph RDG. Store data dependence
2239 relations for runtime alias checks in ALIAS_DDRS. */
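/* Rough sketch of the situation handled here (an illustration only):

     void
     f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         {
           p[i] = p[i] + 1;     // seed of partition P1
           q[i] = q[i] * 2;     // seed of partition P2
         }
     }

   The only dependences between P1 and P2 come from the possibility that
   p and q overlap, so the corresponding alias edges put both partitions
   into one SCC.  The data dependence relations of those edges are
   collected in ALIAS_DDRS; the loop can then be versioned on a runtime
   check that p and q do not overlap, and the checked copy is free to
   keep P1 and P2 as separate partitions.  */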
2241 static void
2242 break_alias_scc_partitions (struct graph *rdg,
2243 vec<struct partition *> *partitions,
2244 vec<ddr_p> *alias_ddrs)
2246 int i, j, k, num_sccs, num_sccs_no_alias;
2247 /* Build partition dependence graph. */
2248 graph *pg = build_partition_graph (rdg, partitions, false);
2250 alias_ddrs->truncate (0);
2251 /* Find strongly connected components in the graph, with all dependence
2252 edges considered. */
2253 num_sccs = graphds_scc (pg, NULL);
2254 /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2255 compilation time known dependences are merged before this function. */
2256 if ((unsigned) num_sccs < partitions->length ())
2258 struct pg_edge_callback_data cbdata;
2259 auto_bitmap sccs_to_merge;
2260 auto_vec<enum partition_type> scc_types;
2261 struct partition *partition, *first;
2263 /* If all partitions in an SCC have the same type, we can simply merge
2264 the SCC. This loop finds such SCCs and records them in the bitmap. */
2265 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2266 for (i = 0; i < num_sccs; ++i)
2268 for (j = 0; partitions->iterate (j, &first); ++j)
2269 if (pg->vertices[j].component == i)
2270 break;
2271 for (++j; partitions->iterate (j, &partition); ++j)
2273 if (pg->vertices[j].component != i)
2274 continue;
2276 /* Note we merge partitions of parallel type on purpose, though
2277 the resulting partition is sequential. The reason is that the
2278 vectorizer can do a more accurate runtime alias check in this
2279 case. It also results in a more conservative distribution. */
2280 if (first->type != partition->type)
2282 bitmap_clear_bit (sccs_to_merge, i);
2283 break;
2288 /* Initialize callback data for traversing. */
2289 cbdata.sccs_to_merge = sccs_to_merge;
2290 cbdata.alias_ddrs = alias_ddrs;
2291 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2292 /* Record the component information, which will be corrupted by the
2293 next graph SCC finding call. */
2294 for (i = 0; i < pg->n_vertices; ++i)
2295 cbdata.vertices_component[i] = pg->vertices[i].component;
2297 /* Collect data dependences for runtime alias checks to break SCCs. */
2298 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2300 /* Run the SCC finding algorithm again, with alias dependence edges
2301 skipped. This is to topologically sort partitions according to
2302 compilation time known dependences. Note the topological order
2303 is stored in the form of pg's post order numbers. */
2304 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2305 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2306 /* With the topological order, we can construct two subgraphs L and R.
2307 L contains edges <x, y> where x < y in terms of post order, while
2308 R contains edges <x, y> where x > y. Edges for compilation time
2309 known dependences all fall in R, so we break SCCs by removing all
2310 (alias) edges in subgraph L. */
2311 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2314 /* For each SCC that doesn't need to be broken, merge it. */
2315 for (i = 0; i < num_sccs; ++i)
2317 if (!bitmap_bit_p (sccs_to_merge, i))
2318 continue;
2320 for (j = 0; partitions->iterate (j, &first); ++j)
2321 if (cbdata.vertices_component[j] == i)
2322 break;
2323 for (k = j + 1; partitions->iterate (k, &partition); ++k)
2325 struct pg_vdata *data;
2327 if (cbdata.vertices_component[k] != i)
2328 continue;
2330 /* Update postorder number so that merged reduction partition is
2331 sorted after other partitions. */
2332 if (!partition_reduction_p (first)
2333 && partition_reduction_p (partition))
2335 gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
2336 pg->vertices[j].post = pg->vertices[k].post;
2338 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2339 (*partitions)[k] = NULL;
2340 partition_free (partition);
2341 data = (struct pg_vdata *)pg->vertices[k].data;
2342 gcc_assert (data->id == k);
2343 data->partition = NULL;
2344 /* The resulting partition of a merged SCC must be sequential. */
2345 first->type = PTYPE_SEQUENTIAL;
2350 sort_partitions_by_post_order (pg, partitions);
2351 free_partition_graph_vdata (pg);
2352 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2353 free_graph (pg);
2355 if (dump_file && (dump_flags & TDF_DETAILS))
2357 fprintf (dump_file, "Possible alias data dependence to break:\n");
2358 dump_data_dependence_relations (dump_file, *alias_ddrs);
2362 /* Compute and return an expression whose value is the segment length which
2363 will be accessed by DR in NITERS iterations. */
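/* A hedged arithmetic example: for a data reference with a DR_STEP of
   4 bytes and NITERS equal to 16, the expression built below evaluates
   to 4 * (16 - 1) = 60, i.e. the distance in bytes from the first to
   the last access; the size of the access itself is accounted for
   separately when the runtime alias check pairs are built.  */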
2365 static tree
2366 data_ref_segment_size (struct data_reference *dr, tree niters)
2368 niters = size_binop (MINUS_EXPR,
2369 fold_convert (sizetype, niters),
2370 size_one_node);
2371 return size_binop (MULT_EXPR,
2372 fold_convert (sizetype, DR_STEP (dr)),
2373 fold_convert (sizetype, niters));
2376 /* Return true if LOOP's latch is dominated by statement for data reference
2377 DR. */
2379 static inline bool
2380 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2382 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2383 gimple_bb (DR_STMT (dr)));
2386 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2387 data dependence relations ALIAS_DDRS. */
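/* A hedged sketch of what gets recorded: for unknown pointers p and q
   accessed as p[i] and q[i] with 4-byte elements over N iterations, an
   alias ddr between the two references becomes a pair of segments,
   roughly [p, p + 4*N) and [q, q + 4*N), whose non-overlap is tested
   at runtime before the distributed loops may execute.  */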
2389 static void
2390 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2391 vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2393 unsigned int i;
2394 unsigned HOST_WIDE_INT factor = 1;
2395 tree niters_plus_one, niters = number_of_latch_executions (loop);
2397 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2398 niters = fold_convert (sizetype, niters);
2399 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2401 if (dump_file && (dump_flags & TDF_DETAILS))
2402 fprintf (dump_file, "Creating alias check pairs:\n");
2404 /* Iterate all data dependence relations and compute alias check pairs. */
2405 for (i = 0; i < alias_ddrs->length (); i++)
2407 ddr_p ddr = (*alias_ddrs)[i];
2408 struct data_reference *dr_a = DDR_A (ddr);
2409 struct data_reference *dr_b = DDR_B (ddr);
2410 tree seg_length_a, seg_length_b;
2411 int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2412 DR_BASE_ADDRESS (dr_b));
2414 if (comp_res == 0)
2415 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2416 gcc_assert (comp_res != 0);
2418 if (latch_dominated_by_data_ref (loop, dr_a))
2419 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2420 else
2421 seg_length_a = data_ref_segment_size (dr_a, niters);
2423 if (latch_dominated_by_data_ref (loop, dr_b))
2424 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2425 else
2426 seg_length_b = data_ref_segment_size (dr_b, niters);
2428 unsigned HOST_WIDE_INT access_size_a
2429 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2430 unsigned HOST_WIDE_INT access_size_b
2431 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2432 unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2433 unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2435 dr_with_seg_len_pair_t dr_with_seg_len_pair
2436 (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2437 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b));
2439 /* Canonicalize pairs by sorting the two DR members. */
2440 if (comp_res > 0)
2441 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2443 comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2446 if (tree_fits_uhwi_p (niters))
2447 factor = tree_to_uhwi (niters);
2449 /* Prune alias check pairs. */
2450 prune_runtime_alias_test_list (comp_alias_pairs, factor);
2451 if (dump_file && (dump_flags & TDF_DETAILS))
2452 fprintf (dump_file,
2453 "Improved number of alias checks from %d to %d\n",
2454 alias_ddrs->length (), comp_alias_pairs->length ());
2457 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2458 checks and version LOOP under the condition of these runtime alias checks. */
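/* A rough sketch of the control flow produced below, assuming the
   IFN_LOOP_DIST_ALIAS call is later folded to its condition:

     if (<runtime alias checks pass>)
       <LOOP, which is subsequently distributed>
     else
       <versioned copy of LOOP, kept whole with dont_vectorize set>

   Only the copy guarded by a successful check is distributed; the
   fallback copy stays as a single loop.  */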
2460 static void
2461 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2463 profile_probability prob;
2464 basic_block cond_bb;
2465 struct loop *nloop;
2466 tree lhs, arg0, cond_expr = NULL_TREE;
2467 gimple_seq cond_stmts = NULL;
2468 gimple *call_stmt = NULL;
2469 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2471 /* Generate code for runtime alias checks if necessary. */
2472 gcc_assert (alias_ddrs->length () > 0);
2474 if (dump_file && (dump_flags & TDF_DETAILS))
2475 fprintf (dump_file,
2476 "Version loop <%d> with runtime alias check\n", loop->num);
2478 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2479 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2480 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2481 is_gimple_val, NULL_TREE);
2483 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2484 if (flag_tree_loop_vectorize)
2486 /* Generate internal function call for loop distribution alias check. */
2487 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2488 2, NULL_TREE, cond_expr);
2489 lhs = make_ssa_name (boolean_type_node);
2490 gimple_call_set_lhs (call_stmt, lhs);
2492 else
2493 lhs = cond_expr;
2495 prob = profile_probability::guessed_always ().apply_scale (9, 10);
2496 initialize_original_copy_tables ();
2497 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2498 prob, prob.invert (), true);
2499 free_original_copy_tables ();
2500 /* Record the original loop number in newly generated loops. In case of
2501 distribution, the original loop will be distributed and the new loop
2502 is kept. */
2503 loop->orig_loop_num = nloop->num;
2504 nloop->orig_loop_num = nloop->num;
2505 nloop->dont_vectorize = true;
2506 nloop->force_vectorize = false;
2508 if (call_stmt)
2510 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2511 loop could be destroyed. */
2512 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2513 gimple_call_set_arg (call_stmt, 0, arg0);
2514 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2517 if (cond_stmts)
2519 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2520 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2522 update_ssa (TODO_update_ssa);
2525 /* Return true if loop versioning is needed to distribute PARTITIONS.
2526 ALIAS_DDRS are data dependence relations for runtime alias checks. */
2528 static inline bool
2529 version_for_distribution_p (vec<struct partition *> *partitions,
2530 vec<ddr_p> *alias_ddrs)
2532 /* No need to version loop if we have only one partition. */
2533 if (partitions->length () == 1)
2534 return false;
2536 /* Need to version loop if runtime alias check is necessary. */
2537 return (alias_ddrs->length () > 0);
2540 /* Compare base offset of builtin mem* partitions P1 and P2. */
2542 static bool
2543 offset_cmp (struct partition *p1, struct partition *p2)
2545 gcc_assert (p1 != NULL && p1->builtin != NULL);
2546 gcc_assert (p2 != NULL && p2->builtin != NULL);
2547 return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset;
2550 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special
2551 case optimization transforming the code below:
2553 __builtin_memset (&obj, 0, 100);
2554 _1 = &obj + 100;
2555 __builtin_memset (_1, 0, 200);
2556 _2 = &obj + 300;
2557 __builtin_memset (_2, 0, 100);
2559 into:
2561 __builtin_memset (&obj, 0, 400);
2563 Note we don't have dependence information between different partitions
2564 at this point; as a result, we can't handle non-adjacent memset builtin
2565 partitions since dependences might be broken. */
2567 static void
2568 fuse_memset_builtins (vec<struct partition *> *partitions)
2570 unsigned i, j;
2571 struct partition *part1, *part2;
2573 for (i = 0; partitions->iterate (i, &part1);)
2575 if (part1->kind != PKIND_MEMSET)
2577 i++;
2578 continue;
2581 /* Find sub-array of memset builtins of the same base. Index range
2582 of the sub-array is [i, j) with "j > i". */
2583 for (j = i + 1; partitions->iterate (j, &part2); ++j)
2585 if (part2->kind != PKIND_MEMSET
2586 || !operand_equal_p (part1->builtin->dst_base_base,
2587 part2->builtin->dst_base_base, 0))
2588 break;
2591 /* Stable sort is required in order to avoid breaking dependence. */
2592 std::stable_sort (&(*partitions)[i],
2593 &(*partitions)[i] + j - i, offset_cmp);
2594 /* Continue with next partition. */
2595 i = j;
2598 /* Merge all consecutive memset builtin partitions. */
2599 for (i = 0; i < partitions->length () - 1;)
2601 part1 = (*partitions)[i];
2602 if (part1->kind != PKIND_MEMSET)
2604 i++;
2605 continue;
2608 part2 = (*partitions)[i + 1];
2609 /* Only merge memset partitions of the same base and with constant
2610 access sizes. */
2611 if (part2->kind != PKIND_MEMSET
2612 || TREE_CODE (part1->builtin->size) != INTEGER_CST
2613 || TREE_CODE (part2->builtin->size) != INTEGER_CST
2614 || !operand_equal_p (part1->builtin->dst_base_base,
2615 part2->builtin->dst_base_base, 0))
2617 i++;
2618 continue;
2620 tree rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2621 tree rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2622 int bytev1 = const_with_all_bytes_same (rhs1);
2623 int bytev2 = const_with_all_bytes_same (rhs2);
2624 /* Only merge memset partitions of the same value. */
2625 if (bytev1 != bytev2 || bytev1 == -1)
2627 i++;
2628 continue;
2630 wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2631 wi::to_wide (part1->builtin->size));
2632 /* Only merge adjacent memset partitions. */
2633 if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2635 i++;
2636 continue;
2638 /* Merge partitions[i] and partitions[i+1]. */
2639 part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2640 part1->builtin->size,
2641 part2->builtin->size);
2642 partition_free (part2);
2643 partitions->ordered_remove (i + 1);
2647 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
2648 ALIAS_DDRS contains ddrs which need runtime alias check. */
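/* Illustrative reading of the heuristic below (a sketch, not a spec):
   if every partition is a normal one (no builtin was recognized) and
   they all have the same type, the split is judged unprofitable and
   all partitions are fused back into one; similarly when the number of
   partitions reaches NUM_PARTITION_THRESHOLD and normal partitions
   outnumber builtin ones.  */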
2650 static void
2651 finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
2652 vec<ddr_p> *alias_ddrs)
2654 unsigned i;
2655 struct partition *partition, *a;
2657 if (partitions->length () == 1
2658 || alias_ddrs->length () > 0)
2659 return;
2661 unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2662 bool same_type_p = true;
2663 enum partition_type type = ((*partitions)[0])->type;
2664 for (i = 0; partitions->iterate (i, &partition); ++i)
2666 same_type_p &= (type == partition->type);
2667 if (partition_builtin_p (partition))
2669 num_builtin++;
2670 continue;
2672 num_normal++;
2673 if (partition->kind == PKIND_PARTIAL_MEMSET)
2674 num_partial_memset++;
2677 /* Don't distribute the current loop into too many loops given we don't
2678 have a memory stream cost model. Be even more conservative in the case
2679 of loop nest distribution. */
2680 if ((same_type_p && num_builtin == 0
2681 && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2682 || (loop->inner != NULL
2683 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2684 || (loop->inner == NULL
2685 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2687 a = (*partitions)[0];
2688 for (i = 1; partitions->iterate (i, &partition); ++i)
2690 partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2691 partition_free (partition);
2693 partitions->truncate (1);
2696 /* Fuse memset builtins if possible. */
2697 if (partitions->length () > 1)
2698 fuse_memset_builtins (partitions);
2701 /* Distributes the code from LOOP in such a way that producer statements
2702 are placed before consumer statements. Tries to separate only the
2703 statements from STMTS into separate loops. Returns the number of
2704 distributed loops. Set NB_CALLS to number of generated builtin calls.
2705 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2707 static int
2708 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2709 control_dependences *cd, int *nb_calls, bool *destroy_p)
2711 ddrs_table = new hash_table<ddr_hasher> (389);
2712 struct graph *rdg;
2713 partition *partition;
2714 bool any_builtin;
2715 int i, nbp;
2717 *destroy_p = false;
2718 *nb_calls = 0;
2719 loop_nest.create (0);
2720 if (!find_loop_nest (loop, &loop_nest))
2722 loop_nest.release ();
2723 delete ddrs_table;
2724 return 0;
2727 datarefs_vec.create (20);
2728 rdg = build_rdg (loop, cd);
2729 if (!rdg)
2731 if (dump_file && (dump_flags & TDF_DETAILS))
2732 fprintf (dump_file,
2733 "Loop %d not distributed: failed to build the RDG.\n",
2734 loop->num);
2736 loop_nest.release ();
2737 free_data_refs (datarefs_vec);
2738 delete ddrs_table;
2739 return 0;
2742 if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2744 if (dump_file && (dump_flags & TDF_DETAILS))
2745 fprintf (dump_file,
2746 "Loop %d not distributed: too many memory references.\n",
2747 loop->num);
2749 free_rdg (rdg);
2750 loop_nest.release ();
2751 free_data_refs (datarefs_vec);
2752 delete ddrs_table;
2753 return 0;
2756 data_reference_p dref;
2757 for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2758 dref->aux = (void *) (uintptr_t) i;
2760 if (dump_file && (dump_flags & TDF_DETAILS))
2761 dump_rdg (dump_file, rdg);
2763 auto_vec<struct partition *, 3> partitions;
2764 rdg_build_partitions (rdg, stmts, &partitions);
2766 auto_vec<ddr_p> alias_ddrs;
2768 auto_bitmap stmt_in_all_partitions;
2769 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2770 for (i = 1; partitions.iterate (i, &partition); ++i)
2771 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2773 any_builtin = false;
2774 FOR_EACH_VEC_ELT (partitions, i, partition)
2776 classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2777 any_builtin |= partition_builtin_p (partition);
2780 /* If we are only distributing patterns but did not detect any,
2781 simply bail out. */
2782 if (!flag_tree_loop_distribution
2783 && !any_builtin)
2785 nbp = 0;
2786 goto ldist_done;
2789 /* If we are only distributing patterns fuse all partitions that
2790 were not classified as builtins. This also avoids chopping
2791 a loop into pieces, separated by builtin calls. That is, we
2792 only want no or a single loop body remaining. */
2793 struct partition *into;
2794 if (!flag_tree_loop_distribution)
2796 for (i = 0; partitions.iterate (i, &into); ++i)
2797 if (!partition_builtin_p (into))
2798 break;
2799 for (++i; partitions.iterate (i, &partition); ++i)
2800 if (!partition_builtin_p (partition))
2802 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2803 partitions.unordered_remove (i);
2804 partition_free (partition);
2805 i--;
2809 /* Due to limitations in the transform phase we have to fuse all
2810 reduction partitions into the last partition so the existing
2811 loop will contain all loop-closed PHI nodes. */
2812 for (i = 0; partitions.iterate (i, &into); ++i)
2813 if (partition_reduction_p (into))
2814 break;
2815 for (i = i + 1; partitions.iterate (i, &partition); ++i)
2816 if (partition_reduction_p (partition))
2818 partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2819 partitions.unordered_remove (i);
2820 partition_free (partition);
2821 i--;
2824 /* Apply our simple cost model - fuse partitions with similar
2825 memory accesses. */
2826 for (i = 0; partitions.iterate (i, &into); ++i)
2828 bool changed = false;
2829 if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
2830 continue;
2831 for (int j = i + 1;
2832 partitions.iterate (j, &partition); ++j)
2834 if (share_memory_accesses (rdg, into, partition))
2836 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2837 partitions.unordered_remove (j);
2838 partition_free (partition);
2839 j--;
2840 changed = true;
2843 /* Suppose 0 and 2 have similar accesses, and 1 and 2 have similar
2844 accesses, but 0 and 1 do not. If step 1 fused 0 1 2 into 0,2 and 1,
2845 then in the next iteration we would fail to consider merging
2846 1 into 0,2. So try again if we did any merging into 0. */
2847 if (changed)
2848 i--;
2851 /* Build the partition dependence graph and fuse partitions in strongly
2852 connected components. */
2853 if (partitions.length () > 1)
2855 /* Don't support loop nest distribution under runtime alias check
2856 since it's not likely to enable many vectorization opportunities. */
2857 if (loop->inner)
2858 merge_dep_scc_partitions (rdg, &partitions, false);
2859 else
2861 merge_dep_scc_partitions (rdg, &partitions, true);
2862 if (partitions.length () > 1)
2863 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2867 finalize_partitions (loop, &partitions, &alias_ddrs);
2869 nbp = partitions.length ();
2870 if (nbp == 0
2871 || (nbp == 1 && !partition_builtin_p (partitions[0]))
2872 || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2874 nbp = 0;
2875 goto ldist_done;
2878 if (version_for_distribution_p (&partitions, &alias_ddrs))
2879 version_loop_by_alias_check (loop, &alias_ddrs);
2881 if (dump_file && (dump_flags & TDF_DETAILS))
2883 fprintf (dump_file,
2884 "distribute loop <%d> into partitions:\n", loop->num);
2885 dump_rdg_partitions (dump_file, partitions);
2888 FOR_EACH_VEC_ELT (partitions, i, partition)
2890 if (partition_builtin_p (partition))
2891 (*nb_calls)++;
2892 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2895 ldist_done:
2896 loop_nest.release ();
2897 free_data_refs (datarefs_vec);
2898 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2899 iter != ddrs_table->end (); ++iter)
2901 free_dependence_relation (*iter);
2902 *iter = NULL;
2904 delete ddrs_table;
2906 FOR_EACH_VEC_ELT (partitions, i, partition)
2907 partition_free (partition);
2909 free_rdg (rdg);
2910 return nbp - *nb_calls;
2913 /* Distribute all loops in the current function. */
2915 namespace {
2917 const pass_data pass_data_loop_distribution =
2919 GIMPLE_PASS, /* type */
2920 "ldist", /* name */
2921 OPTGROUP_LOOP, /* optinfo_flags */
2922 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2923 ( PROP_cfg | PROP_ssa ), /* properties_required */
2924 0, /* properties_provided */
2925 0, /* properties_destroyed */
2926 0, /* todo_flags_start */
2927 0, /* todo_flags_finish */
2930 class pass_loop_distribution : public gimple_opt_pass
2932 public:
2933 pass_loop_distribution (gcc::context *ctxt)
2934 : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2937 /* opt_pass methods: */
2938 virtual bool gate (function *)
2940 return flag_tree_loop_distribution
2941 || flag_tree_loop_distribute_patterns;
2944 virtual unsigned int execute (function *);
2946 }; // class pass_loop_distribution
2949 /* Given LOOP, this function records seed statements for distribution in
2950 WORK_LIST. Return false if there is nothing for distribution. */
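/* A small made-up illustration of what ends up in WORK_LIST:

     int sum = 0;
     for (int i = 0; i < n; i++)
       {
         a[i] = b[i] * 2;       // store to memory: seeds a partition
         sum += b[i];           // def live after the loop: seeds a partition
       }
     total = sum;               // use outside the loop

   Both the store to a[i] and the statements defining the value of sum
   that is used after the loop become seeds; a statement with side
   effects anywhere in the body makes the whole loop ineligible.  */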
2952 static bool
2953 find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
2955 basic_block *bbs = get_loop_body_in_dom_order (loop);
2957 /* Initialize the worklist with stmts we seed the partitions with. */
2958 for (unsigned i = 0; i < loop->num_nodes; ++i)
2960 for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2961 !gsi_end_p (gsi); gsi_next (&gsi))
2963 gphi *phi = gsi.phi ();
2964 if (virtual_operand_p (gimple_phi_result (phi)))
2965 continue;
2966 /* Distribute stmts which have defs that are used outside of
2967 the loop. */
2968 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
2969 continue;
2970 work_list->safe_push (phi);
2972 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2973 !gsi_end_p (gsi); gsi_next (&gsi))
2975 gimple *stmt = gsi_stmt (gsi);
2977 /* If there is a stmt with side-effects bail out - we
2978 cannot and should not distribute this loop. */
2979 if (gimple_has_side_effects (stmt))
2981 free (bbs);
2982 return false;
2985 /* Distribute stmts which have defs that are used outside of
2986 the loop. */
2987 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
2989 /* Otherwise only distribute stores for now. */
2990 else if (!gimple_vdef (stmt))
2991 continue;
2993 work_list->safe_push (stmt);
2996 free (bbs);
2997 return work_list->length () > 0;
3000 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3001 perfect loop nest. */
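/* For example (a sketch with made-up names), starting from the
   innermost loop of

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         a[i][j] = b[i][j] + c;

   the outer loop contains nothing but the inner loop, so, subject to
   the single-exit, niters and profitability checks below, the outer
   loop would be returned as the root of the nest to distribute.  */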
3003 static struct loop *
3004 prepare_perfect_loop_nest (struct loop *loop)
3006 struct loop *outer = loop_outer (loop);
3007 tree niters = number_of_latch_executions (loop);
3009 /* TODO: We only support the innermost 3-level loop nest distribution
3010 because of compilation time issues for now. This should be relaxed
3011 in the future. Note we only allow 3-level loop nest distribution
3012 when parallelizing loops. */
3013 while ((loop->inner == NULL
3014 || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3015 && loop_outer (outer)
3016 && outer->inner == loop && loop->next == NULL
3017 && single_exit (outer)
3018 && optimize_loop_for_speed_p (outer)
3019 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3020 && (niters = number_of_latch_executions (outer)) != NULL_TREE
3021 && niters != chrec_dont_know)
3023 loop = outer;
3024 outer = loop_outer (loop);
3027 return loop;
3030 unsigned int
3031 pass_loop_distribution::execute (function *fun)
3033 struct loop *loop;
3034 bool changed = false;
3035 basic_block bb;
3036 control_dependences *cd = NULL;
3037 auto_vec<loop_p> loops_to_be_destroyed;
3039 if (number_of_loops (fun) <= 1)
3040 return 0;
3042 /* Compute topological order for basic blocks. Topological order is
3043 needed because data dependence is computed for data references in
3044 lexicographical order. */
3045 if (bb_top_order_index == NULL)
3047 int rpo_num;
3048 int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
3050 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3051 bb_top_order_index_size = last_basic_block_for_fn (cfun);
3052 rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
3053 for (int i = 0; i < rpo_num; i++)
3054 bb_top_order_index[rpo[i]] = i;
3056 free (rpo);
3059 FOR_ALL_BB_FN (bb, fun)
3061 gimple_stmt_iterator gsi;
3062 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3063 gimple_set_uid (gsi_stmt (gsi), -1);
3064 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3065 gimple_set_uid (gsi_stmt (gsi), -1);
3068 /* We can at the moment only distribute non-nested loops, thus restrict
3069 walking to innermost loops. */
3070 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
3072 /* Don't distribute loops with multiple exit edges, or cold loops. */
3073 if (!single_exit (loop)
3074 || !optimize_loop_for_speed_p (loop))
3075 continue;
3077 /* Don't distribute loop if niters is unknown. */
3078 tree niters = number_of_latch_executions (loop);
3079 if (niters == NULL_TREE || niters == chrec_dont_know)
3080 continue;
3082 /* Get the perfect loop nest for distribution. */
3083 loop = prepare_perfect_loop_nest (loop);
3084 for (; loop; loop = loop->inner)
3086 auto_vec<gimple *> work_list;
3087 if (!find_seed_stmts_for_distribution (loop, &work_list))
3088 break;
3090 const char *str = loop->inner ? " nest" : "";
3091 location_t loc = find_loop_location (loop);
3092 if (!cd)
3094 calculate_dominance_info (CDI_DOMINATORS);
3095 calculate_dominance_info (CDI_POST_DOMINATORS);
3096 cd = new control_dependences ();
3097 free_dominance_info (CDI_POST_DOMINATORS);
3100 bool destroy_p;
3101 int nb_generated_loops, nb_generated_calls;
3102 nb_generated_loops = distribute_loop (loop, work_list, cd,
3103 &nb_generated_calls,
3104 &destroy_p);
3105 if (destroy_p)
3106 loops_to_be_destroyed.safe_push (loop);
3108 if (nb_generated_loops + nb_generated_calls > 0)
3110 changed = true;
3111 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3112 loc, "Loop%s %d distributed: split to %d loops "
3113 "and %d library calls.\n", str, loop->num,
3114 nb_generated_loops, nb_generated_calls);
3116 break;
3119 if (dump_file && (dump_flags & TDF_DETAILS))
3120 fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3124 if (cd)
3125 delete cd;
3127 if (bb_top_order_index != NULL)
3129 free (bb_top_order_index);
3130 bb_top_order_index = NULL;
3131 bb_top_order_index_size = 0;
3134 if (changed)
3136 /* Destroy loop bodies that could not be reused. Do this late as we
3137 otherwise can end up referring to stale data in control dependences. */
3138 unsigned i;
3139 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3140 destroy_loop (loop);
3142 /* Cached scalar evolutions now may refer to wrong or non-existing
3143 loops. */
3144 scev_reset_htab ();
3145 mark_virtual_operands_for_renaming (fun);
3146 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3149 checking_verify_loop_structure ();
3151 return changed ? TODO_cleanup_cfg : 0;
3154 } // anon namespace
3156 gimple_opt_pass *
3157 make_pass_loop_distribution (gcc::context *ctxt)
3159 return new pass_loop_distribution (ctxt);