1 /* Loop distribution.
2 Copyright (C) 2006-2021 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 /* This pass performs loop distribution: for example, the loop
24 |DO I = 2, N
25 | A(I) = B(I) + C
26 | D(I) = A(I-1)*E
27 |ENDDO
29 is transformed to
31 |DOALL I = 2, N
32 | A(I) = B(I) + C
33 |ENDDO
35 |DOALL I = 2, N
36 | D(I) = A(I-1)*E
37 |ENDDO
39 Loop distribution is the dual of loop fusion. It separates statements
40 of a loop (or loop nest) into multiple loops (or loop nests) with the
41 same loop header. The major goal is to separate statements which can
42 be vectorized from those that can't. This pass implements distribution
43 in the following steps:
45 1) Seed partitions with specific type statements. For now we support
46 two types of seed statements: statements defining a variable used
47 outside of the loop, and statements storing to memory.
48 2) Build reduced dependence graph (RDG) for loop to be distributed.
49 The vertices (RDG:V) model all statements in the loop and the edges
50 (RDG:E) model flow and control dependencies between statements.
51 3) Apart from RDG, compute data dependencies between memory references.
52 4) Starting from a seed statement, build up the partition by adding
53 dependent statements according to the RDG's dependence information.
54 A partition is classified as parallel type if it can be executed in
55 parallel, or as sequential type if it can't. A parallel type partition
56 is further classified into different builtin kinds if it can be
57 implemented as builtin function calls.
58 5) Build partition dependence graph (PG) based on data dependencies.
59 The vertices (PG:V) model all partitions and the edges (PG:E) model
60 all data dependencies between every pair of partitions. In general,
61 a data dependence is either known or unknown at compilation time. In
62 C family languages, there are quite a lot of compilation time unknown
63 dependencies because of possible aliasing between data references.
64 We categorize PG's edges into two types: "true" edges, which represent
65 compilation time known data dependencies, and "alias" edges for all
66 other data dependencies.
67 6) Traverse a subgraph of PG as if all "alias" edges didn't exist.
68 Merge partitions in each strongly connected component (SCC) accordingly.
69 Build new PG for merged partitions.
70 7) Traverse PG again, this time with both "true" and "alias" edges
71 included. We try to break SCCs by removing some edges. Because
72 SCCs formed by "true" edges were all fused in step 6), we can break
73 the remaining SCCs by removing some "alias" edges. Choosing an optimal
74 edge set is NP-hard; fortunately a simple approximation is good enough
75 for us given the small problem size.
76 8) Collect all data dependencies of the removed "alias" edges. Create
77 runtime alias checks for collected data dependencies.
78 9) Version the loop under the condition of the runtime alias checks.
79 Since loop distribution generally introduces additional overhead, it
80 is only useful if vectorization is achieved in a distributed loop. We
81 version the loop with the internal function call IFN_LOOP_DIST_ALIAS.
82 If no distributed loop can be vectorized, we simply remove the
83 distributed loops and fall back to the original one.
85 TODO:
86 1) We only distribute innermost two-level loop nest now. We should
87 extend it for arbitrary loop nests in the future.
88 2) We only fuse partitions in SCC now. A better fusion algorithm is
89 desired to minimize loop overhead, maximize parallelism and maximize
90 data reuse. */
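/* As an illustrative sketch of the pattern distribution and the runtime
   versioning described above (this example is an assumption for exposition,
   not part of the original sources), a loop such as

     void f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
	 {
	   a[i] = 0;
	   b[i] = b[i] + i;
	 }
     }

   can be split into a PKIND_MEMSET partition for the A stream and a normal
   partition for the B stream, conceptually yielding

     __builtin_memset (a, 0, n * sizeof (int));
     for (int i = 0; i < n; i++)
       b[i] = b[i] + i;

   Because A and B may alias, the dependence between the two partitions is
   only an "alias" edge; steps 7)-9) then guard the distributed loops with
   IFN_LOOP_DIST_ALIAS and a runtime alias check, keeping the original loop
   as the fallback.  */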
92 #include "config.h"
93 #include "system.h"
94 #include "coretypes.h"
95 #include "backend.h"
96 #include "tree.h"
97 #include "gimple.h"
98 #include "cfghooks.h"
99 #include "tree-pass.h"
100 #include "ssa.h"
101 #include "gimple-pretty-print.h"
102 #include "fold-const.h"
103 #include "cfganal.h"
104 #include "gimple-iterator.h"
105 #include "gimplify-me.h"
106 #include "stor-layout.h"
107 #include "tree-cfg.h"
108 #include "tree-ssa-loop-manip.h"
109 #include "tree-ssa-loop-ivopts.h"
110 #include "tree-ssa-loop.h"
111 #include "tree-into-ssa.h"
112 #include "tree-ssa.h"
113 #include "cfgloop.h"
114 #include "tree-scalar-evolution.h"
115 #include "tree-vectorizer.h"
116 #include "tree-eh.h"
117 #include "gimple-fold.h"
118 #include "tree-affine.h"
121 #define MAX_DATAREFS_NUM \
122 ((unsigned) param_loop_max_datarefs_for_datadeps)
124 /* Threshold controlling the number of distributed partitions. Since it
125 may become unnecessary if a memory stream cost model is invented in the
126 future, we define it as a temporary macro rather than a parameter. */
127 #define NUM_PARTITION_THRESHOLD (4)
129 /* Hashtable helpers. */
131 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
133 static inline hashval_t hash (const data_dependence_relation *);
134 static inline bool equal (const data_dependence_relation *,
135 const data_dependence_relation *);
138 /* Hash function for data dependence. */
140 inline hashval_t
141 ddr_hasher::hash (const data_dependence_relation *ddr)
143 inchash::hash h;
144 h.add_ptr (DDR_A (ddr));
145 h.add_ptr (DDR_B (ddr));
146 return h.end ();
149 /* Hash table equality function for data dependence. */
151 inline bool
152 ddr_hasher::equal (const data_dependence_relation *ddr1,
153 const data_dependence_relation *ddr2)
155 return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
160 #define DR_INDEX(dr) ((uintptr_t) (dr)->aux)
162 /* A Reduced Dependence Graph (RDG) vertex representing a statement. */
163 struct rdg_vertex
165 /* The statement represented by this vertex. */
166 gimple *stmt;
168 /* Vector of data-references in this statement. */
169 vec<data_reference_p> datarefs;
171 /* True when the statement contains a write to memory. */
172 bool has_mem_write;
174 /* True when the statement contains a read from memory. */
175 bool has_mem_reads;
178 #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
179 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
180 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
181 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
182 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
183 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
184 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
185 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
187 /* Data dependence type. */
189 enum rdg_dep_type
191 /* Read After Write (RAW). */
192 flow_dd = 'f',
194 /* Control dependence (execute conditional on). */
195 control_dd = 'c'
198 /* Dependence information attached to an edge of the RDG. */
200 struct rdg_edge
202 /* Type of the dependence. */
203 enum rdg_dep_type type;
206 #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type
208 /* Kind of distributed loop. */
209 enum partition_kind {
210 PKIND_NORMAL,
211 /* Partial memset describes a partition that can be distributed into a
212 loop of memset calls, rather than a single memset call. It's handled
213 just like a normal partition, i.e., distributed as a separate loop; no
214 memset call is generated.
216 Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
217 loop nest as deep as possible. As a result, parloop achieves better
218 parallelization by parallelizing the deeper loop nest. This hack should
219 be unnecessary and removed once distributed memset can be understood
220 and analyzed in data reference analysis. See PR82604 for more. */
221 PKIND_PARTIAL_MEMSET,
222 PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
225 /* Type of distributed loop. */
226 enum partition_type {
227 /* The distributed loop can be executed in parallel. */
228 PTYPE_PARALLEL = 0,
229 /* The distributed loop has to be executed sequentially. */
230 PTYPE_SEQUENTIAL
233 /* Builtin info for loop distribution. */
234 struct builtin_info
236 /* data-references a kind != PKIND_NORMAL partition is about. */
237 data_reference_p dst_dr;
238 data_reference_p src_dr;
239 /* Base address and size of memory objects operated by the builtin. Note
240 both dest and source memory objects must have the same size. */
241 tree dst_base;
242 tree src_base;
243 tree size;
244 /* Base and offset part of dst_base after stripping constant offset. This
245 is only used in memset builtin distribution for now. */
246 tree dst_base_base;
247 unsigned HOST_WIDE_INT dst_base_offset;
250 /* Partition for loop distribution. */
251 struct partition
253 /* Statements of the partition. */
254 bitmap stmts;
255 /* True if the partition defines a variable which is used outside of the loop. */
256 bool reduction_p;
257 location_t loc;
258 enum partition_kind kind;
259 enum partition_type type;
260 /* Data references in the partition. */
261 bitmap datarefs;
262 /* Information about the builtin partition. */
263 struct builtin_info *builtin;
266 /* Partitions are fused because of different reasons. */
267 enum fuse_type
269 FUSE_NON_BUILTIN = 0,
270 FUSE_REDUCTION = 1,
271 FUSE_SHARE_REF = 2,
272 FUSE_SAME_SCC = 3,
273 FUSE_FINALIZE = 4
276 /* Descriptions of the different fusing reasons. */
277 static const char *fuse_message[] = {
278 "they are non-builtins",
279 "they have reductions",
280 "they have shared memory refs",
281 "they are in the same dependence scc",
282 "there is no point to distribute loop"};
285 /* Dump vertex I in RDG to FILE. */
287 static void
288 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
290 struct vertex *v = &(rdg->vertices[i]);
291 struct graph_edge *e;
293 fprintf (file, "(vertex %d: (%s%s) (in:", i,
294 RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
295 RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
297 if (v->pred)
298 for (e = v->pred; e; e = e->pred_next)
299 fprintf (file, " %d", e->src);
301 fprintf (file, ") (out:");
303 if (v->succ)
304 for (e = v->succ; e; e = e->succ_next)
305 fprintf (file, " %d", e->dest);
307 fprintf (file, ")\n");
308 print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
309 fprintf (file, ")\n");
312 /* Call dump_rdg_vertex on stderr. */
314 DEBUG_FUNCTION void
315 debug_rdg_vertex (struct graph *rdg, int i)
317 dump_rdg_vertex (stderr, rdg, i);
320 /* Dump the reduced dependence graph RDG to FILE. */
322 static void
323 dump_rdg (FILE *file, struct graph *rdg)
325 fprintf (file, "(rdg\n");
326 for (int i = 0; i < rdg->n_vertices; i++)
327 dump_rdg_vertex (file, rdg, i);
328 fprintf (file, ")\n");
331 /* Call dump_rdg on stderr. */
333 DEBUG_FUNCTION void
334 debug_rdg (struct graph *rdg)
336 dump_rdg (stderr, rdg);
339 static void
340 dot_rdg_1 (FILE *file, struct graph *rdg)
342 int i;
343 pretty_printer buffer;
344 pp_needs_newline (&buffer) = false;
345 buffer.buffer->stream = file;
347 fprintf (file, "digraph RDG {\n");
349 for (i = 0; i < rdg->n_vertices; i++)
351 struct vertex *v = &(rdg->vertices[i]);
352 struct graph_edge *e;
354 fprintf (file, "%d [label=\"[%d] ", i, i);
355 pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
356 pp_flush (&buffer);
357 fprintf (file, "\"]\n");
359 /* Highlight reads from memory. */
360 if (RDG_MEM_READS_STMT (rdg, i))
361 fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
363 /* Highlight stores to memory. */
364 if (RDG_MEM_WRITE_STMT (rdg, i))
365 fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
367 if (v->succ)
368 for (e = v->succ; e; e = e->succ_next)
369 switch (RDGE_TYPE (e))
371 case flow_dd:
372 /* These are the most common dependences: don't print these. */
373 fprintf (file, "%d -> %d \n", i, e->dest);
374 break;
376 case control_dd:
377 fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
378 break;
380 default:
381 gcc_unreachable ();
385 fprintf (file, "}\n\n");
388 /* Display the Reduced Dependence Graph using dotty. */
390 DEBUG_FUNCTION void
391 dot_rdg (struct graph *rdg)
393 /* When debugging, you may want to enable the following code. */
394 #ifdef HAVE_POPEN
395 FILE *file = popen ("dot -Tx11", "w");
396 if (!file)
397 return;
398 dot_rdg_1 (file, rdg);
399 fflush (file);
400 close (fileno (file));
401 pclose (file);
402 #else
403 dot_rdg_1 (stderr, rdg);
404 #endif
407 /* Returns the index of STMT in RDG. */
409 static int
410 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
412 int index = gimple_uid (stmt);
413 gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
414 return index;
417 /* Creates dependence edges in RDG for all the uses of DEF. IDEF is
418 the index of DEF in RDG. */
420 static void
421 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
423 use_operand_p imm_use_p;
424 imm_use_iterator iterator;
426 FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
428 struct graph_edge *e;
429 int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
431 if (use < 0)
432 continue;
434 e = add_edge (rdg, idef, use);
435 e->data = XNEW (struct rdg_edge);
436 RDGE_TYPE (e) = flow_dd;
440 /* Creates an edge for the control dependences of BB to the vertex V. */
442 static void
443 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
444 int v, control_dependences *cd)
446 bitmap_iterator bi;
447 unsigned edge_n;
448 EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
449 0, edge_n, bi)
451 basic_block cond_bb = cd->get_edge_src (edge_n);
452 gimple *stmt = last_stmt (cond_bb);
453 if (stmt && is_ctrl_stmt (stmt))
455 struct graph_edge *e;
456 int c = rdg_vertex_for_stmt (rdg, stmt);
457 if (c < 0)
458 continue;
460 e = add_edge (rdg, c, v);
461 e->data = XNEW (struct rdg_edge);
462 RDGE_TYPE (e) = control_dd;
467 /* Creates the edges of the reduced dependence graph RDG. */
469 static void
470 create_rdg_flow_edges (struct graph *rdg)
472 int i;
473 def_operand_p def_p;
474 ssa_op_iter iter;
476 for (i = 0; i < rdg->n_vertices; i++)
477 FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
478 iter, SSA_OP_DEF)
479 create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
482 /* Creates the edges of the reduced dependence graph RDG. */
484 static void
485 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
487 int i;
489 for (i = 0; i < rdg->n_vertices; i++)
491 gimple *stmt = RDG_STMT (rdg, i);
492 if (gimple_code (stmt) == GIMPLE_PHI)
494 edge_iterator ei;
495 edge e;
496 FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
497 if (flow_bb_inside_loop_p (loop, e->src))
498 create_edge_for_control_dependence (rdg, e->src, i, cd);
500 else
501 create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
506 class loop_distribution
508 private:
509 /* The loop (nest) to be distributed. */
510 vec<loop_p> loop_nest;
512 /* Vector of data references in the loop to be distributed. */
513 vec<data_reference_p> datarefs_vec;
515 /* True if there is a nonaddressable data reference in the above vector. */
516 bool has_nonaddressable_dataref_p;
518 /* Store index of data reference in aux field. */
520 /* Hash table for data dependence relation in the loop to be distributed. */
521 hash_table<ddr_hasher> *ddrs_table;
523 /* Array mapping basic block's index to its topological order. */
524 int *bb_top_order_index;
525 /* And size of the array. */
526 int bb_top_order_index_size;
528 /* Build the vertices of the reduced dependence graph RDG. Return false
529 if that failed. */
530 bool create_rdg_vertices (struct graph *rdg, const vec<gimple *> &stmts,
531 loop_p loop);
533 /* Initialize STMTS with all the statements of LOOP. We use topological
534 order to discover all statements. The order is important because
535 generate_loops_for_partition is using the same traversal for identifying
536 statements in loop copies. */
537 void stmts_from_loop (class loop *loop, vec<gimple *> *stmts);
540 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
541 LOOP, and one edge per flow dependence or control dependence from control
542 dependence CD. During visiting each statement, data references are also
543 collected and recorded in global data DATAREFS_VEC. */
544 struct graph * build_rdg (class loop *loop, control_dependences *cd);
546 /* Merge PARTITION into the partition DEST. RDG is the reduced dependence
547 graph and we update type for result partition if it is non-NULL. */
548 void partition_merge_into (struct graph *rdg,
549 partition *dest, partition *partition,
550 enum fuse_type ft);
553 /* Return the data dependence relation for data references A and B. The
554 two data references must be in lexicographic order with respect to the
555 reduced dependence graph RDG. We first try to find the ddr in the global
556 ddr hash table. If it doesn't exist, compute the ddr and cache it. */
557 data_dependence_relation * get_data_dependence (struct graph *rdg,
558 data_reference_p a,
559 data_reference_p b);
562 /* In reduced dependence graph RDG for loop distribution, return true if
563 dependence between references DR1 and DR2 leads to a dependence cycle
564 and such dependence cycle can't be resolved by runtime alias check. */
565 bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1,
566 data_reference_p dr2);
569 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
570 PARTITION1's type after merging PARTITION2 into PARTITION1. */
571 void update_type_for_merge (struct graph *rdg,
572 partition *partition1, partition *partition2);
575 /* Returns a partition with all the statements needed for computing
576 the vertex V of the RDG, also including the loop exit conditions. */
577 partition *build_rdg_partition_for_vertex (struct graph *rdg, int v);
579 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
580 classify whether they form a builtin memcpy or memmove call. */
581 void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
582 data_reference_p dst_dr, data_reference_p src_dr);
584 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
585 For the moment we detect memset, memcpy and memmove patterns. Bitmap
586 STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.
587 Returns true if there is a reduction in all partitions and we
588 possibly did not mark PARTITION as having one for this reason. */
590 bool
591 classify_partition (loop_p loop,
592 struct graph *rdg, partition *partition,
593 bitmap stmt_in_all_partitions);
596 /* Returns true when PARTITION1 and PARTITION2 access the same memory
597 object in RDG. */
598 bool share_memory_accesses (struct graph *rdg,
599 partition *partition1, partition *partition2);
601 /* For each seed statement in STARTING_STMTS, this function builds
602 a partition for it by adding dependent statements according to RDG.
603 All partitions are recorded in PARTITIONS. */
604 void rdg_build_partitions (struct graph *rdg,
605 vec<gimple *> starting_stmts,
606 vec<partition *> *partitions);
608 /* Compute partition dependence created by the data references in DRS1
609 and DRS2, modify and return DIR according to that. If ALIAS_DDRS is
610 not NULL, we record dependences introduced by possible aliasing between
611 two data references in ALIAS_DDRS; otherwise, we simply ignore such
612 dependences as if they didn't exist at all. */
613 int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1,
614 bitmap drs2, vec<ddr_p> *alias_ddrs);
617 /* Build and return partition dependence graph for PARTITIONS. RDG is
618 reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P
619 is true, data dependence caused by possible alias between references
620 is ignored, as if it didn't exist at all; otherwise all dependences
621 are considered. */
622 struct graph *build_partition_graph (struct graph *rdg,
623 vec<struct partition *> *partitions,
624 bool ignore_alias_p);
626 /* Given reduced dependence graph RDG merge strong connected components
627 of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by
628 possible alias between references is ignored, as if it doesn't exist
629 at all; otherwise all dependences are considered. */
630 void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *>
631 *partitions, bool ignore_alias_p);
633 /* This is the main function breaking strongly connected components in
634 PARTITIONS given the reduced dependence graph RDG. Store data dependence
635 relations for runtime alias check in ALIAS_DDRS. */
636 void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *>
637 *partitions, vec<ddr_p> *alias_ddrs);
640 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
641 ALIAS_DDRS contains ddrs which need runtime alias check. */
642 void finalize_partitions (class loop *loop, vec<struct partition *>
643 *partitions, vec<ddr_p> *alias_ddrs);
645 /* Distributes the code from LOOP in such a way that producer statements
646 are placed before consumer statements. Tries to separate only the
647 statements from STMTS into separate loops. Returns the number of
648 distributed loops. Set NB_CALLS to number of generated builtin calls.
649 Set *DESTROY_P to whether LOOP needs to be destroyed. */
650 int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
651 control_dependences *cd, int *nb_calls, bool *destroy_p,
652 bool only_patterns_p);
654 /* Compute topological order for basic blocks. Topological order is
655 needed because data dependence is computed for data references in
656 lexicographical order. */
657 void bb_top_order_init (void);
659 void bb_top_order_destroy (void);
661 public:
663 /* Getter for bb_top_order. */
665 inline int get_bb_top_order_index_size (void)
667 return bb_top_order_index_size;
670 inline int get_bb_top_order_index (int i)
672 return bb_top_order_index[i];
675 unsigned int execute (function *fun);
679 /* If X has a smaller topological sort number than Y, returns -1;
680 if greater, returns 1. */
681 static int
682 bb_top_order_cmp_r (const void *x, const void *y, void *loop)
684 loop_distribution *_loop =
685 (loop_distribution *) loop;
687 basic_block bb1 = *(const basic_block *) x;
688 basic_block bb2 = *(const basic_block *) y;
690 int bb_top_order_index_size = _loop->get_bb_top_order_index_size ();
692 gcc_assert (bb1->index < bb_top_order_index_size
693 && bb2->index < bb_top_order_index_size);
694 gcc_assert (bb1 == bb2
695 || _loop->get_bb_top_order_index(bb1->index)
696 != _loop->get_bb_top_order_index(bb2->index));
698 return (_loop->get_bb_top_order_index(bb1->index) -
699 _loop->get_bb_top_order_index(bb2->index));
702 bool
703 loop_distribution::create_rdg_vertices (struct graph *rdg,
704 const vec<gimple *> &stmts,
705 loop_p loop)
707 int i;
708 gimple *stmt;
710 FOR_EACH_VEC_ELT (stmts, i, stmt)
712 struct vertex *v = &(rdg->vertices[i]);
714 /* Record statement to vertex mapping. */
715 gimple_set_uid (stmt, i);
717 v->data = XNEW (struct rdg_vertex);
718 RDGV_STMT (v) = stmt;
719 RDGV_DATAREFS (v).create (0);
720 RDGV_HAS_MEM_WRITE (v) = false;
721 RDGV_HAS_MEM_READS (v) = false;
722 if (gimple_code (stmt) == GIMPLE_PHI)
723 continue;
725 unsigned drp = datarefs_vec.length ();
726 if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
727 return false;
728 for (unsigned j = drp; j < datarefs_vec.length (); ++j)
730 data_reference_p dr = datarefs_vec[j];
731 if (DR_IS_READ (dr))
732 RDGV_HAS_MEM_READS (v) = true;
733 else
734 RDGV_HAS_MEM_WRITE (v) = true;
735 RDGV_DATAREFS (v).safe_push (dr);
736 has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
739 return true;
742 void
743 loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts)
745 unsigned int i;
746 basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
748 for (i = 0; i < loop->num_nodes; i++)
750 basic_block bb = bbs[i];
752 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
753 gsi_next (&bsi))
754 if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
755 stmts->safe_push (bsi.phi ());
757 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
758 gsi_next (&bsi))
760 gimple *stmt = gsi_stmt (bsi);
761 if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
762 stmts->safe_push (stmt);
766 free (bbs);
769 /* Free the reduced dependence graph RDG. */
771 static void
772 free_rdg (struct graph *rdg)
774 int i;
776 for (i = 0; i < rdg->n_vertices; i++)
778 struct vertex *v = &(rdg->vertices[i]);
779 struct graph_edge *e;
781 for (e = v->succ; e; e = e->succ_next)
782 free (e->data);
784 if (v->data)
786 gimple_set_uid (RDGV_STMT (v), -1);
787 (RDGV_DATAREFS (v)).release ();
788 free (v->data);
792 free_graph (rdg);
795 struct graph *
796 loop_distribution::build_rdg (class loop *loop, control_dependences *cd)
798 struct graph *rdg;
800 /* Create the RDG vertices from the stmts of the loop nest. */
801 auto_vec<gimple *, 10> stmts;
802 stmts_from_loop (loop, &stmts);
803 rdg = new_graph (stmts.length ());
804 if (!create_rdg_vertices (rdg, stmts, loop))
806 free_rdg (rdg);
807 return NULL;
809 stmts.release ();
811 create_rdg_flow_edges (rdg);
812 if (cd)
813 create_rdg_cd_edges (rdg, cd, loop);
815 return rdg;
819 /* Allocate and initialize a partition from BITMAP. */
821 static partition *
822 partition_alloc (void)
824 partition *partition = XCNEW (struct partition);
825 partition->stmts = BITMAP_ALLOC (NULL);
826 partition->reduction_p = false;
827 partition->loc = UNKNOWN_LOCATION;
828 partition->kind = PKIND_NORMAL;
829 partition->type = PTYPE_PARALLEL;
830 partition->datarefs = BITMAP_ALLOC (NULL);
831 return partition;
834 /* Free PARTITION. */
836 static void
837 partition_free (partition *partition)
839 BITMAP_FREE (partition->stmts);
840 BITMAP_FREE (partition->datarefs);
841 if (partition->builtin)
842 free (partition->builtin);
844 free (partition);
847 /* Returns true if the partition can be generated as a builtin. */
849 static bool
850 partition_builtin_p (partition *partition)
852 return partition->kind > PKIND_PARTIAL_MEMSET;
855 /* Returns true if the partition contains a reduction. */
857 static bool
858 partition_reduction_p (partition *partition)
860 return partition->reduction_p;
863 void
864 loop_distribution::partition_merge_into (struct graph *rdg,
865 partition *dest, partition *partition, enum fuse_type ft)
867 if (dump_file && (dump_flags & TDF_DETAILS))
869 fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
870 fprintf (dump_file, " Part 1: ");
871 dump_bitmap (dump_file, dest->stmts);
872 fprintf (dump_file, " Part 2: ");
873 dump_bitmap (dump_file, partition->stmts);
876 dest->kind = PKIND_NORMAL;
877 if (dest->type == PTYPE_PARALLEL)
878 dest->type = partition->type;
880 bitmap_ior_into (dest->stmts, partition->stmts);
881 if (partition_reduction_p (partition))
882 dest->reduction_p = true;
884 /* Further check if any data dependence prevents us from executing the
885 new partition in parallel. */
886 if (dest->type == PTYPE_PARALLEL && rdg != NULL)
887 update_type_for_merge (rdg, dest, partition);
889 bitmap_ior_into (dest->datarefs, partition->datarefs);
893 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
894 the LOOP. */
896 static bool
897 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
899 imm_use_iterator imm_iter;
900 use_operand_p use_p;
902 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
904 if (is_gimple_debug (USE_STMT (use_p)))
905 continue;
907 basic_block use_bb = gimple_bb (USE_STMT (use_p));
908 if (!flow_bb_inside_loop_p (loop, use_bb))
909 return true;
912 return false;
915 /* Returns true when STMT defines a scalar variable used after the
916 loop LOOP. */
918 static bool
919 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
921 def_operand_p def_p;
922 ssa_op_iter op_iter;
924 if (gimple_code (stmt) == GIMPLE_PHI)
925 return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
927 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
928 if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
929 return true;
931 return false;
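/* For instance (an illustrative sketch, not from the original sources), in

     int sum = 0;
     for (int i = 0; i < n; i++)
       {
	 sum += a[i];
	 b[i] = 0;
       }
     use (sum);

   the statement accumulating SUM defines a scalar used after the loop, so
   the partition containing it gets reduction_p set and is fused into the
   final, non-copied loop, while the B store may still be split off into its
   own (possibly builtin) partition.  */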
934 /* Return a copy of LOOP placed before LOOP. */
936 static class loop *
937 copy_loop_before (class loop *loop)
939 class loop *res;
940 edge preheader = loop_preheader_edge (loop);
942 initialize_original_copy_tables ();
943 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
944 gcc_assert (res != NULL);
945 free_original_copy_tables ();
946 delete_update_ssa ();
948 return res;
951 /* Creates an empty basic block after LOOP. */
953 static void
954 create_bb_after_loop (class loop *loop)
956 edge exit = single_exit (loop);
958 if (!exit)
959 return;
961 split_edge (exit);
964 /* Generate code for PARTITION from the code in LOOP. The loop is
965 copied when COPY_P is true. All the statements not flagged in the
966 PARTITION bitmap are removed from the loop or from its copy. The
967 statements are indexed in sequence inside a basic block, and the
968 basic blocks of a loop are taken in dom order. */
970 static void
971 generate_loops_for_partition (class loop *loop, partition *partition,
972 bool copy_p)
974 unsigned i;
975 basic_block *bbs;
977 if (copy_p)
979 int orig_loop_num = loop->orig_loop_num;
980 loop = copy_loop_before (loop);
981 gcc_assert (loop != NULL);
982 loop->orig_loop_num = orig_loop_num;
983 create_preheader (loop, CP_SIMPLE_PREHEADERS);
984 create_bb_after_loop (loop);
986 else
988 /* Origin number is set to the new versioned loop's num. */
989 gcc_assert (loop->orig_loop_num != loop->num);
992 /* Remove stmts not in the PARTITION bitmap. */
993 bbs = get_loop_body_in_dom_order (loop);
995 if (MAY_HAVE_DEBUG_BIND_STMTS)
996 for (i = 0; i < loop->num_nodes; i++)
998 basic_block bb = bbs[i];
1000 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
1001 gsi_next (&bsi))
1003 gphi *phi = bsi.phi ();
1004 if (!virtual_operand_p (gimple_phi_result (phi))
1005 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
1006 reset_debug_uses (phi);
1009 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
1011 gimple *stmt = gsi_stmt (bsi);
1012 if (gimple_code (stmt) != GIMPLE_LABEL
1013 && !is_gimple_debug (stmt)
1014 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1015 reset_debug_uses (stmt);
1019 for (i = 0; i < loop->num_nodes; i++)
1021 basic_block bb = bbs[i];
1022 edge inner_exit = NULL;
1024 if (loop != bb->loop_father)
1025 inner_exit = single_exit (bb->loop_father);
1027 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
1029 gphi *phi = bsi.phi ();
1030 if (!virtual_operand_p (gimple_phi_result (phi))
1031 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
1032 remove_phi_node (&bsi, true);
1033 else
1034 gsi_next (&bsi);
1037 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
1039 gimple *stmt = gsi_stmt (bsi);
1040 if (gimple_code (stmt) != GIMPLE_LABEL
1041 && !is_gimple_debug (stmt)
1042 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1044 /* In distribution of loop nest, if bb is inner loop's exit_bb,
1045 we choose its exit edge/path in order to avoid generating
1046 infinite loop. For all other cases, we choose an arbitrary
1047 path through the empty CFG part that this unnecessary
1048 control stmt controls. */
1049 if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
1051 if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
1052 gimple_cond_make_true (cond_stmt);
1053 else
1054 gimple_cond_make_false (cond_stmt);
1055 update_stmt (stmt);
1057 else if (gimple_code (stmt) == GIMPLE_SWITCH)
1059 gswitch *switch_stmt = as_a <gswitch *> (stmt);
1060 gimple_switch_set_index
1061 (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
1062 update_stmt (stmt);
1064 else
1066 unlink_stmt_vdef (stmt);
1067 gsi_remove (&bsi, true);
1068 release_defs (stmt);
1069 continue;
1072 gsi_next (&bsi);
1076 free (bbs);
1079 /* If VAL memory representation contains the same value in all bytes,
1080 return that value, otherwise return -1.
1081 E.g. for 0x24242424 return 0x24, for IEEE double
1082 747708026454360457216.0 return 0x44, etc. */
1084 static int
1085 const_with_all_bytes_same (tree val)
1087 unsigned char buf[64];
1088 int i, len;
1090 if (integer_zerop (val)
1091 || (TREE_CODE (val) == CONSTRUCTOR
1092 && !TREE_CLOBBER_P (val)
1093 && CONSTRUCTOR_NELTS (val) == 0))
1094 return 0;
1096 if (real_zerop (val))
1098 /* Only return 0 for +0.0, not for -0.0, which doesn't have
1099 an all bytes same memory representation. Don't transform
1100 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */
1101 switch (TREE_CODE (val))
1103 case REAL_CST:
1104 if (!real_isneg (TREE_REAL_CST_PTR (val)))
1105 return 0;
1106 break;
1107 case COMPLEX_CST:
1108 if (!const_with_all_bytes_same (TREE_REALPART (val))
1109 && !const_with_all_bytes_same (TREE_IMAGPART (val)))
1110 return 0;
1111 break;
1112 case VECTOR_CST:
1114 unsigned int count = vector_cst_encoded_nelts (val);
1115 unsigned int j;
1116 for (j = 0; j < count; ++j)
1117 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
1118 break;
1119 if (j == count)
1120 return 0;
1121 break;
1123 default:
1124 break;
1128 if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
1129 return -1;
1131 len = native_encode_expr (val, buf, sizeof (buf));
1132 if (len == 0)
1133 return -1;
1134 for (i = 1; i < len; i++)
1135 if (buf[i] != buf[0])
1136 return -1;
1137 return buf[0];
1140 /* Generate a call to memset for PARTITION in LOOP. */
1142 static void
1143 generate_memset_builtin (class loop *loop, partition *partition)
1145 gimple_stmt_iterator gsi;
1146 tree mem, fn, nb_bytes;
1147 tree val;
1148 struct builtin_info *builtin = partition->builtin;
1149 gimple *fn_call;
1151 /* The new statements will be placed before LOOP. */
1152 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1154 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1155 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1156 false, GSI_CONTINUE_LINKING);
1157 mem = rewrite_to_non_trapping_overflow (builtin->dst_base);
1158 mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1159 false, GSI_CONTINUE_LINKING);
1161 /* This exactly matches the pattern recognition in classify_partition. */
1162 val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1163 /* Handle constants like 0x15151515 and similarly
1164 floating point constants etc. where all bytes are the same. */
1165 int bytev = const_with_all_bytes_same (val);
1166 if (bytev != -1)
1167 val = build_int_cst (integer_type_node, bytev);
1168 else if (TREE_CODE (val) == INTEGER_CST)
1169 val = fold_convert (integer_type_node, val);
1170 else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1172 tree tem = make_ssa_name (integer_type_node);
1173 gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1174 gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1175 val = tem;
1178 fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1179 fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1180 gimple_set_location (fn_call, partition->loc);
1181 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1182 fold_stmt (&gsi);
1184 if (dump_file && (dump_flags & TDF_DETAILS))
1186 fprintf (dump_file, "generated memset");
1187 if (bytev == 0)
1188 fprintf (dump_file, " zero\n");
1189 else
1190 fprintf (dump_file, "\n");
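/* A small usage sketch (an illustration, not from the original sources):
   for a store loop like

     int *p;
     for (i = 0; i < n; i++)
       p[i] = 0x24242424;

   const_with_all_bytes_same returns 0x24, so the code generated above is
   the equivalent of memset (p, 0x24, n * 4).  */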
1194 /* Generate a call to memcpy for PARTITION in LOOP. */
1196 static void
1197 generate_memcpy_builtin (class loop *loop, partition *partition)
1199 gimple_stmt_iterator gsi;
1200 gimple *fn_call;
1201 tree dest, src, fn, nb_bytes;
1202 enum built_in_function kind;
1203 struct builtin_info *builtin = partition->builtin;
1205 /* The new statements will be placed before LOOP. */
1206 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1208 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1209 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1210 false, GSI_CONTINUE_LINKING);
1211 dest = rewrite_to_non_trapping_overflow (builtin->dst_base);
1212 src = rewrite_to_non_trapping_overflow (builtin->src_base);
1213 if (partition->kind == PKIND_MEMCPY
1214 || ! ptr_derefs_may_alias_p (dest, src))
1215 kind = BUILT_IN_MEMCPY;
1216 else
1217 kind = BUILT_IN_MEMMOVE;
1218 /* Try harder if we're copying a constant size. */
1219 if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes))
1221 aff_tree asrc, adest;
1222 tree_to_aff_combination (src, ptr_type_node, &asrc);
1223 tree_to_aff_combination (dest, ptr_type_node, &adest);
1224 aff_combination_scale (&adest, -1);
1225 aff_combination_add (&asrc, &adest);
1226 if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes),
1227 wi::to_poly_widest (nb_bytes)))
1228 kind = BUILT_IN_MEMCPY;
1231 dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1232 false, GSI_CONTINUE_LINKING);
1233 src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1234 false, GSI_CONTINUE_LINKING);
1235 fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1236 fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1237 gimple_set_location (fn_call, partition->loc);
1238 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1239 fold_stmt (&gsi);
1241 if (dump_file && (dump_flags & TDF_DETAILS))
1243 if (kind == BUILT_IN_MEMCPY)
1244 fprintf (dump_file, "generated memcpy\n");
1245 else
1246 fprintf (dump_file, "generated memmove\n");
1250 /* Remove and destroy the loop LOOP. */
1252 static void
1253 destroy_loop (class loop *loop)
1255 unsigned nbbs = loop->num_nodes;
1256 edge exit = single_exit (loop);
1257 basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1258 basic_block *bbs;
1259 unsigned i;
1261 bbs = get_loop_body_in_dom_order (loop);
1263 gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
1264 bool safe_p = single_pred_p (exit->dest);
1265 for (unsigned i = 0; i < nbbs; ++i)
1267 /* We have made sure to not leave any dangling uses of SSA
1268 names defined in the loop. With the exception of virtuals.
1269 Make sure we replace all uses of virtual defs that will remain
1270 outside of the loop with the bare symbol as delete_basic_block
1271 will release them. */
1272 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1273 gsi_next (&gsi))
1275 gphi *phi = gsi.phi ();
1276 if (virtual_operand_p (gimple_phi_result (phi)))
1277 mark_virtual_phi_result_for_renaming (phi);
1279 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
1281 gimple *stmt = gsi_stmt (gsi);
1282 tree vdef = gimple_vdef (stmt);
1283 if (vdef && TREE_CODE (vdef) == SSA_NAME)
1284 mark_virtual_operand_for_renaming (vdef);
1285 /* Also move and eventually reset debug stmts. We can leave
1286 constant values in place in case the stmt dominates the exit.
1287 ??? Non-constant values from the last iteration can be
1288 replaced with final values if we can compute them. */
1289 if (gimple_debug_bind_p (stmt))
1291 tree val = gimple_debug_bind_get_value (stmt);
1292 gsi_move_before (&gsi, &dst_gsi);
1293 if (val
1294 && (!safe_p
1295 || !is_gimple_min_invariant (val)
1296 || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
1298 gimple_debug_bind_reset_value (stmt);
1299 update_stmt (stmt);
1302 else
1303 gsi_next (&gsi);
1307 redirect_edge_pred (exit, src);
1308 exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1309 exit->flags |= EDGE_FALLTHRU;
1310 cancel_loop_tree (loop);
1311 rescan_loop_exit (exit, false, true);
1313 i = nbbs;
1316 --i;
1317 delete_basic_block (bbs[i]);
1319 while (i != 0);
1321 free (bbs);
1323 set_immediate_dominator (CDI_DOMINATORS, dest,
1324 recompute_dominator (CDI_DOMINATORS, dest));
1327 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
1329 static bool
1330 generate_code_for_partition (class loop *loop,
1331 partition *partition, bool copy_p)
1333 switch (partition->kind)
1335 case PKIND_NORMAL:
1336 case PKIND_PARTIAL_MEMSET:
1337 /* Reductions all have to be in the last partition. */
1338 gcc_assert (!partition_reduction_p (partition)
1339 || !copy_p);
1340 generate_loops_for_partition (loop, partition, copy_p);
1341 return false;
1343 case PKIND_MEMSET:
1344 generate_memset_builtin (loop, partition);
1345 break;
1347 case PKIND_MEMCPY:
1348 case PKIND_MEMMOVE:
1349 generate_memcpy_builtin (loop, partition);
1350 break;
1352 default:
1353 gcc_unreachable ();
1356 /* Common tail for partitions we turn into a call. If this was the last
1357 partition for which we generate code, we have to destroy the loop. */
1358 if (!copy_p)
1359 return true;
1360 return false;
1363 data_dependence_relation *
1364 loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a,
1365 data_reference_p b)
1367 struct data_dependence_relation ent, **slot;
1368 struct data_dependence_relation *ddr;
1370 gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1371 gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1372 <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1373 ent.a = a;
1374 ent.b = b;
1375 slot = ddrs_table->find_slot (&ent, INSERT);
1376 if (*slot == NULL)
1378 ddr = initialize_data_dependence_relation (a, b, loop_nest);
1379 compute_affine_dependence (ddr, loop_nest[0]);
1380 *slot = ddr;
1383 return *slot;
1386 bool
1387 loop_distribution::data_dep_in_cycle_p (struct graph *rdg,
1388 data_reference_p dr1,
1389 data_reference_p dr2)
1391 struct data_dependence_relation *ddr;
1393 /* Re-shuffle data-refs to be in topological order. */
1394 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1395 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1396 std::swap (dr1, dr2);
1398 ddr = get_data_dependence (rdg, dr1, dr2);
1400 /* In case of no data dependence. */
1401 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1402 return false;
1403 /* For an unknown data dependence or a known data dependence which can't
1404 be expressed in a classic distance vector, we check if it can be
1405 resolved by a runtime alias check. If yes, we still consider the data
1406 dependence as not introducing a data dependence cycle. */
1407 else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1408 || DDR_NUM_DIST_VECTS (ddr) == 0)
1409 return !runtime_alias_check_p (ddr, NULL, true);
1410 else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1411 return true;
1412 else if (DDR_REVERSED_P (ddr)
1413 || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1414 return false;
1416 return true;
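/* Illustrative sketch (not from the original sources): for two statements
   in one partition

     a[i] = b[i] + c;
     d[i] = a[i] * e;

   the only dependence on A has a zero distance vector, lambda_vector_zerop
   holds, no cycle is reported and the partition may stay PTYPE_PARALLEL.
   Replace the second statement by d[i] = a[i-1] * e and the distance becomes
   1; the function then returns true and update_type_for_merge demotes the
   merged partition to PTYPE_SEQUENTIAL, which is why the header-comment
   example is distributed into two separate DOALL loops instead.  */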
1419 void
1420 loop_distribution::update_type_for_merge (struct graph *rdg,
1421 partition *partition1,
1422 partition *partition2)
1424 unsigned i, j;
1425 bitmap_iterator bi, bj;
1426 data_reference_p dr1, dr2;
1428 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1430 unsigned start = (partition1 == partition2) ? i + 1 : 0;
1432 dr1 = datarefs_vec[i];
1433 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1435 dr2 = datarefs_vec[j];
1436 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1437 continue;
1439 /* Partition can only be executed sequentially if there is any
1440 data dependence cycle. */
1441 if (data_dep_in_cycle_p (rdg, dr1, dr2))
1443 partition1->type = PTYPE_SEQUENTIAL;
1444 return;
1450 partition *
1451 loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v)
1453 partition *partition = partition_alloc ();
1454 auto_vec<int, 3> nodes;
1455 unsigned i, j;
1456 int x;
1457 data_reference_p dr;
1459 graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1461 FOR_EACH_VEC_ELT (nodes, i, x)
1463 bitmap_set_bit (partition->stmts, x);
1465 for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1467 unsigned idx = (unsigned) DR_INDEX (dr);
1468 gcc_assert (idx < datarefs_vec.length ());
1470 /* Partition can only be executed sequentially if there is any
1471 unknown data reference. */
1472 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1473 || !DR_INIT (dr) || !DR_STEP (dr))
1474 partition->type = PTYPE_SEQUENTIAL;
1476 bitmap_set_bit (partition->datarefs, idx);
1480 if (partition->type == PTYPE_SEQUENTIAL)
1481 return partition;
1483 /* Further check if any data dependence prevents us from executing the
1484 partition in parallel. */
1485 update_type_for_merge (rdg, partition, partition);
1487 return partition;
1490 /* Given PARTITION of LOOP and RDG, record single load/store data references
1491 for a builtin partition in SRC_DR/DST_DR; return false if there are no
1492 such data references. */
1494 static bool
1495 find_single_drs (class loop *loop, struct graph *rdg, partition *partition,
1496 data_reference_p *dst_dr, data_reference_p *src_dr)
1498 unsigned i;
1499 data_reference_p single_ld = NULL, single_st = NULL;
1500 bitmap_iterator bi;
1502 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1504 gimple *stmt = RDG_STMT (rdg, i);
1505 data_reference_p dr;
1507 if (gimple_code (stmt) == GIMPLE_PHI)
1508 continue;
1510 /* Any scalar stmts are ok. */
1511 if (!gimple_vuse (stmt))
1512 continue;
1514 /* Otherwise just regular loads/stores. */
1515 if (!gimple_assign_single_p (stmt))
1516 return false;
1518 /* But exactly one store and/or load. */
1519 for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1521 tree type = TREE_TYPE (DR_REF (dr));
1523 /* The memset, memcpy and memmove library calls are only
1524 able to deal with generic address space. */
1525 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1526 return false;
1528 if (DR_IS_READ (dr))
1530 if (single_ld != NULL)
1531 return false;
1532 single_ld = dr;
1534 else
1536 if (single_st != NULL)
1537 return false;
1538 single_st = dr;
1543 if (!single_st)
1544 return false;
1546 /* Bail out if this is a bitfield memory reference. */
1547 if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1548 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1549 return false;
1551 /* Data reference must be executed exactly once per iteration of each
1552 loop in the loop nest. We only need to check dominance information
1553 against the outermost one in a perfect loop nest because a bb can't
1554 dominate outermost loop's latch without dominating inner loop's. */
1555 basic_block bb_st = gimple_bb (DR_STMT (single_st));
1556 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1557 return false;
1559 if (single_ld)
1561 gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1562 /* Direct aggregate copy or via an SSA name temporary. */
1563 if (load != store
1564 && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1565 return false;
1567 /* Bail out if this is a bitfield memory reference. */
1568 if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1569 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1570 return false;
1572 /* Load and store must be in the same loop nest. */
1573 basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1574 if (bb_st->loop_father != bb_ld->loop_father)
1575 return false;
1577 /* Data reference must be executed exactly once per iteration.
1578 Same as single_st, we only need to check against the outermost
1579 loop. */
1580 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1581 return false;
1583 edge e = single_exit (bb_st->loop_father);
1584 bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1585 bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1586 if (dom_ld != dom_st)
1587 return false;
1590 *src_dr = single_ld;
1591 *dst_dr = single_st;
1592 return true;
1595 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1596 loops from inner to outer to see if the loop's step equals the access size
1597 at each level of the loop nest. Return 2 if we can prove this at all loop
1598 levels; record the access base and size in BASE and SIZE; save the loop's
1599 step at each level in STEPS if it is not null. For example:
1601 int arr[100][100][100];
1602 for (i = 0; i < 100; i++) ;steps[2] = 40000
1603 for (j = 100; j > 0; j--) ;steps[1] = -400
1604 for (k = 0; k < 100; k++) ;steps[0] = 4
1605 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1607 Return 1 if we can prove the equality for the innermost loop, but not
1608 for all loop levels. In this case, no information is recorded.
1610 Return 0 if the equality can't be proven at any loop level. */
1612 static int
1613 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1614 tree *size, vec<tree> *steps = NULL)
1616 location_t loc = gimple_location (DR_STMT (dr));
1617 basic_block bb = gimple_bb (DR_STMT (dr));
1618 class loop *loop = bb->loop_father;
1619 tree ref = DR_REF (dr);
1620 tree access_base = build_fold_addr_expr (ref);
1621 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1622 int res = 0;
1624 do {
1625 tree scev_fn = analyze_scalar_evolution (loop, access_base);
1626 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1627 return res;
1629 access_base = CHREC_LEFT (scev_fn);
1630 if (tree_contains_chrecs (access_base, NULL))
1631 return res;
1633 tree scev_step = CHREC_RIGHT (scev_fn);
1634 /* Only support constant steps. */
1635 if (TREE_CODE (scev_step) != INTEGER_CST)
1636 return res;
1638 enum ev_direction access_dir = scev_direction (scev_fn);
1639 if (access_dir == EV_DIR_UNKNOWN)
1640 return res;
1642 if (steps != NULL)
1643 steps->safe_push (scev_step);
1645 scev_step = fold_convert_loc (loc, sizetype, scev_step);
1646 /* Compute absolute value of scev step. */
1647 if (access_dir == EV_DIR_DECREASES)
1648 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1650 /* At each loop level, the scev step must equal the access size. In other
1651 words, DR must access consecutive memory between loop iterations. */
1652 if (!operand_equal_p (scev_step, access_size, 0))
1653 return res;
1655 /* Access stride can be computed for data reference at least for the
1656 innermost loop. */
1657 res = 1;
1659 /* Compute DR's execution times in loop. */
1660 tree niters = number_of_latch_executions (loop);
1661 niters = fold_convert_loc (loc, sizetype, niters);
1662 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1663 niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1665 /* Compute DR's overall access size in loop. */
1666 access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1667 niters, scev_step);
1668 /* Adjust base address in case of negative step. */
1669 if (access_dir == EV_DIR_DECREASES)
1671 tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1672 scev_step, access_size);
1673 access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1675 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1677 *base = access_base;
1678 *size = access_size;
1679 /* The access stride can be computed for the data reference at each loop level. */
1680 return 2;
1683 /* Allocate and return builtin struct. Record information like DST_DR,
1684 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1686 static struct builtin_info *
1687 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1688 tree dst_base, tree src_base, tree size)
1690 struct builtin_info *builtin = XNEW (struct builtin_info);
1691 builtin->dst_dr = dst_dr;
1692 builtin->src_dr = src_dr;
1693 builtin->dst_base = dst_base;
1694 builtin->src_base = src_base;
1695 builtin->size = size;
1696 return builtin;
1699 /* Given data reference DR in loop nest LOOP, classify whether it forms a
1700 builtin memset call. */
1702 static void
1703 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1705 gimple *stmt = DR_STMT (dr);
1706 tree base, size, rhs = gimple_assign_rhs1 (stmt);
1708 if (const_with_all_bytes_same (rhs) == -1
1709 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1710 || (TYPE_MODE (TREE_TYPE (rhs))
1711 != TYPE_MODE (unsigned_char_type_node))))
1712 return;
1714 if (TREE_CODE (rhs) == SSA_NAME
1715 && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1716 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1717 return;
1719 int res = compute_access_range (loop, dr, &base, &size);
1720 if (res == 0)
1721 return;
1722 if (res == 1)
1724 partition->kind = PKIND_PARTIAL_MEMSET;
1725 return;
1728 poly_uint64 base_offset;
1729 unsigned HOST_WIDE_INT const_base_offset;
1730 tree base_base = strip_offset (base, &base_offset);
1731 if (!base_offset.is_constant (&const_base_offset))
1732 return;
1734 struct builtin_info *builtin;
1735 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1736 builtin->dst_base_base = base_base;
1737 builtin->dst_base_offset = const_base_offset;
1738 partition->builtin = builtin;
1739 partition->kind = PKIND_MEMSET;
1742 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
1743 classify whether they form a builtin memcpy or memmove call. */
1745 void
1746 loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg,
1747 partition *partition,
1748 data_reference_p dst_dr,
1749 data_reference_p src_dr)
1751 tree base, size, src_base, src_size;
1752 auto_vec<tree> dst_steps, src_steps;
1754 /* Compute access range of both load and store. */
1755 int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1756 if (res != 2)
1757 return;
1758 res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1759 if (res != 2)
1760 return;
1762 /* They must have the same access size. */
1763 if (!operand_equal_p (size, src_size, 0))
1764 return;
1766 /* The load and store in the loop nest must access memory in the same way,
1767 i.e., they must have the same steps in each loop of the nest. */
1768 if (dst_steps.length () != src_steps.length ())
1769 return;
1770 for (unsigned i = 0; i < dst_steps.length (); ++i)
1771 if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1772 return;
1774 /* Now check whether there is a dependence. */
1775 ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1777 /* Classify as memmove if no dependence between load and store. */
1778 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1780 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1781 partition->kind = PKIND_MEMMOVE;
1782 return;
1785 /* Can't do memmove in case of unknown dependence or dependence without
1786 classical distance vector. */
1787 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1788 || DDR_NUM_DIST_VECTS (ddr) == 0)
1789 return;
1791 unsigned i;
1792 lambda_vector dist_v;
1793 int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1794 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1796 unsigned dep_lev = dependence_level (dist_v, num_lev);
1797 /* Can't do memmove if load depends on store. */
1798 if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1799 return;
1802 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1803 partition->kind = PKIND_MEMMOVE;
1804 return;
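/* Illustrative examples of the classification above (not from the original
   sources): with distinct arrays

     for (i = 0; i < n; i++)
       a[i] = b[i];

   there is no dependence between the load and the store, the partition
   becomes PKIND_MEMMOVE, and generate_memcpy_builtin can later upgrade it
   to memcpy since the regions can't overlap.  A shift like a[i] = a[i + 1]
   is still fine for memmove, whereas a[i] = a[i - 1], where the load reads a
   value stored by an earlier iteration, is rejected by the distance vector
   check above.  */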
1807 bool
1808 loop_distribution::classify_partition (loop_p loop,
1809 struct graph *rdg, partition *partition,
1810 bitmap stmt_in_all_partitions)
1812 bitmap_iterator bi;
1813 unsigned i;
1814 data_reference_p single_ld = NULL, single_st = NULL;
1815 bool volatiles_p = false, has_reduction = false;
1817 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1819 gimple *stmt = RDG_STMT (rdg, i);
1821 if (gimple_has_volatile_ops (stmt))
1822 volatiles_p = true;
1824 /* If the stmt is not included in all partitions and has uses outside
1825 of the loop, then mark the partition as a reduction. */
1826 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1828 /* Due to a limitation in the transform phase we have to fuse all
1829 reduction partitions. As a result, this could cancel valid
1830 loop distribution, especially for loops whose induction variable
1831 is used outside of the loop. To work around this issue, we skip
1832 marking the partition as a reduction if the reduction stmt belongs
1833 to all partitions. In that case, the reduction will be computed
1834 correctly no matter how partitions are fused/distributed. */
1835 if (!bitmap_bit_p (stmt_in_all_partitions, i))
1836 partition->reduction_p = true;
1837 else
1838 has_reduction = true;
1842 /* Simple workaround to prevent classifying the partition as builtin
1843 if it contains any use outside of loop. For the case where all
1844 partitions have the reduction this simple workaround is delayed
1845 to only affect the last partition. */
1846 if (partition->reduction_p)
1847 return has_reduction;
1849 /* Perform general partition disqualification for builtins. */
1850 if (volatiles_p
1851 || !flag_tree_loop_distribute_patterns)
1852 return has_reduction;
1854 /* Find single load/store data references for builtin partition. */
1855 if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1856 return has_reduction;
1858 partition->loc = gimple_location (DR_STMT (single_st));
1860 /* Classify the builtin kind. */
1861 if (single_ld == NULL)
1862 classify_builtin_st (loop, partition, single_st);
1863 else
1864 classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1865 return has_reduction;
1868 bool
1869 loop_distribution::share_memory_accesses (struct graph *rdg,
1870 partition *partition1, partition *partition2)
1872 unsigned i, j;
1873 bitmap_iterator bi, bj;
1874 data_reference_p dr1, dr2;
1876 /* First check whether the intersection of the two partitions contains
1877 any loads or stores. Common loads are the situation that happens
1878 most often. */
1879 EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1880 if (RDG_MEM_WRITE_STMT (rdg, i)
1881 || RDG_MEM_READS_STMT (rdg, i))
1882 return true;
1884 /* Then check whether the two partitions access the same memory object. */
1885 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1887 dr1 = datarefs_vec[i];
1889 if (!DR_BASE_ADDRESS (dr1)
1890 || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1891 continue;
1893 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1895 dr2 = datarefs_vec[j];
1897 if (!DR_BASE_ADDRESS (dr2)
1898 || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1899 continue;
1901 if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1902 && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1903 && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1904 && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1905 return true;
1909 return false;
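/* For example (array A is hypothetical), a partition storing to A[i] and
   another partition loading that same A[i] reference a common memory
   object with identical base address, offset, init and step, so this
   function returns true and the simple cost model in distribute_loop
   fuses the two partitions instead of streaming A twice.  */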
1912 /* For each seed statement in STARTING_STMTS, this function builds a
1913 partition for it by adding dependent statements according to RDG.
1914 All partitions are recorded in PARTITIONS. */
1916 void
1917 loop_distribution::rdg_build_partitions (struct graph *rdg,
1918 vec<gimple *> starting_stmts,
1919 vec<partition *> *partitions)
1921 auto_bitmap processed;
1922 int i;
1923 gimple *stmt;
1925 FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1927 int v = rdg_vertex_for_stmt (rdg, stmt);
1929 if (dump_file && (dump_flags & TDF_DETAILS))
1930 fprintf (dump_file,
1931 "ldist asked to generate code for vertex %d\n", v);
1933 /* If the vertex is already contained in another partition, so
1934 is the partition rooted at it. */
1935 if (bitmap_bit_p (processed, v))
1936 continue;
1938 partition *partition = build_rdg_partition_for_vertex (rdg, v);
1939 bitmap_ior_into (processed, partition->stmts);
1941 if (dump_file && (dump_flags & TDF_DETAILS))
1943 fprintf (dump_file, "ldist creates useful %s partition:\n",
1944 partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1945 bitmap_print (dump_file, partition->stmts, " ", "\n");
1948 partitions->safe_push (partition);
1951 /* All vertices should have been assigned to at least one partition now,
1952 other than vertices belonging to dead code. */
1955 /* Dump to FILE the PARTITIONS. */
1957 static void
1958 dump_rdg_partitions (FILE *file, const vec<partition *> &partitions)
1960 int i;
1961 partition *partition;
1963 FOR_EACH_VEC_ELT (partitions, i, partition)
1964 debug_bitmap_file (file, partition->stmts);
1967 /* Debug PARTITIONS. */
1968 extern void debug_rdg_partitions (const vec<partition *> &);
1970 DEBUG_FUNCTION void
1971 debug_rdg_partitions (const vec<partition *> &partitions)
1973 dump_rdg_partitions (stderr, partitions);
1976 /* Returns the number of read and write operations in the RDG. */
1978 static int
1979 number_of_rw_in_rdg (struct graph *rdg)
1981 int i, res = 0;
1983 for (i = 0; i < rdg->n_vertices; i++)
1985 if (RDG_MEM_WRITE_STMT (rdg, i))
1986 ++res;
1988 if (RDG_MEM_READS_STMT (rdg, i))
1989 ++res;
1992 return res;
1995 /* Returns the number of read and write operations in a PARTITION of
1996 the RDG. */
1998 static int
1999 number_of_rw_in_partition (struct graph *rdg, partition *partition)
2001 int res = 0;
2002 unsigned i;
2003 bitmap_iterator ii;
2005 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
2007 if (RDG_MEM_WRITE_STMT (rdg, i))
2008 ++res;
2010 if (RDG_MEM_READS_STMT (rdg, i))
2011 ++res;
2014 return res;
2017 /* Returns true when one of the PARTITIONS contains all the read or
2018 write operations of RDG. */
2020 static bool
2021 partition_contains_all_rw (struct graph *rdg,
2022 const vec<partition *> &partitions)
2024 int i;
2025 partition *partition;
2026 int nrw = number_of_rw_in_rdg (rdg);
2028 FOR_EACH_VEC_ELT (partitions, i, partition)
2029 if (nrw == number_of_rw_in_partition (rdg, partition))
2030 return true;
2032 return false;
2036 loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir,
2037 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
2039 unsigned i, j;
2040 bitmap_iterator bi, bj;
2041 data_reference_p dr1, dr2, saved_dr1;
2043 /* dependence direction - 0 is no dependence, -1 is back,
2044 1 is forth, 2 is both (we can stop then, merging will occur). */
2045 EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
2047 dr1 = datarefs_vec[i];
2049 EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
2051 int res, this_dir = 1;
2052 ddr_p ddr;
2054 dr2 = datarefs_vec[j];
2056 /* Skip all <read, read> data dependence. */
2057 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
2058 continue;
2060 saved_dr1 = dr1;
2061 /* Re-shuffle data-refs to be in topological order. */
2062 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
2063 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
2065 std::swap (dr1, dr2);
2066 this_dir = -this_dir;
2068 ddr = get_data_dependence (rdg, dr1, dr2);
2069 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2071 this_dir = 0;
2072 res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
2073 DR_BASE_ADDRESS (dr2));
2074 /* Be conservative. If the data references are not well analyzed,
2075 or the two data references have the same base address and
2076 offset, add a dependence and consider them as aliasing each other.
2077 In other words, the dependence cannot be resolved by a
2078 runtime alias check. */
2079 if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
2080 || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
2081 || !DR_INIT (dr1) || !DR_INIT (dr2)
2082 || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
2083 || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
2084 || res == 0)
2085 this_dir = 2;
2086 /* Data dependence could be resolved by runtime alias check,
2087 record it in ALIAS_DDRS. */
2088 else if (alias_ddrs != NULL)
2089 alias_ddrs->safe_push (ddr);
2090 /* Or simply ignore it. */
2092 else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
2094 if (DDR_REVERSED_P (ddr))
2095 this_dir = -this_dir;
2097 /* Known dependences can still be unordered throughout the
2098 iteration space, see gcc.dg/tree-ssa/ldist-16.c and
2099 gcc.dg/tree-ssa/pr94969.c. */
2100 if (DDR_NUM_DIST_VECTS (ddr) != 1)
2101 this_dir = 2;
2102 /* If the overlap is exact preserve stmt order. */
2103 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
2104 DDR_NB_LOOPS (ddr)))
2106 /* Else as the distance vector is lexicographic positive swap
2107 the dependence direction. */
2108 else
2109 this_dir = -this_dir;
2111 else
2112 this_dir = 0;
2113 if (this_dir == 2)
2114 return 2;
2115 else if (dir == 0)
2116 dir = this_dir;
2117 else if (this_dir != 0 && dir != this_dir)
2118 return 2;
2119 /* Shuffle "back" dr1. */
2120 dr1 = saved_dr1;
2123 return dir;
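/* Illustrative example (arrays hypothetical): for partitions built from

   |A[i] = ...;    <- in partition 1, the store
   |...  = A[i];   <- in partition 2, a load of the same location

   the distance vector is all zeros, so statement order is preserved and
   the function returns 1 ("forth"); build_partition_graph then adds an
   edge from partition 1 to partition 2, keeping the store before the
   load.  */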
2126 /* Compare postorder number of the partition graph vertices V1 and V2. */
2128 static int
2129 pgcmp (const void *v1_, const void *v2_)
2131 const vertex *v1 = (const vertex *)v1_;
2132 const vertex *v2 = (const vertex *)v2_;
2133 return v2->post - v1->post;
2136 /* Data attached to vertices of partition dependence graph. */
2137 struct pg_vdata
2139 /* ID of the corresponding partition. */
2140 int id;
2141 /* The partition. */
2142 struct partition *partition;
2145 /* Data attached to edges of partition dependence graph. */
2146 struct pg_edata
2148 /* If the dependence edge can be resolved by runtime alias check,
2149 this vector contains data dependence relations for runtime alias
2150 check. On the other hand, if the dependence edge is introduced
2151 because of compilation time known data dependence, this vector
2152 contains nothing. */
2153 vec<ddr_p> alias_ddrs;
2156 /* Callback data for traversing edges in graph. */
2157 struct pg_edge_callback_data
2159 /* Bitmap of strongly connected components that should be merged. */
2160 bitmap sccs_to_merge;
2161 /* Array containing component information for all vertices. */
2162 int *vertices_component;
2163 /* Array containing postorder information for all vertices. */
2164 int *vertices_post;
2165 /* Vector to record all data dependence relations which are needed
2166 to break strongly connected components by runtime alias checks. */
2167 vec<ddr_p> *alias_ddrs;
2170 /* Initialize vertices' data for partition dependence graph PG with
2171 PARTITIONS. */
2173 static void
2174 init_partition_graph_vertices (struct graph *pg,
2175 vec<struct partition *> *partitions)
2177 int i;
2178 partition *partition;
2179 struct pg_vdata *data;
2181 for (i = 0; partitions->iterate (i, &partition); ++i)
2183 data = new pg_vdata;
2184 pg->vertices[i].data = data;
2185 data->id = i;
2186 data->partition = partition;
2190 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
2191 dependence relations to the EDGE if DDRS isn't NULL. */
2193 static void
2194 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2196 struct graph_edge *e = add_edge (pg, i, j);
2198 /* If the edge has data dependence relations attached, it means this
2199 dependence edge can be resolved by runtime alias checks. */
2200 if (ddrs != NULL)
2202 struct pg_edata *data = new pg_edata;
2204 gcc_assert (ddrs->length () > 0);
2205 e->data = data;
2206 data->alias_ddrs = vNULL;
2207 data->alias_ddrs.safe_splice (*ddrs);
2211 /* Callback function for the graph traversal algorithm. It returns true
2212 if edge E should be skipped when traversing the graph. */
2214 static bool
2215 pg_skip_alias_edge (struct graph_edge *e)
2217 struct pg_edata *data = (struct pg_edata *)e->data;
2218 return (data != NULL && data->alias_ddrs.length () > 0);
2221 /* Callback function freeing data attached to edge E of graph. */
2223 static void
2224 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2226 if (e->data != NULL)
2228 struct pg_edata *data = (struct pg_edata *)e->data;
2229 data->alias_ddrs.release ();
2230 delete data;
2234 /* Free data attached to vertices of partition dependence graph PG. */
2236 static void
2237 free_partition_graph_vdata (struct graph *pg)
2239 int i;
2240 struct pg_vdata *data;
2242 for (i = 0; i < pg->n_vertices; ++i)
2244 data = (struct pg_vdata *)pg->vertices[i].data;
2245 delete data;
2249 /* Build and return the partition dependence graph for PARTITIONS. RDG is
2250 the reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P
2251 is true, data dependence caused by possible aliasing between references
2252 is ignored, as if it doesn't exist at all; otherwise all dependences
2253 are considered. */
2255 struct graph *
2256 loop_distribution::build_partition_graph (struct graph *rdg,
2257 vec<struct partition *> *partitions,
2258 bool ignore_alias_p)
2260 int i, j;
2261 struct partition *partition1, *partition2;
2262 graph *pg = new_graph (partitions->length ());
2263 auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2265 alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2267 init_partition_graph_vertices (pg, partitions);
2269 for (i = 0; partitions->iterate (i, &partition1); ++i)
2271 for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2273 /* dependence direction - 0 is no dependence, -1 is back,
2274 1 is forth, 2 is both (we can stop then, merging will occur). */
2275 int dir = 0;
2277 /* If the first partition has a reduction, add a back edge; if the
2278 second partition has a reduction, add a forth edge. This makes
2279 sure that the reduction partition will be sorted as the last one. */
2280 if (partition_reduction_p (partition1))
2281 dir = -1;
2282 else if (partition_reduction_p (partition2))
2283 dir = 1;
2285 /* Cleanup the temporary vector. */
2286 alias_ddrs.truncate (0);
2288 dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2289 partition2->datarefs, alias_ddrs_p);
2291 /* Add an edge to the partition graph if there exists a dependence.
2292 There are two types of edges. One type is caused by compilation
2293 time known dependences; this type cannot be resolved by runtime
2294 alias checks. The other type can be resolved by runtime alias
2295 checks. */
2296 if (dir == 1 || dir == 2
2297 || alias_ddrs.length () > 0)
2299 /* Attach data dependence relations to edge that can be resolved
2300 by runtime alias check. */
2301 bool alias_edge_p = (dir != 1 && dir != 2);
2302 add_partition_graph_edge (pg, i, j,
2303 (alias_edge_p) ? &alias_ddrs : NULL);
2305 if (dir == -1 || dir == 2
2306 || alias_ddrs.length () > 0)
2308 /* Attach data dependence relations to edge that can be resolved
2309 by runtime alias check. */
2310 bool alias_edge_p = (dir != -1 && dir != 2);
2311 add_partition_graph_edge (pg, j, i,
2312 (alias_edge_p) ? &alias_ddrs : NULL);
2316 return pg;
2319 /* Sort partitions in PG in descending post order and store them in
2320 PARTITIONS. */
2322 static void
2323 sort_partitions_by_post_order (struct graph *pg,
2324 vec<struct partition *> *partitions)
2326 int i;
2327 struct pg_vdata *data;
2329 /* Now order the remaining nodes in descending postorder. */
2330 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2331 partitions->truncate (0);
2332 for (i = 0; i < pg->n_vertices; ++i)
2334 data = (struct pg_vdata *)pg->vertices[i].data;
2335 if (data->partition)
2336 partitions->safe_push (data->partition);
2340 void
2341 loop_distribution::merge_dep_scc_partitions (struct graph *rdg,
2342 vec<struct partition *> *partitions,
2343 bool ignore_alias_p)
2345 struct partition *partition1, *partition2;
2346 struct pg_vdata *data;
2347 graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2348 int i, j, num_sccs = graphds_scc (pg, NULL);
2350 /* A strongly connected component means a dependence cycle, so we cannot
2351 distribute its partitions. Fuse them together. */
2352 if ((unsigned) num_sccs < partitions->length ())
2354 for (i = 0; i < num_sccs; ++i)
2356 for (j = 0; partitions->iterate (j, &partition1); ++j)
2357 if (pg->vertices[j].component == i)
2358 break;
2359 for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2360 if (pg->vertices[j].component == i)
2362 partition_merge_into (NULL, partition1,
2363 partition2, FUSE_SAME_SCC);
2364 partition1->type = PTYPE_SEQUENTIAL;
2365 (*partitions)[j] = NULL;
2366 partition_free (partition2);
2367 data = (struct pg_vdata *)pg->vertices[j].data;
2368 data->partition = NULL;
2373 sort_partitions_by_post_order (pg, partitions);
2374 gcc_assert (partitions->length () == (unsigned)num_sccs);
2375 free_partition_graph_vdata (pg);
2376 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2377 free_graph (pg);
2380 /* Callback function for traversing edge E in graph G. DATA is private
2381 callback data. */
2383 static void
2384 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2386 int i, j, component;
2387 struct pg_edge_callback_data *cbdata;
2388 struct pg_edata *edata = (struct pg_edata *) e->data;
2390 /* If the edge doesn't have attached data dependences, it represents
2391 compilation time known dependences. This type of dependence cannot
2392 be resolved by runtime alias checks. */
2393 if (edata == NULL || edata->alias_ddrs.length () == 0)
2394 return;
2396 cbdata = (struct pg_edge_callback_data *) data;
2397 i = e->src;
2398 j = e->dest;
2399 component = cbdata->vertices_component[i];
2400 /* Vertices are topologically sorted according to compilation time
2401 known dependences, so we can break strongly connected components
2402 by removing edges of the opposite direction, i.e., edges pointing
2403 from a vertex with smaller post number to a vertex with bigger post
2404 number. */
2405 if (g->vertices[i].post < g->vertices[j].post
2406 /* We only need to remove edges connecting vertices in the same
2407 strongly connected component to break it. */
2408 && component == cbdata->vertices_component[j]
2409 /* Check if we want to break the strong connected component or not. */
2410 && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2411 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2414 /* This is the main function breaking strongly connected components in
2415 PARTITIONS, given the reduced dependence graph RDG. Store data dependence
2416 relations for runtime alias checks in ALIAS_DDRS. */
2417 void
2418 loop_distribution::break_alias_scc_partitions (struct graph *rdg,
2419 vec<struct partition *> *partitions,
2420 vec<ddr_p> *alias_ddrs)
2422 int i, j, k, num_sccs, num_sccs_no_alias = 0;
2423 /* Build partition dependence graph. */
2424 graph *pg = build_partition_graph (rdg, partitions, false);
2426 alias_ddrs->truncate (0);
2427 /* Find strongly connected components in the graph, with all dependence edges
2428 considered. */
2429 num_sccs = graphds_scc (pg, NULL);
2430 /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2431 compilation time known dependences are merged before this function. */
2432 if ((unsigned) num_sccs < partitions->length ())
2434 struct pg_edge_callback_data cbdata;
2435 auto_bitmap sccs_to_merge;
2436 auto_vec<enum partition_type> scc_types;
2437 struct partition *partition, *first;
2439 /* If all partitions in an SCC have the same type, we can simply merge the
2440 SCC. This loop finds such SCCs and records them in a bitmap. */
2441 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2442 for (i = 0; i < num_sccs; ++i)
2444 for (j = 0; partitions->iterate (j, &first); ++j)
2445 if (pg->vertices[j].component == i)
2446 break;
2448 bool same_type = true, all_builtins = partition_builtin_p (first);
2449 for (++j; partitions->iterate (j, &partition); ++j)
2451 if (pg->vertices[j].component != i)
2452 continue;
2454 if (first->type != partition->type)
2456 same_type = false;
2457 break;
2459 all_builtins &= partition_builtin_p (partition);
2461 /* Merge the SCC if all partitions in the SCC have the same type, even
2462 though the resulting partition is sequential, because the vectorizer
2463 can do a better runtime alias check. One exception is when all
2464 partitions in the SCC are builtins. */
2465 if (!same_type || all_builtins)
2466 bitmap_clear_bit (sccs_to_merge, i);
2469 /* Initialize callback data for traversing. */
2470 cbdata.sccs_to_merge = sccs_to_merge;
2471 cbdata.alias_ddrs = alias_ddrs;
2472 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2473 cbdata.vertices_post = XNEWVEC (int, pg->n_vertices);
2474 /* Record the component information which will be corrupted by the next
2475 graph SCC finding call. */
2476 for (i = 0; i < pg->n_vertices; ++i)
2477 cbdata.vertices_component[i] = pg->vertices[i].component;
2479 /* Collect data dependences for runtime alias checks to break SCCs. */
2480 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2482 /* Record the postorder information which will be corrupted by the next
2483 graph SCC finding call. */
2484 for (i = 0; i < pg->n_vertices; ++i)
2485 cbdata.vertices_post[i] = pg->vertices[i].post;
2487 /* Run SCC finding algorithm again, with alias dependence edges
2488 skipped. This is to topologically sort partitions according to
2489 compilation time known dependence. Note the topological order
2490 is stored in the form of pg's post order number. */
2491 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2492 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2493 /* With the topological order, we can construct two subgraphs L and R.
2494 L contains edges <x, y> where x < y in terms of post order, while
2495 R contains edges <x, y> where x > y. Edges for compilation time
2496 known dependences all fall in R, so we break SCCs by removing all
2497 (alias) edges in subgraph L. */
2498 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2501 /* For each SCC that doesn't need to be broken, merge it. */
2502 for (i = 0; i < num_sccs; ++i)
2504 if (!bitmap_bit_p (sccs_to_merge, i))
2505 continue;
2507 for (j = 0; partitions->iterate (j, &first); ++j)
2508 if (cbdata.vertices_component[j] == i)
2509 break;
2510 for (k = j + 1; partitions->iterate (k, &partition); ++k)
2512 struct pg_vdata *data;
2514 if (cbdata.vertices_component[k] != i)
2515 continue;
2517 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2518 (*partitions)[k] = NULL;
2519 partition_free (partition);
2520 data = (struct pg_vdata *)pg->vertices[k].data;
2521 gcc_assert (data->id == k);
2522 data->partition = NULL;
2523 /* The result partition of merged SCC must be sequential. */
2524 first->type = PTYPE_SEQUENTIAL;
2527 /* Restore the postorder information if it was corrupted when finding SCCs
2528 with alias dependence edges skipped. If the reduction partition's SCC is
2529 broken by runtime alias checks, we force a negative post order on it,
2530 making sure it will be scheduled last. */
2531 if (num_sccs_no_alias > 0)
2533 j = -1;
2534 for (i = 0; i < pg->n_vertices; ++i)
2536 pg->vertices[i].post = cbdata.vertices_post[i];
2537 struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data;
2538 if (data->partition && partition_reduction_p (data->partition))
2540 gcc_assert (j == -1);
2541 j = i;
2544 if (j >= 0)
2545 pg->vertices[j].post = -1;
2548 free (cbdata.vertices_component);
2549 free (cbdata.vertices_post);
2552 sort_partitions_by_post_order (pg, partitions);
2553 free_partition_graph_vdata (pg);
2554 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2555 free_graph (pg);
2557 if (dump_file && (dump_flags & TDF_DETAILS))
2559 fprintf (dump_file, "Possible alias data dependence to break:\n");
2560 dump_data_dependence_relations (dump_file, *alias_ddrs);
2564 /* Compute and return an expression whose value is the segment length which
2565 will be accessed by DR in NITERS iterations. */
2567 static tree
2568 data_ref_segment_size (struct data_reference *dr, tree niters)
2570 niters = size_binop (MINUS_EXPR,
2571 fold_convert (sizetype, niters),
2572 size_one_node);
2573 return size_binop (MULT_EXPR,
2574 fold_convert (sizetype, DR_STEP (dr)),
2575 fold_convert (sizetype, niters));
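/* Worked example (numbers purely illustrative): for a data reference with
   DR_STEP of 4 bytes and NITERS of 100, the segment length computed above
   is 4 * (100 - 1) = 396 bytes, i.e. the distance from the first accessed
   address to the last one.  */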
2578 /* Return true if LOOP's latch is dominated by the statement for data reference
2579 DR. */
2581 static inline bool
2582 latch_dominated_by_data_ref (class loop *loop, data_reference *dr)
2584 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2585 gimple_bb (DR_STMT (dr)));
2588 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2589 data dependence relations ALIAS_DDRS. */
2591 static void
2592 compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs,
2593 vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2595 unsigned int i;
2596 unsigned HOST_WIDE_INT factor = 1;
2597 tree niters_plus_one, niters = number_of_latch_executions (loop);
2599 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2600 niters = fold_convert (sizetype, niters);
2601 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2603 if (dump_file && (dump_flags & TDF_DETAILS))
2604 fprintf (dump_file, "Creating alias check pairs:\n");
2606 /* Iterate all data dependence relations and compute alias check pairs. */
2607 for (i = 0; i < alias_ddrs->length (); i++)
2609 ddr_p ddr = (*alias_ddrs)[i];
2610 struct data_reference *dr_a = DDR_A (ddr);
2611 struct data_reference *dr_b = DDR_B (ddr);
2612 tree seg_length_a, seg_length_b;
2614 if (latch_dominated_by_data_ref (loop, dr_a))
2615 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2616 else
2617 seg_length_a = data_ref_segment_size (dr_a, niters);
2619 if (latch_dominated_by_data_ref (loop, dr_b))
2620 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2621 else
2622 seg_length_b = data_ref_segment_size (dr_b, niters);
2624 unsigned HOST_WIDE_INT access_size_a
2625 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2626 unsigned HOST_WIDE_INT access_size_b
2627 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2628 unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2629 unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2631 dr_with_seg_len_pair_t dr_with_seg_len_pair
2632 (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2633 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b),
2634 /* ??? Would WELL_ORDERED be safe? */
2635 dr_with_seg_len_pair_t::REORDERED);
2637 comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2640 if (tree_fits_uhwi_p (niters))
2641 factor = tree_to_uhwi (niters);
2643 /* Prune alias check pairs. */
2644 prune_runtime_alias_test_list (comp_alias_pairs, factor);
2645 if (dump_file && (dump_flags & TDF_DETAILS))
2646 fprintf (dump_file,
2647 "Improved number of alias checks from %d to %d\n",
2648 alias_ddrs->length (), comp_alias_pairs->length ());
2651 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2652 checks and version LOOP under condition of these runtime alias checks. */
2654 static void
2655 version_loop_by_alias_check (vec<struct partition *> *partitions,
2656 class loop *loop, vec<ddr_p> *alias_ddrs)
2658 profile_probability prob;
2659 basic_block cond_bb;
2660 class loop *nloop;
2661 tree lhs, arg0, cond_expr = NULL_TREE;
2662 gimple_seq cond_stmts = NULL;
2663 gimple *call_stmt = NULL;
2664 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2666 /* Generate code for runtime alias checks if necessary. */
2667 gcc_assert (alias_ddrs->length () > 0);
2669 if (dump_file && (dump_flags & TDF_DETAILS))
2670 fprintf (dump_file,
2671 "Version loop <%d> with runtime alias check\n", loop->num);
2673 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2674 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2675 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2676 is_gimple_val, NULL_TREE);
2678 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2679 bool cancelable_p = flag_tree_loop_vectorize;
2680 if (cancelable_p)
2682 unsigned i = 0;
2683 struct partition *partition;
2684 for (; partitions->iterate (i, &partition); ++i)
2685 if (!partition_builtin_p (partition))
2686 break;
2688 /* If all partitions are builtins, distributing the loop would be profitable
2689 and we don't want to cancel the runtime alias checks. */
2690 if (i == partitions->length ())
2691 cancelable_p = false;
2694 /* Generate internal function call for loop distribution alias check if the
2695 runtime alias check should be cancelable. */
2696 if (cancelable_p)
2698 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2699 2, NULL_TREE, cond_expr);
2700 lhs = make_ssa_name (boolean_type_node);
2701 gimple_call_set_lhs (call_stmt, lhs);
2703 else
2704 lhs = cond_expr;
2706 prob = profile_probability::guessed_always ().apply_scale (9, 10);
2707 initialize_original_copy_tables ();
2708 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2709 prob, prob.invert (), true);
2710 free_original_copy_tables ();
2711 /* Record the original loop number in newly generated loops. In case of
2712 distribution, the original loop will be distributed and the new loop
2713 is kept. */
2714 loop->orig_loop_num = nloop->num;
2715 nloop->orig_loop_num = nloop->num;
2716 nloop->dont_vectorize = true;
2717 nloop->force_vectorize = false;
2719 if (call_stmt)
2721 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2722 loop could be destroyed. */
2723 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2724 gimple_call_set_arg (call_stmt, 0, arg0);
2725 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2728 if (cond_stmts)
2730 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2731 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2733 update_ssa (TODO_update_ssa);
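/* Sketch of the generated control flow (names illustrative):

   |if (IFN_LOOP_DIST_ALIAS (<num of versioned copy>, <alias check cond>))
   |  <LOOP>    <- original loop, gets distributed
   |else
   |  <NLOOP>   <- versioned copy with dont_vectorize set

   When the check is not cancelable, the raw alias check condition guards
   the branch directly instead of the internal function call.  */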
2736 /* Return true if loop versioning is needed to distribute PARTITIONS.
2737 ALIAS_DDRS are data dependence relations for the runtime alias check. */
2739 static inline bool
2740 version_for_distribution_p (vec<struct partition *> *partitions,
2741 vec<ddr_p> *alias_ddrs)
2743 /* No need to version loop if we have only one partition. */
2744 if (partitions->length () == 1)
2745 return false;
2747 /* Need to version the loop if a runtime alias check is necessary. */
2748 return (alias_ddrs->length () > 0);
2751 /* Compare base offset of builtin mem* partitions P1 and P2. */
2753 static int
2754 offset_cmp (const void *vp1, const void *vp2)
2756 struct partition *p1 = *(struct partition *const *) vp1;
2757 struct partition *p2 = *(struct partition *const *) vp2;
2758 unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
2759 unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
2760 return (o2 < o1) - (o1 < o2);
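/* E.g. partitions with dst_base_offset 300, 0 and 100 sort to 0, 100, 300;
   the (o2 < o1) - (o1 < o2) form yields -1/0/1 without the wrap-around a
   plain subtraction of the unsigned offsets could introduce.  */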
2763 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special
2764 case optimization transforming below code:
2766 __builtin_memset (&obj, 0, 100);
2767 _1 = &obj + 100;
2768 __builtin_memset (_1, 0, 200);
2769 _2 = &obj + 300;
2770 __builtin_memset (_2, 0, 100);
2772 into:
2774 __builtin_memset (&obj, 0, 400);
2776 Note we don't have dependence information between different partitions
2777 at this point; as a result, we can't handle nonadjacent memset builtin
2778 partitions since the dependence might be broken. */
2780 static void
2781 fuse_memset_builtins (vec<struct partition *> *partitions)
2783 unsigned i, j;
2784 struct partition *part1, *part2;
2785 tree rhs1, rhs2;
2787 for (i = 0; partitions->iterate (i, &part1);)
2789 if (part1->kind != PKIND_MEMSET)
2791 i++;
2792 continue;
2795 /* Find sub-array of memset builtins of the same base. Index range
2796 of the sub-array is [i, j) with "j > i". */
2797 for (j = i + 1; partitions->iterate (j, &part2); ++j)
2799 if (part2->kind != PKIND_MEMSET
2800 || !operand_equal_p (part1->builtin->dst_base_base,
2801 part2->builtin->dst_base_base, 0))
2802 break;
2804 /* Memset calls setting different values can't be merged. */
2805 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2806 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2807 if (!operand_equal_p (rhs1, rhs2, 0))
2808 break;
2811 /* Stable sort is required in order to avoid breaking dependence. */
2812 gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
2813 offset_cmp);
2814 /* Continue with next partition. */
2815 i = j;
2818 /* Merge all consecutive memset builtin partitions. */
2819 for (i = 0; i < partitions->length () - 1;)
2821 part1 = (*partitions)[i];
2822 if (part1->kind != PKIND_MEMSET)
2824 i++;
2825 continue;
2828 part2 = (*partitions)[i + 1];
2829 /* Only merge memset partitions of the same base and with constant
2830 access sizes. */
2831 if (part2->kind != PKIND_MEMSET
2832 || TREE_CODE (part1->builtin->size) != INTEGER_CST
2833 || TREE_CODE (part2->builtin->size) != INTEGER_CST
2834 || !operand_equal_p (part1->builtin->dst_base_base,
2835 part2->builtin->dst_base_base, 0))
2837 i++;
2838 continue;
2840 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2841 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2842 int bytev1 = const_with_all_bytes_same (rhs1);
2843 int bytev2 = const_with_all_bytes_same (rhs2);
2844 /* Only merge memset partitions of the same value. */
2845 if (bytev1 != bytev2 || bytev1 == -1)
2847 i++;
2848 continue;
2850 wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2851 wi::to_wide (part1->builtin->size));
2852 /* Only merge adjacent memset partitions. */
2853 if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2855 i++;
2856 continue;
2858 /* Merge partitions[i] and partitions[i+1]. */
2859 part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2860 part1->builtin->size,
2861 part2->builtin->size);
2862 partition_free (part2);
2863 partitions->ordered_remove (i + 1);
2867 void
2868 loop_distribution::finalize_partitions (class loop *loop,
2869 vec<struct partition *> *partitions,
2870 vec<ddr_p> *alias_ddrs)
2872 unsigned i;
2873 struct partition *partition, *a;
2875 if (partitions->length () == 1
2876 || alias_ddrs->length () > 0)
2877 return;
2879 unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2880 bool same_type_p = true;
2881 enum partition_type type = ((*partitions)[0])->type;
2882 for (i = 0; partitions->iterate (i, &partition); ++i)
2884 same_type_p &= (type == partition->type);
2885 if (partition_builtin_p (partition))
2887 num_builtin++;
2888 continue;
2890 num_normal++;
2891 if (partition->kind == PKIND_PARTIAL_MEMSET)
2892 num_partial_memset++;
2895 /* Don't distribute the current loop into too many loops given we don't
2896 have a memory stream cost model. Be even more conservative in the case
2897 of loop nest distribution. */
2898 if ((same_type_p && num_builtin == 0
2899 && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2900 || (loop->inner != NULL
2901 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2902 || (loop->inner == NULL
2903 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2905 a = (*partitions)[0];
2906 for (i = 1; partitions->iterate (i, &partition); ++i)
2908 partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2909 partition_free (partition);
2911 partitions->truncate (1);
2914 /* Fuse memset builtins if possible. */
2915 if (partitions->length () > 1)
2916 fuse_memset_builtins (partitions);
2919 /* Distributes the code from LOOP in such a way that producer statements
2920 are placed before consumer statements. Tries to separate only the
2921 statements from STMTS into separate loops. Returns the number of
2922 distributed loops. Set NB_CALLS to number of generated builtin calls.
2923 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2926 loop_distribution::distribute_loop (class loop *loop,
2927 const vec<gimple *> &stmts,
2928 control_dependences *cd, int *nb_calls, bool *destroy_p,
2929 bool only_patterns_p)
2931 ddrs_table = new hash_table<ddr_hasher> (389);
2932 struct graph *rdg;
2933 partition *partition;
2934 int i, nbp;
2936 *destroy_p = false;
2937 *nb_calls = 0;
2938 loop_nest.create (0);
2939 if (!find_loop_nest (loop, &loop_nest))
2941 loop_nest.release ();
2942 delete ddrs_table;
2943 return 0;
2946 datarefs_vec.create (20);
2947 has_nonaddressable_dataref_p = false;
2948 rdg = build_rdg (loop, cd);
2949 if (!rdg)
2951 if (dump_file && (dump_flags & TDF_DETAILS))
2952 fprintf (dump_file,
2953 "Loop %d not distributed: failed to build the RDG.\n",
2954 loop->num);
2956 loop_nest.release ();
2957 free_data_refs (datarefs_vec);
2958 delete ddrs_table;
2959 return 0;
2962 if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2964 if (dump_file && (dump_flags & TDF_DETAILS))
2965 fprintf (dump_file,
2966 "Loop %d not distributed: too many memory references.\n",
2967 loop->num);
2969 free_rdg (rdg);
2970 loop_nest.release ();
2971 free_data_refs (datarefs_vec);
2972 delete ddrs_table;
2973 return 0;
2976 data_reference_p dref;
2977 for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2978 dref->aux = (void *) (uintptr_t) i;
2980 if (dump_file && (dump_flags & TDF_DETAILS))
2981 dump_rdg (dump_file, rdg);
2983 auto_vec<struct partition *, 3> partitions;
2984 rdg_build_partitions (rdg, stmts, &partitions);
2986 auto_vec<ddr_p> alias_ddrs;
2988 auto_bitmap stmt_in_all_partitions;
2989 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2990 for (i = 1; partitions.iterate (i, &partition); ++i)
2991 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2993 bool any_builtin = false;
2994 bool reduction_in_all = false;
2995 FOR_EACH_VEC_ELT (partitions, i, partition)
2997 reduction_in_all
2998 |= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2999 any_builtin |= partition_builtin_p (partition);
3002 /* If we are only distributing patterns but did not detect any,
3003 simply bail out. */
3004 if (only_patterns_p
3005 && !any_builtin)
3007 nbp = 0;
3008 goto ldist_done;
3011 /* If we are only distributing patterns fuse all partitions that
3012 were not classified as builtins. This also avoids chopping
3013 a loop into pieces, separated by builtin calls. That is, we
3014 only want no or a single loop body remaining. */
3015 struct partition *into;
3016 if (only_patterns_p)
3018 for (i = 0; partitions.iterate (i, &into); ++i)
3019 if (!partition_builtin_p (into))
3020 break;
3021 for (++i; partitions.iterate (i, &partition); ++i)
3022 if (!partition_builtin_p (partition))
3024 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
3025 partitions.unordered_remove (i);
3026 partition_free (partition);
3027 i--;
3031 /* Due to limitations in the transform phase we have to fuse all
3032 reduction partitions into the last partition so the existing
3033 loop will contain all loop-closed PHI nodes. */
3034 for (i = 0; partitions.iterate (i, &into); ++i)
3035 if (partition_reduction_p (into))
3036 break;
3037 for (i = i + 1; partitions.iterate (i, &partition); ++i)
3038 if (partition_reduction_p (partition))
3040 partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
3041 partitions.unordered_remove (i);
3042 partition_free (partition);
3043 i--;
3046 /* Apply our simple cost model - fuse partitions with similar
3047 memory accesses. */
3048 for (i = 0; partitions.iterate (i, &into); ++i)
3050 bool changed = false;
3051 if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
3052 continue;
3053 for (int j = i + 1;
3054 partitions.iterate (j, &partition); ++j)
3056 if (share_memory_accesses (rdg, into, partition))
3058 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
3059 partitions.unordered_remove (j);
3060 partition_free (partition);
3061 j--;
3062 changed = true;
3065 /* If in step 1 we fused 0 1 2 into 0,2 1 because 0 and 2 have similar
3066 accesses, while 1 and 2 have similar accesses but 0 and 1 do not,
3067 then in the next iteration we would fail to consider merging
3068 1 into 0,2. So try again if we did any merging into 0. */
3069 if (changed)
3070 i--;
3073 /* Put a non-builtin partition last if we need to preserve a reduction.
3074 ??? This is a workaround that makes sort_partitions_by_post_order do
3075 the correct thing while in reality it should sort each component
3076 separately and then put the component with a reduction or a non-builtin
3077 last. */
3078 if (reduction_in_all
3079 && partition_builtin_p (partitions.last()))
3080 FOR_EACH_VEC_ELT (partitions, i, partition)
3081 if (!partition_builtin_p (partition))
3083 partitions.unordered_remove (i);
3084 partitions.quick_push (partition);
3085 break;
3088 /* Build the partition dependency graph and fuse partitions in strong
3089 connected component. */
3090 if (partitions.length () > 1)
3092 /* Don't support loop nest distribution under runtime alias checks,
3093 since it's not likely to enable many vectorization opportunities.
3094 Likewise if the loop has any data reference which may not be addressable,
3095 since the alias check needs to take and compare the address of the object. */
3096 if (loop->inner || has_nonaddressable_dataref_p)
3097 merge_dep_scc_partitions (rdg, &partitions, false);
3098 else
3100 merge_dep_scc_partitions (rdg, &partitions, true);
3101 if (partitions.length () > 1)
3102 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
3106 finalize_partitions (loop, &partitions, &alias_ddrs);
3108 /* If there is a reduction in all partitions make sure the last one
3109 is not classified for builtin code generation. */
3110 if (reduction_in_all)
3112 partition = partitions.last ();
3113 if (only_patterns_p
3114 && partition_builtin_p (partition)
3115 && !partition_builtin_p (partitions[0]))
3117 nbp = 0;
3118 goto ldist_done;
3120 partition->kind = PKIND_NORMAL;
3123 nbp = partitions.length ();
3124 if (nbp == 0
3125 || (nbp == 1 && !partition_builtin_p (partitions[0]))
3126 || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
3128 nbp = 0;
3129 goto ldist_done;
3132 if (version_for_distribution_p (&partitions, &alias_ddrs))
3133 version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
3135 if (dump_file && (dump_flags & TDF_DETAILS))
3137 fprintf (dump_file,
3138 "distribute loop <%d> into partitions:\n", loop->num);
3139 dump_rdg_partitions (dump_file, partitions);
3142 FOR_EACH_VEC_ELT (partitions, i, partition)
3144 if (partition_builtin_p (partition))
3145 (*nb_calls)++;
3146 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
3149 ldist_done:
3150 loop_nest.release ();
3151 free_data_refs (datarefs_vec);
3152 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
3153 iter != ddrs_table->end (); ++iter)
3155 free_dependence_relation (*iter);
3156 *iter = NULL;
3158 delete ddrs_table;
3160 FOR_EACH_VEC_ELT (partitions, i, partition)
3161 partition_free (partition);
3163 free_rdg (rdg);
3164 return nbp - *nb_calls;
3168 void loop_distribution::bb_top_order_init (void)
3170 int rpo_num;
3171 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS);
3172 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3173 bitmap exit_bbs = BITMAP_ALLOC (NULL);
3175 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3176 bb_top_order_index_size = last_basic_block_for_fn (cfun);
3178 entry->flags &= ~EDGE_DFS_BACK;
3179 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
3180 rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true,
3181 rpo, NULL);
3182 BITMAP_FREE (exit_bbs);
3184 for (int i = 0; i < rpo_num; i++)
3185 bb_top_order_index[rpo[i]] = i;
3187 free (rpo);
3190 void loop_distribution::bb_top_order_destroy ()
3192 free (bb_top_order_index);
3193 bb_top_order_index = NULL;
3194 bb_top_order_index_size = 0;
3198 /* Given LOOP, this function records seed statements for distribution in
3199 WORK_LIST. Return false if there is nothing for distribution. */
3201 static bool
3202 find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list)
3204 basic_block *bbs = get_loop_body_in_dom_order (loop);
3206 /* Initialize the worklist with stmts we seed the partitions with. */
3207 for (unsigned i = 0; i < loop->num_nodes; ++i)
3209 /* In irreducible sub-regions we don't know how to redirect
3210 conditions, so fail. See PR100492. */
3211 if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
3213 if (dump_file && (dump_flags & TDF_DETAILS))
3214 fprintf (dump_file, "loop %d contains an irreducible region.\n",
3215 loop->num);
3216 work_list->truncate (0);
3217 break;
3219 for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
3220 !gsi_end_p (gsi); gsi_next (&gsi))
3222 gphi *phi = gsi.phi ();
3223 if (virtual_operand_p (gimple_phi_result (phi)))
3224 continue;
3225 /* Distribute stmts which have defs that are used outside of
3226 the loop. */
3227 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
3228 continue;
3229 work_list->safe_push (phi);
3231 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3232 !gsi_end_p (gsi); gsi_next (&gsi))
3234 gimple *stmt = gsi_stmt (gsi);
3236 /* Ignore clobbers, they do not have true side effects. */
3237 if (gimple_clobber_p (stmt))
3238 continue;
3240 /* If there is a stmt with side-effects bail out - we
3241 cannot and should not distribute this loop. */
3242 if (gimple_has_side_effects (stmt))
3244 free (bbs);
3245 return false;
3248 /* Distribute stmts which have defs that are used outside of
3249 the loop. */
3250 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3252 /* Otherwise only distribute stores for now. */
3253 else if (!gimple_vdef (stmt))
3254 continue;
3256 work_list->safe_push (stmt);
3259 free (bbs);
3260 return work_list->length () > 0;
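/* For instance (hypothetical loop, assuming sum is used after the loop), in

   |for (i = 0; i < n; i++)
   |  {
   |    A[i] = 0;
   |    sum = sum + B[i];
   |  }

   both the store to A[i] (it has a virtual definition) and the statement
   computing sum (its result is used outside of the loop) are pushed as
   seed statements, each potentially seeding its own partition.  */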
3263 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3264 perfect loop nest. */
3266 static class loop *
3267 prepare_perfect_loop_nest (class loop *loop)
3269 class loop *outer = loop_outer (loop);
3270 tree niters = number_of_latch_executions (loop);
3272 /* TODO: We only support the innermost 3-level loop nest distribution
3273 because of compilation time issues for now. This should be relaxed
3274 in the future. Note we only allow 3-level loop nest distribution
3275 when parallelizing loops. */
3276 while ((loop->inner == NULL
3277 || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3278 && loop_outer (outer)
3279 && outer->inner == loop && loop->next == NULL
3280 && single_exit (outer)
3281 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3282 && (niters = number_of_latch_executions (outer)) != NULL_TREE
3283 && niters != chrec_dont_know)
3285 loop = outer;
3286 outer = loop_outer (loop);
3289 return loop;
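/* For example (hypothetical nest), given

   |for (i = 0; i < n; i++)
   |  for (j = 0; j < m; j++)
   |    A[i][j] = 0;

   and assuming the nest is perfect, the outer loop has a single exit and
   a computable iteration count, the function is passed the innermost J
   loop and returns the outer I loop, so the whole nest can be distributed
   at once, e.g. into a single memset.  */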
3293 unsigned int
3294 loop_distribution::execute (function *fun)
3296 class loop *loop;
3297 bool changed = false;
3298 basic_block bb;
3299 control_dependences *cd = NULL;
3300 auto_vec<loop_p> loops_to_be_destroyed;
3302 if (number_of_loops (fun) <= 1)
3303 return 0;
3305 bb_top_order_init ();
3307 FOR_ALL_BB_FN (bb, fun)
3309 gimple_stmt_iterator gsi;
3310 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3311 gimple_set_uid (gsi_stmt (gsi), -1);
3312 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3313 gimple_set_uid (gsi_stmt (gsi), -1);
3316 /* We can at the moment only distribute non-nested loops, thus restrict
3317 walking to innermost loops. */
3318 for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
3320 /* Don't distribute loops with multiple exit edges, or cold loops when
3321 not doing pattern detection. */
3322 if (!single_exit (loop)
3323 || (!flag_tree_loop_distribute_patterns
3324 && !optimize_loop_for_speed_p (loop)))
3325 continue;
3327 /* Don't distribute loop if niters is unknown. */
3328 tree niters = number_of_latch_executions (loop);
3329 if (niters == NULL_TREE || niters == chrec_dont_know)
3330 continue;
3332 /* Get the perfect loop nest for distribution. */
3333 loop = prepare_perfect_loop_nest (loop);
3334 for (; loop; loop = loop->inner)
3336 auto_vec<gimple *> work_list;
3337 if (!find_seed_stmts_for_distribution (loop, &work_list))
3338 break;
3340 const char *str = loop->inner ? " nest" : "";
3341 dump_user_location_t loc = find_loop_location (loop);
3342 if (!cd)
3344 calculate_dominance_info (CDI_DOMINATORS);
3345 calculate_dominance_info (CDI_POST_DOMINATORS);
3346 cd = new control_dependences ();
3347 free_dominance_info (CDI_POST_DOMINATORS);
3350 bool destroy_p;
3351 int nb_generated_loops, nb_generated_calls;
3352 nb_generated_loops
3353 = distribute_loop (loop, work_list, cd, &nb_generated_calls,
3354 &destroy_p, (!optimize_loop_for_speed_p (loop)
3355 || !flag_tree_loop_distribution));
3356 if (destroy_p)
3357 loops_to_be_destroyed.safe_push (loop);
3359 if (nb_generated_loops + nb_generated_calls > 0)
3361 changed = true;
3362 if (dump_enabled_p ())
3363 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3364 loc, "Loop%s %d distributed: split to %d loops "
3365 "and %d library calls.\n", str, loop->num,
3366 nb_generated_loops, nb_generated_calls);
3368 break;
3371 if (dump_file && (dump_flags & TDF_DETAILS))
3372 fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3376 if (cd)
3377 delete cd;
3379 if (bb_top_order_index != NULL)
3380 bb_top_order_destroy ();
3382 if (changed)
3384 /* Destroy loop bodies that could not be reused. Do this late as we
3385 otherwise can end up referring to stale data in control dependences. */
3386 unsigned i;
3387 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3388 destroy_loop (loop);
3390 /* Cached scalar evolutions now may refer to wrong or non-existing
3391 loops. */
3392 scev_reset_htab ();
3393 mark_virtual_operands_for_renaming (fun);
3394 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3397 checking_verify_loop_structure ();
3399 return changed ? TODO_cleanup_cfg : 0;
3403 /* Distribute all loops in the current function. */
3405 namespace {
3407 const pass_data pass_data_loop_distribution =
3409 GIMPLE_PASS, /* type */
3410 "ldist", /* name */
3411 OPTGROUP_LOOP, /* optinfo_flags */
3412 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
3413 ( PROP_cfg | PROP_ssa ), /* properties_required */
3414 0, /* properties_provided */
3415 0, /* properties_destroyed */
3416 0, /* todo_flags_start */
3417 0, /* todo_flags_finish */
3420 class pass_loop_distribution : public gimple_opt_pass
3422 public:
3423 pass_loop_distribution (gcc::context *ctxt)
3424 : gimple_opt_pass (pass_data_loop_distribution, ctxt)
3427 /* opt_pass methods: */
3428 virtual bool gate (function *)
3430 return flag_tree_loop_distribution
3431 || flag_tree_loop_distribute_patterns;
3434 virtual unsigned int execute (function *);
3436 }; // class pass_loop_distribution
3438 unsigned int
3439 pass_loop_distribution::execute (function *fun)
3441 return loop_distribution ().execute (fun);
3444 } // anon namespace
3446 gimple_opt_pass *
3447 make_pass_loop_distribution (gcc::context *ctxt)
3449 return new pass_loop_distribution (ctxt);