gcc/tree-loop-distribution.cc
1 /* Loop distribution.
2 Copyright (C) 2006-2023 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 /* This pass performs loop distribution: for example, the loop
24 |DO I = 2, N
25 | A(I) = B(I) + C
26 | D(I) = A(I-1)*E
27 |ENDDO
29 is transformed to
31 |DOALL I = 2, N
32 | A(I) = B(I) + C
33 |ENDDO
35 |DOALL I = 2, N
36 | D(I) = A(I-1)*E
37 |ENDDO
39 Loop distribution is the dual of loop fusion. It separates statements
40 of a loop (or loop nest) into multiple loops (or loop nests) with the
41 same loop header. The major goal is to separate statements which may
42    be vectorized from those that can't be.  This pass implements distribution
43 in the following steps:
45       1) Seed partitions with specific types of statements.  For now we support
46          two types of seed statements: statements defining a variable used
47          outside of the loop, and statements storing to memory.
48 2) Build reduced dependence graph (RDG) for loop to be distributed.
49 The vertices (RDG:V) model all statements in the loop and the edges
50 (RDG:E) model flow and control dependencies between statements.
51 3) Apart from RDG, compute data dependencies between memory references.
52       4) Starting from a seed statement, build up a partition by adding
53          dependent statements according to RDG's dependence information.  A
54          partition is classified as parallel type if it can be executed in
55          parallel, or as sequential type if it can't.  A parallel-type partition
56          is further classified into different builtin kinds if it can be
57          implemented as builtin function calls.
58 5) Build partition dependence graph (PG) based on data dependencies.
59 The vertices (PG:V) model all partitions and the edges (PG:E) model
60          all data dependencies between every pair of partitions.  In general,
61          a data dependence is either known or unknown at compile time.  In C
62          family languages, there exist quite a few compile-time-unknown
63          dependencies because of possible aliasing between data references.
64          We categorize PG's edges into two types: "true" edges, which represent
65          compile-time-known data dependencies, and "alias" edges, for all other
66          data dependencies.
67       6) Traverse a subgraph of PG as if all "alias" edges didn't exist.  Merge
68          partitions in each strongly connected component (SCC) accordingly.
69          Build a new PG for the merged partitions.
70       7) Traverse PG again, this time with both "true" and "alias" edges
71          included.  We try to break SCCs by removing some edges.  Because
72          SCCs formed by "true" edges were all fused in step 6), we can break
73          SCCs by removing some "alias" edges.  It's NP-hard to choose an
74          optimal edge set; fortunately, a simple approximation is good enough
75          for us given the small problem size.
76       8) Collect all data dependencies of the removed "alias" edges.  Create
77          runtime alias checks for the collected data dependencies.
78       9) Version the loop under the condition of the runtime alias checks.
79          Since loop distribution generally introduces additional overhead, it
80          is only useful if vectorization is achieved in at least one distributed
81          loop.  We version the loop with the internal function call
82          IFN_LOOP_DIST_ALIAS.  If no distributed loop can be vectorized, we
83          simply remove the distributed loops and recover the original one.
85 TODO:
86       1) We only distribute the innermost two-level loop nest now.  We should
87          extend it to arbitrary loop nests in the future.
88       2) We only fuse partitions within an SCC now.  A better fusion algorithm
89          is desired to minimize loop overhead, maximize parallelism and maximize
90          data reuse.  */
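/* As an illustrative sketch (not taken from the sources above), a C loop like

     for (i = 0; i < n; i++)
       {
	 a[i] = 0;
	 b[i] = c[i] + d[i];
       }

   can be distributed into one loop that only zeroes A, which step 4) may
   further classify as a memset builtin, and a second, vectorizable loop
   computing B.  */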
92 #include "config.h"
93 #include "system.h"
94 #include "coretypes.h"
95 #include "backend.h"
96 #include "tree.h"
97 #include "gimple.h"
98 #include "cfghooks.h"
99 #include "tree-pass.h"
100 #include "ssa.h"
101 #include "gimple-pretty-print.h"
102 #include "fold-const.h"
103 #include "cfganal.h"
104 #include "gimple-iterator.h"
105 #include "gimplify-me.h"
106 #include "stor-layout.h"
107 #include "tree-cfg.h"
108 #include "tree-ssa-loop-manip.h"
109 #include "tree-ssa-loop-ivopts.h"
110 #include "tree-ssa-loop.h"
111 #include "tree-into-ssa.h"
112 #include "tree-ssa.h"
113 #include "cfgloop.h"
114 #include "tree-scalar-evolution.h"
115 #include "tree-vectorizer.h"
116 #include "tree-eh.h"
117 #include "gimple-fold.h"
118 #include "tree-affine.h"
119 #include "intl.h"
120 #include "rtl.h"
121 #include "memmodel.h"
122 #include "optabs.h"
123 #include "tree-ssa-loop-niter.h"
126 #define MAX_DATAREFS_NUM \
127 ((unsigned) param_loop_max_datarefs_for_datadeps)
129   /* Threshold controlling the number of distributed partitions.  Since it may
130      become unnecessary if a memory stream cost model is invented in the future,
131      we define it as a temporary macro rather than a parameter.  */
132 #define NUM_PARTITION_THRESHOLD (4)
134 /* Hashtable helpers. */
136 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
138 static inline hashval_t hash (const data_dependence_relation *);
139 static inline bool equal (const data_dependence_relation *,
140 const data_dependence_relation *);
143 /* Hash function for data dependence. */
145 inline hashval_t
146 ddr_hasher::hash (const data_dependence_relation *ddr)
148 inchash::hash h;
149 h.add_ptr (DDR_A (ddr));
150 h.add_ptr (DDR_B (ddr));
151 return h.end ();
154 /* Hash table equality function for data dependence. */
156 inline bool
157 ddr_hasher::equal (const data_dependence_relation *ddr1,
158 const data_dependence_relation *ddr2)
160 return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
165 #define DR_INDEX(dr) ((uintptr_t) (dr)->aux)
167 /* A Reduced Dependence Graph (RDG) vertex representing a statement. */
168 struct rdg_vertex
170 /* The statement represented by this vertex. */
171 gimple *stmt;
173 /* Vector of data-references in this statement. */
174 vec<data_reference_p> datarefs;
176 /* True when the statement contains a write to memory. */
177 bool has_mem_write;
179 /* True when the statement contains a read from memory. */
180 bool has_mem_reads;
183 #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
184 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
185 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
186 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
187 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
188 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
189 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
190 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
192 /* Data dependence type. */
194 enum rdg_dep_type
196 /* Read After Write (RAW). */
197 flow_dd = 'f',
199 /* Control dependence (execute conditional on). */
200 control_dd = 'c'
203 /* Dependence information attached to an edge of the RDG. */
205 struct rdg_edge
207 /* Type of the dependence. */
208 enum rdg_dep_type type;
211 #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type
213 /* Kind of distributed loop. */
214 enum partition_kind {
215 PKIND_NORMAL,
216     /* Partial memset means that a partition can be distributed into a loop
217        of memset calls, rather than a single memset call.  It's handled just
218        like a normal partition, i.e., distributed as a separate loop; no memset
219        call is generated.
221        Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
222        loop nest as deeply as possible.  As a result, parloop achieves better
223        parallelization by parallelizing a deeper loop nest.  This hack should
224        be unnecessary and removed once distributed memset can be understood
225        and analyzed in data reference analysis.  See PR82604 for more.  */
226 PKIND_PARTIAL_MEMSET,
227 PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
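/* An illustrative sketch of the partial memset case (hypothetical example):

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
	 a[i][j] = 0;

   where the zeroed range of each row is not contiguous with the next row
   (e.g. m is smaller than the row length): the inner loop accesses
   consecutive memory, but the whole nest does not, so the zeroing partition
   is kept as a separate loop rather than turned into a single memset call.  */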
230 /* Type of distributed loop. */
231 enum partition_type {
232     /* The distributed loop can be executed in parallel.  */
233 PTYPE_PARALLEL = 0,
234 /* The distributed loop has to be executed sequentially. */
235 PTYPE_SEQUENTIAL
238 /* Builtin info for loop distribution. */
239 struct builtin_info
241 /* data-references a kind != PKIND_NORMAL partition is about. */
242 data_reference_p dst_dr;
243 data_reference_p src_dr;
244 /* Base address and size of memory objects operated by the builtin. Note
245 both dest and source memory objects must have the same size. */
246 tree dst_base;
247 tree src_base;
248 tree size;
249 /* Base and offset part of dst_base after stripping constant offset. This
250 is only used in memset builtin distribution for now. */
251 tree dst_base_base;
252 unsigned HOST_WIDE_INT dst_base_offset;
255 /* Partition for loop distribution. */
256 struct partition
258 /* Statements of the partition. */
259 bitmap stmts;
260 /* True if the partition defines variable which is used outside of loop. */
261 bool reduction_p;
262 location_t loc;
263 enum partition_kind kind;
264 enum partition_type type;
265 /* Data references in the partition. */
266 bitmap datarefs;
267     /* Information about a builtin partition.  */
268 struct builtin_info *builtin;
271   /* Partitions are fused for different reasons.  */
272 enum fuse_type
274 FUSE_NON_BUILTIN = 0,
275 FUSE_REDUCTION = 1,
276 FUSE_SHARE_REF = 2,
277 FUSE_SAME_SCC = 3,
278 FUSE_FINALIZE = 4
281   /* Descriptions of the different fusing reasons.  */
282 static const char *fuse_message[] = {
283 "they are non-builtins",
284 "they have reductions",
285 "they have shared memory refs",
286 "they are in the same dependence scc",
287 "there is no point to distribute loop"};
290 /* Dump vertex I in RDG to FILE. */
292 static void
293 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
295 struct vertex *v = &(rdg->vertices[i]);
296 struct graph_edge *e;
298 fprintf (file, "(vertex %d: (%s%s) (in:", i,
299 RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
300 RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
302 if (v->pred)
303 for (e = v->pred; e; e = e->pred_next)
304 fprintf (file, " %d", e->src);
306 fprintf (file, ") (out:");
308 if (v->succ)
309 for (e = v->succ; e; e = e->succ_next)
310 fprintf (file, " %d", e->dest);
312 fprintf (file, ")\n");
313 print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
314 fprintf (file, ")\n");
317 /* Call dump_rdg_vertex on stderr. */
319 DEBUG_FUNCTION void
320 debug_rdg_vertex (struct graph *rdg, int i)
322 dump_rdg_vertex (stderr, rdg, i);
325 /* Dump the reduced dependence graph RDG to FILE. */
327 static void
328 dump_rdg (FILE *file, struct graph *rdg)
330 fprintf (file, "(rdg\n");
331 for (int i = 0; i < rdg->n_vertices; i++)
332 dump_rdg_vertex (file, rdg, i);
333 fprintf (file, ")\n");
336 /* Call dump_rdg on stderr. */
338 DEBUG_FUNCTION void
339 debug_rdg (struct graph *rdg)
341 dump_rdg (stderr, rdg);
344 static void
345 dot_rdg_1 (FILE *file, struct graph *rdg)
347 int i;
348 pretty_printer buffer;
349 pp_needs_newline (&buffer) = false;
350 buffer.buffer->stream = file;
352 fprintf (file, "digraph RDG {\n");
354 for (i = 0; i < rdg->n_vertices; i++)
356 struct vertex *v = &(rdg->vertices[i]);
357 struct graph_edge *e;
359 fprintf (file, "%d [label=\"[%d] ", i, i);
360 pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
361 pp_flush (&buffer);
362 fprintf (file, "\"]\n");
364 /* Highlight reads from memory. */
365 if (RDG_MEM_READS_STMT (rdg, i))
366 fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
368 /* Highlight stores to memory. */
369 if (RDG_MEM_WRITE_STMT (rdg, i))
370 fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
372 if (v->succ)
373 for (e = v->succ; e; e = e->succ_next)
374 switch (RDGE_TYPE (e))
376 case flow_dd:
377 /* These are the most common dependences: don't print these. */
378 fprintf (file, "%d -> %d \n", i, e->dest);
379 break;
381 case control_dd:
382 fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
383 break;
385 default:
386 gcc_unreachable ();
390 fprintf (file, "}\n\n");
393 /* Display the Reduced Dependence Graph using dotty. */
395 DEBUG_FUNCTION void
396 dot_rdg (struct graph *rdg)
398 /* When debugging, you may want to enable the following code. */
399 #ifdef HAVE_POPEN
400 FILE *file = popen ("dot -Tx11", "w");
401 if (!file)
402 return;
403 dot_rdg_1 (file, rdg);
404 fflush (file);
405 close (fileno (file));
406 pclose (file);
407 #else
408 dot_rdg_1 (stderr, rdg);
409 #endif
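/* These DEBUG_FUNCTION dumpers are meant to be called from the debugger
   while working on the pass, e.g. (a usage sketch):

       (gdb) call debug_rdg (rdg)
       (gdb) call debug_rdg_vertex (rdg, 0)
       (gdb) call dot_rdg (rdg)

   dot_rdg pipes the graph to "dot -Tx11" when popen is available and falls
   back to dumping the dot source to stderr otherwise.  */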
412 /* Returns the index of STMT in RDG. */
414 static int
415 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
417 int index = gimple_uid (stmt);
418 gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
419 return index;
422 /* Creates dependence edges in RDG for all the uses of DEF. IDEF is
423 the index of DEF in RDG. */
425 static void
426 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
428 use_operand_p imm_use_p;
429 imm_use_iterator iterator;
431 FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
433 struct graph_edge *e;
434 int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
436 if (use < 0)
437 continue;
439 e = add_edge (rdg, idef, use);
440 e->data = XNEW (struct rdg_edge);
441 RDGE_TYPE (e) = flow_dd;
445 /* Creates an edge for the control dependences of BB to the vertex V. */
447 static void
448 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
449 int v, control_dependences *cd)
451 bitmap_iterator bi;
452 unsigned edge_n;
453 EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
454 0, edge_n, bi)
456 basic_block cond_bb = cd->get_edge_src (edge_n);
457 gimple *stmt = *gsi_last_bb (cond_bb);
458 if (stmt && is_ctrl_stmt (stmt))
460 struct graph_edge *e;
461 int c = rdg_vertex_for_stmt (rdg, stmt);
462 if (c < 0)
463 continue;
465 e = add_edge (rdg, c, v);
466 e->data = XNEW (struct rdg_edge);
467 RDGE_TYPE (e) = control_dd;
472 /* Creates the edges of the reduced dependence graph RDG. */
474 static void
475 create_rdg_flow_edges (struct graph *rdg)
477 int i;
478 def_operand_p def_p;
479 ssa_op_iter iter;
481 for (i = 0; i < rdg->n_vertices; i++)
482 FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
483 iter, SSA_OP_DEF)
484 create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
487   /* Creates the control dependence edges of the reduced dependence graph RDG.  */
489 static void
490 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
492 int i;
494 for (i = 0; i < rdg->n_vertices; i++)
496 gimple *stmt = RDG_STMT (rdg, i);
497 if (gimple_code (stmt) == GIMPLE_PHI)
499 edge_iterator ei;
500 edge e;
501 FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
502 if (flow_bb_inside_loop_p (loop, e->src))
503 create_edge_for_control_dependence (rdg, e->src, i, cd);
505 else
506 create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
511 class loop_distribution
513 private:
514 /* The loop (nest) to be distributed. */
515 vec<loop_p> loop_nest;
517 /* Vector of data references in the loop to be distributed. */
518 vec<data_reference_p> datarefs_vec;
520     /* True if there is a nonaddressable data reference in the above vector.  */
521 bool has_nonaddressable_dataref_p;
523 /* Store index of data reference in aux field. */
525 /* Hash table for data dependence relation in the loop to be distributed. */
526 hash_table<ddr_hasher> *ddrs_table;
528 /* Array mapping basic block's index to its topological order. */
529 int *bb_top_order_index;
530 /* And size of the array. */
531 int bb_top_order_index_size;
533 /* Build the vertices of the reduced dependence graph RDG. Return false
534 if that failed. */
535 bool create_rdg_vertices (struct graph *rdg, const vec<gimple *> &stmts,
536 loop_p loop);
538 /* Initialize STMTS with all the statements of LOOP. We use topological
539 order to discover all statements. The order is important because
540      generate_loops_for_partition uses the same traversal for identifying
541 statements in loop copies. */
542 void stmts_from_loop (class loop *loop, vec<gimple *> *stmts);
545 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
546 LOOP, and one edge per flow dependence or control dependence from control
547 dependence CD. During visiting each statement, data references are also
548 collected and recorded in global data DATAREFS_VEC. */
549 struct graph * build_rdg (class loop *loop, control_dependences *cd);
551 /* Merge PARTITION into the partition DEST. RDG is the reduced dependence
552 graph and we update type for result partition if it is non-NULL. */
553 void partition_merge_into (struct graph *rdg,
554 partition *dest, partition *partition,
555 enum fuse_type ft);
558     /* Return the data dependence relation for data references A and B.  The two
559        data references must be in lexicographic order with respect to the reduced
560        dependence graph RDG.  We first try to find the ddr in the global ddr hash
561        table.  If it doesn't exist, compute the ddr and cache it.  */
562 data_dependence_relation * get_data_dependence (struct graph *rdg,
563 data_reference_p a,
564 data_reference_p b);
567     /* In the reduced dependence graph RDG for loop distribution, return true if
568        a dependence between references DR1 and DR2 leads to a dependence cycle
569        and such a dependence cycle can't be resolved by a runtime alias check.  */
570 bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1,
571 data_reference_p dr2);
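  /* For instance (an illustrative sketch), in

       for (i = 1; i < n; i++)
	 a[i] = a[i - 1] + b[i];

     the store to a[i] and the load of a[i - 1] carry a flow dependence with
     distance 1 that a runtime alias check cannot remove, so a partition
     containing both references must stay sequential.  */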
574 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
575 PARTITION1's type after merging PARTITION2 into PARTITION1. */
576 void update_type_for_merge (struct graph *rdg,
577 partition *partition1, partition *partition2);
580 /* Returns a partition with all the statements needed for computing
581 the vertex V of the RDG, also including the loop exit conditions. */
582 partition *build_rdg_partition_for_vertex (struct graph *rdg, int v);
584 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
585 if it forms builtin memcpy or memmove call. */
586 void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
587 data_reference_p dst_dr, data_reference_p src_dr);
589 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
590 For the moment we detect memset, memcpy and memmove patterns. Bitmap
591 STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.
592 Returns true if there is a reduction in all partitions and we
593 possibly did not mark PARTITION as having one for this reason. */
595 bool
596 classify_partition (loop_p loop,
597 struct graph *rdg, partition *partition,
598 bitmap stmt_in_all_partitions);
601 /* Returns true when PARTITION1 and PARTITION2 access the same memory
602 object in RDG. */
603 bool share_memory_accesses (struct graph *rdg,
604 partition *partition1, partition *partition2);
606     /* For each seed statement in STARTING_STMTS, this function builds a
607        partition for it by adding dependent statements according to the RDG.
608        All partitions are recorded in PARTITIONS.  */
609 void rdg_build_partitions (struct graph *rdg,
610 vec<gimple *> starting_stmts,
611 vec<partition *> *partitions);
613     /* Compute the partition dependence created by the data references in DRS1
614        and DRS2, modify and return DIR according to that.  If ALIAS_DDRS is
615        not NULL, we record dependences introduced by possible aliasing between
616        the two data references in ALIAS_DDRS; otherwise, we simply ignore such
617        dependences as if they don't exist at all.  */
618 int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1,
619 bitmap drs2, vec<ddr_p> *alias_ddrs);
622     /* Build and return the partition dependence graph for PARTITIONS.  RDG is
623        the reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
624        is true, data dependence caused by possible aliasing between references
625        is ignored, as if it doesn't exist at all; otherwise all dependences
626        are considered.  */
627 struct graph *build_partition_graph (struct graph *rdg,
628 vec<struct partition *> *partitions,
629 bool ignore_alias_p);
631     /* Given the reduced dependence graph RDG, merge strongly connected components
632        of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence caused by
633        possible aliasing between references is ignored, as if it doesn't exist
634        at all; otherwise all dependences are considered.  */
635 void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *>
636 *partitions, bool ignore_alias_p);
638     /* This is the main function breaking strongly connected components in
639        PARTITIONS given the reduced dependence graph RDG.  Store data dependence
640        relations for runtime alias checks in ALIAS_DDRS.  */
641 void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *>
642 *partitions, vec<ddr_p> *alias_ddrs);
645 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
646 ALIAS_DDRS contains ddrs which need runtime alias check. */
647 void finalize_partitions (class loop *loop, vec<struct partition *>
648 *partitions, vec<ddr_p> *alias_ddrs);
650 /* Distributes the code from LOOP in such a way that producer statements
651 are placed before consumer statements. Tries to separate only the
652 statements from STMTS into separate loops. Returns the number of
653 distributed loops. Set NB_CALLS to number of generated builtin calls.
654 Set *DESTROY_P to whether LOOP needs to be destroyed. */
655 int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
656 control_dependences *cd, int *nb_calls, bool *destroy_p,
657 bool only_patterns_p);
659 /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
660 replace them accordingly. */
661 bool transform_reduction_loop (loop_p loop);
663 /* Compute topological order for basic blocks. Topological order is
664 needed because data dependence is computed for data references in
665 lexicographical order. */
666 void bb_top_order_init (void);
668 void bb_top_order_destroy (void);
670 public:
672 /* Getter for bb_top_order. */
674 inline int get_bb_top_order_index_size (void)
676 return bb_top_order_index_size;
679 inline int get_bb_top_order_index (int i)
681 return bb_top_order_index[i];
684 unsigned int execute (function *fun);
688 /* If X has a smaller topological sort number than Y, returns -1;
689 if greater, returns 1. */
690 static int
691 bb_top_order_cmp_r (const void *x, const void *y, void *loop)
693 loop_distribution *_loop =
694 (loop_distribution *) loop;
696 basic_block bb1 = *(const basic_block *) x;
697 basic_block bb2 = *(const basic_block *) y;
699 int bb_top_order_index_size = _loop->get_bb_top_order_index_size ();
701 gcc_assert (bb1->index < bb_top_order_index_size
702 && bb2->index < bb_top_order_index_size);
703 gcc_assert (bb1 == bb2
704 || _loop->get_bb_top_order_index(bb1->index)
705 != _loop->get_bb_top_order_index(bb2->index));
707 return (_loop->get_bb_top_order_index(bb1->index) -
708 _loop->get_bb_top_order_index(bb2->index));
711 bool
712 loop_distribution::create_rdg_vertices (struct graph *rdg,
713 const vec<gimple *> &stmts,
714 loop_p loop)
716 int i;
717 gimple *stmt;
719 FOR_EACH_VEC_ELT (stmts, i, stmt)
721 struct vertex *v = &(rdg->vertices[i]);
723 /* Record statement to vertex mapping. */
724 gimple_set_uid (stmt, i);
726 v->data = XNEW (struct rdg_vertex);
727 RDGV_STMT (v) = stmt;
728 RDGV_DATAREFS (v).create (0);
729 RDGV_HAS_MEM_WRITE (v) = false;
730 RDGV_HAS_MEM_READS (v) = false;
731 if (gimple_code (stmt) == GIMPLE_PHI)
732 continue;
734 unsigned drp = datarefs_vec.length ();
735 if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
736 return false;
737 for (unsigned j = drp; j < datarefs_vec.length (); ++j)
739 data_reference_p dr = datarefs_vec[j];
740 if (DR_IS_READ (dr))
741 RDGV_HAS_MEM_READS (v) = true;
742 else
743 RDGV_HAS_MEM_WRITE (v) = true;
744 RDGV_DATAREFS (v).safe_push (dr);
745 has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
748 return true;
751 void
752 loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts)
754 unsigned int i;
755 basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
757 for (i = 0; i < loop->num_nodes; i++)
759 basic_block bb = bbs[i];
761 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
762 gsi_next (&bsi))
763 if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
764 stmts->safe_push (bsi.phi ());
766 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
767 gsi_next (&bsi))
769 gimple *stmt = gsi_stmt (bsi);
770 if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
771 stmts->safe_push (stmt);
775 free (bbs);
778 /* Free the reduced dependence graph RDG. */
780 static void
781 free_rdg (struct graph *rdg)
783 int i;
785 for (i = 0; i < rdg->n_vertices; i++)
787 struct vertex *v = &(rdg->vertices[i]);
788 struct graph_edge *e;
790 for (e = v->succ; e; e = e->succ_next)
791 free (e->data);
793 if (v->data)
795 gimple_set_uid (RDGV_STMT (v), -1);
796 (RDGV_DATAREFS (v)).release ();
797 free (v->data);
801 free_graph (rdg);
804 struct graph *
805 loop_distribution::build_rdg (class loop *loop, control_dependences *cd)
807 struct graph *rdg;
809 /* Create the RDG vertices from the stmts of the loop nest. */
810 auto_vec<gimple *, 10> stmts;
811 stmts_from_loop (loop, &stmts);
812 rdg = new_graph (stmts.length ());
813 if (!create_rdg_vertices (rdg, stmts, loop))
815 free_rdg (rdg);
816 return NULL;
818 stmts.release ();
820 create_rdg_flow_edges (rdg);
821 if (cd)
822 create_rdg_cd_edges (rdg, cd, loop);
824 return rdg;
828   /* Allocate and initialize a new partition.  */
830 static partition *
831 partition_alloc (void)
833 partition *partition = XCNEW (struct partition);
834 partition->stmts = BITMAP_ALLOC (NULL);
835 partition->reduction_p = false;
836 partition->loc = UNKNOWN_LOCATION;
837 partition->kind = PKIND_NORMAL;
838 partition->type = PTYPE_PARALLEL;
839 partition->datarefs = BITMAP_ALLOC (NULL);
840 return partition;
843 /* Free PARTITION. */
845 static void
846 partition_free (partition *partition)
848 BITMAP_FREE (partition->stmts);
849 BITMAP_FREE (partition->datarefs);
850 if (partition->builtin)
851 free (partition->builtin);
853 free (partition);
856 /* Returns true if the partition can be generated as a builtin. */
858 static bool
859 partition_builtin_p (partition *partition)
861 return partition->kind > PKIND_PARTIAL_MEMSET;
864 /* Returns true if the partition contains a reduction. */
866 static bool
867 partition_reduction_p (partition *partition)
869 return partition->reduction_p;
872 void
873 loop_distribution::partition_merge_into (struct graph *rdg,
874 partition *dest, partition *partition, enum fuse_type ft)
876 if (dump_file && (dump_flags & TDF_DETAILS))
878 fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
879 fprintf (dump_file, " Part 1: ");
880 dump_bitmap (dump_file, dest->stmts);
881 fprintf (dump_file, " Part 2: ");
882 dump_bitmap (dump_file, partition->stmts);
885 dest->kind = PKIND_NORMAL;
886 if (dest->type == PTYPE_PARALLEL)
887 dest->type = partition->type;
889 bitmap_ior_into (dest->stmts, partition->stmts);
890 if (partition_reduction_p (partition))
891 dest->reduction_p = true;
893 /* Further check if any data dependence prevents us from executing the
894      new partition in parallel.  */
895 if (dest->type == PTYPE_PARALLEL && rdg != NULL)
896 update_type_for_merge (rdg, dest, partition);
898 bitmap_ior_into (dest->datarefs, partition->datarefs);
902 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
903 the LOOP. */
905 static bool
906 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
908 imm_use_iterator imm_iter;
909 use_operand_p use_p;
911 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
913 if (is_gimple_debug (USE_STMT (use_p)))
914 continue;
916 basic_block use_bb = gimple_bb (USE_STMT (use_p));
917 if (!flow_bb_inside_loop_p (loop, use_bb))
918 return true;
921 return false;
924 /* Returns true when STMT defines a scalar variable used after the
925 loop LOOP. */
927 static bool
928 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
930 def_operand_p def_p;
931 ssa_op_iter op_iter;
933 if (gimple_code (stmt) == GIMPLE_PHI)
934 return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
936 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
937 if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
938 return true;
940 return false;
943 /* Return a copy of LOOP placed before LOOP. */
945 static class loop *
946 copy_loop_before (class loop *loop, bool redirect_lc_phi_defs)
948 class loop *res;
949 edge preheader = loop_preheader_edge (loop);
951 initialize_original_copy_tables ();
952 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
953 gcc_assert (res != NULL);
955     /* When a partition that is not the last one is supposed to keep the LC
956        PHIs computed, adjust their definitions.  */
957 if (redirect_lc_phi_defs)
959 edge exit = single_exit (loop);
960 for (gphi_iterator si = gsi_start_phis (exit->dest); !gsi_end_p (si);
961 gsi_next (&si))
963 gphi *phi = si.phi ();
964 if (virtual_operand_p (gimple_phi_result (phi)))
965 continue;
966 use_operand_p use_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, exit);
967 tree new_def = get_current_def (USE_FROM_PTR (use_p));
968 SET_USE (use_p, new_def);
972 free_original_copy_tables ();
973 delete_update_ssa ();
975 return res;
978 /* Creates an empty basic block after LOOP. */
980 static void
981 create_bb_after_loop (class loop *loop)
983 edge exit = single_exit (loop);
985 if (!exit)
986 return;
988 split_edge (exit);
991 /* Generate code for PARTITION from the code in LOOP. The loop is
992 copied when COPY_P is true. All the statements not flagged in the
993 PARTITION bitmap are removed from the loop or from its copy. The
994 statements are indexed in sequence inside a basic block, and the
995 basic blocks of a loop are taken in dom order. */
997 static void
998 generate_loops_for_partition (class loop *loop, partition *partition,
999 bool copy_p, bool keep_lc_phis_p)
1001 unsigned i;
1002 basic_block *bbs;
1004 if (copy_p)
1006 int orig_loop_num = loop->orig_loop_num;
1007 loop = copy_loop_before (loop, keep_lc_phis_p);
1008 gcc_assert (loop != NULL);
1009 loop->orig_loop_num = orig_loop_num;
1010 create_preheader (loop, CP_SIMPLE_PREHEADERS);
1011 create_bb_after_loop (loop);
1013 else
1015 /* Origin number is set to the new versioned loop's num. */
1016 gcc_assert (loop->orig_loop_num != loop->num);
1019 /* Remove stmts not in the PARTITION bitmap. */
1020 bbs = get_loop_body_in_dom_order (loop);
1022 if (MAY_HAVE_DEBUG_BIND_STMTS)
1023 for (i = 0; i < loop->num_nodes; i++)
1025 basic_block bb = bbs[i];
1027 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
1028 gsi_next (&bsi))
1030 gphi *phi = bsi.phi ();
1031 if (!virtual_operand_p (gimple_phi_result (phi))
1032 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
1033 reset_debug_uses (phi);
1036 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
1038 gimple *stmt = gsi_stmt (bsi);
1039 if (gimple_code (stmt) != GIMPLE_LABEL
1040 && !is_gimple_debug (stmt)
1041 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1042 reset_debug_uses (stmt);
1046 for (i = 0; i < loop->num_nodes; i++)
1048 basic_block bb = bbs[i];
1049 edge inner_exit = NULL;
1051 if (loop != bb->loop_father)
1052 inner_exit = single_exit (bb->loop_father);
1054 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
1056 gphi *phi = bsi.phi ();
1057 if (!virtual_operand_p (gimple_phi_result (phi))
1058 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
1059 remove_phi_node (&bsi, true);
1060 else
1061 gsi_next (&bsi);
1064 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
1066 gimple *stmt = gsi_stmt (bsi);
1067 if (gimple_code (stmt) != GIMPLE_LABEL
1068 && !is_gimple_debug (stmt)
1069 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1071	      /* In distribution of a loop nest, if bb is the inner loop's exit_bb,
1072	         we choose its exit edge/path in order to avoid generating an
1073	         infinite loop.  For all other cases, we choose an arbitrary
1074	         path through the empty CFG part that this unnecessary
1075	         control stmt controls.  */
1076 if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
1078 if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
1079 gimple_cond_make_true (cond_stmt);
1080 else
1081 gimple_cond_make_false (cond_stmt);
1082 update_stmt (stmt);
1084 else if (gimple_code (stmt) == GIMPLE_SWITCH)
1086 gswitch *switch_stmt = as_a <gswitch *> (stmt);
1087 gimple_switch_set_index
1088 (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
1089 update_stmt (stmt);
1091 else
1093 unlink_stmt_vdef (stmt);
1094 gsi_remove (&bsi, true);
1095 release_defs (stmt);
1096 continue;
1099 gsi_next (&bsi);
1103 free (bbs);
1106 /* If VAL memory representation contains the same value in all bytes,
1107 return that value, otherwise return -1.
1108 E.g. for 0x24242424 return 0x24, for IEEE double
1109 747708026454360457216.0 return 0x44, etc. */
1111 static int
1112 const_with_all_bytes_same (tree val)
1114 unsigned char buf[64];
1115 int i, len;
1117 if (integer_zerop (val)
1118 || (TREE_CODE (val) == CONSTRUCTOR
1119 && !TREE_CLOBBER_P (val)
1120 && CONSTRUCTOR_NELTS (val) == 0))
1121 return 0;
1123 if (real_zerop (val))
1125 /* Only return 0 for +0.0, not for -0.0, which doesn't have
1126 an all bytes same memory representation. Don't transform
1127 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */
1128 switch (TREE_CODE (val))
1130 case REAL_CST:
1131 if (!real_isneg (TREE_REAL_CST_PTR (val)))
1132 return 0;
1133 break;
1134 case COMPLEX_CST:
1135 if (!const_with_all_bytes_same (TREE_REALPART (val))
1136 && !const_with_all_bytes_same (TREE_IMAGPART (val)))
1137 return 0;
1138 break;
1139 case VECTOR_CST:
1141 unsigned int count = vector_cst_encoded_nelts (val);
1142 unsigned int j;
1143 for (j = 0; j < count; ++j)
1144 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
1145 break;
1146 if (j == count)
1147 return 0;
1148 break;
1150 default:
1151 break;
1155 if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
1156 return -1;
1158 len = native_encode_expr (val, buf, sizeof (buf));
1159 if (len == 0)
1160 return -1;
1161 for (i = 1; i < len; i++)
1162 if (buf[i] != buf[0])
1163 return -1;
1164 return buf[0];
1167 /* Generate a call to memset for PARTITION in LOOP. */
1169 static void
1170 generate_memset_builtin (class loop *loop, partition *partition)
1172 gimple_stmt_iterator gsi;
1173 tree mem, fn, nb_bytes;
1174 tree val;
1175 struct builtin_info *builtin = partition->builtin;
1176 gimple *fn_call;
1178 /* The new statements will be placed before LOOP. */
1179 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1181 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1182 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1183 false, GSI_CONTINUE_LINKING);
1184 mem = rewrite_to_non_trapping_overflow (builtin->dst_base);
1185 mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1186 false, GSI_CONTINUE_LINKING);
1188 /* This exactly matches the pattern recognition in classify_partition. */
1189 val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1190 /* Handle constants like 0x15151515 and similarly
1191 floating point constants etc. where all bytes are the same. */
1192 int bytev = const_with_all_bytes_same (val);
1193 if (bytev != -1)
1194 val = build_int_cst (integer_type_node, bytev);
1195 else if (TREE_CODE (val) == INTEGER_CST)
1196 val = fold_convert (integer_type_node, val);
1197 else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1199 tree tem = make_ssa_name (integer_type_node);
1200 gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1201 gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1202 val = tem;
1205 fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1206 fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1207 gimple_set_location (fn_call, partition->loc);
1208 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1209 fold_stmt (&gsi);
1211 if (dump_file && (dump_flags & TDF_DETAILS))
1213 fprintf (dump_file, "generated memset");
1214 if (bytev == 0)
1215 fprintf (dump_file, " zero\n");
1216 else
1217 fprintf (dump_file, "\n");
1221 /* Generate a call to memcpy for PARTITION in LOOP. */
1223 static void
1224 generate_memcpy_builtin (class loop *loop, partition *partition)
1226 gimple_stmt_iterator gsi;
1227 gimple *fn_call;
1228 tree dest, src, fn, nb_bytes;
1229 enum built_in_function kind;
1230 struct builtin_info *builtin = partition->builtin;
1232 /* The new statements will be placed before LOOP. */
1233 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1235 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1236 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1237 false, GSI_CONTINUE_LINKING);
1238 dest = rewrite_to_non_trapping_overflow (builtin->dst_base);
1239 src = rewrite_to_non_trapping_overflow (builtin->src_base);
1240 if (partition->kind == PKIND_MEMCPY
1241 || ! ptr_derefs_may_alias_p (dest, src))
1242 kind = BUILT_IN_MEMCPY;
1243 else
1244 kind = BUILT_IN_MEMMOVE;
1245 /* Try harder if we're copying a constant size. */
1246 if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes))
1248 aff_tree asrc, adest;
1249 tree_to_aff_combination (src, ptr_type_node, &asrc);
1250 tree_to_aff_combination (dest, ptr_type_node, &adest);
1251 aff_combination_scale (&adest, -1);
1252 aff_combination_add (&asrc, &adest);
1253 if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes),
1254 wi::to_poly_widest (nb_bytes)))
1255 kind = BUILT_IN_MEMCPY;
1258 dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1259 false, GSI_CONTINUE_LINKING);
1260 src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1261 false, GSI_CONTINUE_LINKING);
1262 fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1263 fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1264 gimple_set_location (fn_call, partition->loc);
1265 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1266 fold_stmt (&gsi);
1268 if (dump_file && (dump_flags & TDF_DETAILS))
1270 if (kind == BUILT_IN_MEMCPY)
1271 fprintf (dump_file, "generated memcpy\n");
1272 else
1273 fprintf (dump_file, "generated memmove\n");
1277 /* Remove and destroy the loop LOOP. */
1279 static void
1280 destroy_loop (class loop *loop)
1282 unsigned nbbs = loop->num_nodes;
1283 edge exit = single_exit (loop);
1284 basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1285 basic_block *bbs;
1286 unsigned i;
1288 bbs = get_loop_body_in_dom_order (loop);
1290 gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
1291 bool safe_p = single_pred_p (exit->dest);
1292 for (unsigned i = 0; i < nbbs; ++i)
1294	      /* We have made sure not to leave any dangling uses of SSA
1295	         names defined in the loop, with the exception of virtuals.
1296	         Make sure we replace all uses of virtual defs that will remain
1297	         outside of the loop with the bare symbol, as delete_basic_block
1298	         will release them.  */
1299 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1300 gsi_next (&gsi))
1302 gphi *phi = gsi.phi ();
1303 if (virtual_operand_p (gimple_phi_result (phi)))
1304 mark_virtual_phi_result_for_renaming (phi);
1306 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
1308 gimple *stmt = gsi_stmt (gsi);
1309 tree vdef = gimple_vdef (stmt);
1310 if (vdef && TREE_CODE (vdef) == SSA_NAME)
1311 mark_virtual_operand_for_renaming (vdef);
1312 /* Also move and eventually reset debug stmts. We can leave
1313 constant values in place in case the stmt dominates the exit.
1314 ??? Non-constant values from the last iteration can be
1315 replaced with final values if we can compute them. */
1316 if (gimple_debug_bind_p (stmt))
1318 tree val = gimple_debug_bind_get_value (stmt);
1319 gsi_move_before (&gsi, &dst_gsi);
1320 if (val
1321 && (!safe_p
1322 || !is_gimple_min_invariant (val)
1323 || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
1325 gimple_debug_bind_reset_value (stmt);
1326 update_stmt (stmt);
1329 else
1330 gsi_next (&gsi);
1334 redirect_edge_pred (exit, src);
1335 exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1336 exit->flags |= EDGE_FALLTHRU;
1337 cancel_loop_tree (loop);
1338 rescan_loop_exit (exit, false, true);
1340 i = nbbs;
1343 --i;
1344 delete_basic_block (bbs[i]);
1346 while (i != 0);
1348 free (bbs);
1350 set_immediate_dominator (CDI_DOMINATORS, dest,
1351 recompute_dominator (CDI_DOMINATORS, dest));
1354 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
1356 static bool
1357 generate_code_for_partition (class loop *loop,
1358 partition *partition, bool copy_p,
1359 bool keep_lc_phis_p)
1361 switch (partition->kind)
1363 case PKIND_NORMAL:
1364 case PKIND_PARTIAL_MEMSET:
1365 /* Reductions all have to be in the last partition. */
1366 gcc_assert (!partition_reduction_p (partition)
1367 || !copy_p);
1368 generate_loops_for_partition (loop, partition, copy_p,
1369 keep_lc_phis_p);
1370 return false;
1372 case PKIND_MEMSET:
1373 generate_memset_builtin (loop, partition);
1374 break;
1376 case PKIND_MEMCPY:
1377 case PKIND_MEMMOVE:
1378 generate_memcpy_builtin (loop, partition);
1379 break;
1381 default:
1382 gcc_unreachable ();
1385 /* Common tail for partitions we turn into a call. If this was the last
1386 partition for which we generate code, we have to destroy the loop. */
1387 if (!copy_p)
1388 return true;
1389 return false;
1392 data_dependence_relation *
1393 loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a,
1394 data_reference_p b)
1396 struct data_dependence_relation ent, **slot;
1397 struct data_dependence_relation *ddr;
1399 gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1400 gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1401 <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1402 ent.a = a;
1403 ent.b = b;
1404 slot = ddrs_table->find_slot (&ent, INSERT);
1405 if (*slot == NULL)
1407 ddr = initialize_data_dependence_relation (a, b, loop_nest);
1408 compute_affine_dependence (ddr, loop_nest[0]);
1409 *slot = ddr;
1412 return *slot;
1415 bool
1416 loop_distribution::data_dep_in_cycle_p (struct graph *rdg,
1417 data_reference_p dr1,
1418 data_reference_p dr2)
1420 struct data_dependence_relation *ddr;
1422 /* Re-shuffle data-refs to be in topological order. */
1423 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1424 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1425 std::swap (dr1, dr2);
1427 ddr = get_data_dependence (rdg, dr1, dr2);
1429 /* In case of no data dependence. */
1430 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1431 return false;
1432	  /* For an unknown data dependence or a known data dependence which can't be
1433	     expressed in a classic distance vector, we check if it can be resolved
1434	     by a runtime alias check.  If yes, we still consider the data dependence
1435	     as one that won't introduce a data dependence cycle.  */
1436 else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1437 || DDR_NUM_DIST_VECTS (ddr) == 0)
1438 return !runtime_alias_check_p (ddr, NULL, true);
1439 else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1440 return true;
1441 else if (DDR_REVERSED_P (ddr)
1442 || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1443 return false;
1445 return true;
1448 void
1449 loop_distribution::update_type_for_merge (struct graph *rdg,
1450 partition *partition1,
1451 partition *partition2)
1453 unsigned i, j;
1454 bitmap_iterator bi, bj;
1455 data_reference_p dr1, dr2;
1457 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1459 unsigned start = (partition1 == partition2) ? i + 1 : 0;
1461 dr1 = datarefs_vec[i];
1462 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1464 dr2 = datarefs_vec[j];
1465 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1466 continue;
1468	  /* The partition has to be executed sequentially if there is any
1469	     data dependence cycle.  */
1470 if (data_dep_in_cycle_p (rdg, dr1, dr2))
1472 partition1->type = PTYPE_SEQUENTIAL;
1473 return;
1479 partition *
1480 loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v)
1482 partition *partition = partition_alloc ();
1483 auto_vec<int, 3> nodes;
1484 unsigned i, j;
1485 int x;
1486 data_reference_p dr;
1488 graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1490 FOR_EACH_VEC_ELT (nodes, i, x)
1492 bitmap_set_bit (partition->stmts, x);
1494 for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1496 unsigned idx = (unsigned) DR_INDEX (dr);
1497 gcc_assert (idx < datarefs_vec.length ());
1499	      /* The partition has to be executed sequentially if there is any
1500	         unknown data reference.  */
1501 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1502 || !DR_INIT (dr) || !DR_STEP (dr))
1503 partition->type = PTYPE_SEQUENTIAL;
1505 bitmap_set_bit (partition->datarefs, idx);
1509 if (partition->type == PTYPE_SEQUENTIAL)
1510 return partition;
1512 /* Further check if any data dependence prevents us from executing the
1513	     partition in parallel.  */
1514 update_type_for_merge (rdg, partition, partition);
1516 return partition;
1519	/* Given PARTITION of LOOP and RDG, record the single load/store data
1520	   references for a builtin partition in SRC_DR/DST_DR; return false if there
1521	   are no such data references.  */
1523 static bool
1524 find_single_drs (class loop *loop, struct graph *rdg, const bitmap &partition_stmts,
1525 data_reference_p *dst_dr, data_reference_p *src_dr)
1527 unsigned i;
1528 data_reference_p single_ld = NULL, single_st = NULL;
1529 bitmap_iterator bi;
1531 EXECUTE_IF_SET_IN_BITMAP (partition_stmts, 0, i, bi)
1533 gimple *stmt = RDG_STMT (rdg, i);
1534 data_reference_p dr;
1536 if (gimple_code (stmt) == GIMPLE_PHI)
1537 continue;
1539 /* Any scalar stmts are ok. */
1540 if (!gimple_vuse (stmt))
1541 continue;
1543 /* Otherwise just regular loads/stores. */
1544 if (!gimple_assign_single_p (stmt))
1545 return false;
1547 /* But exactly one store and/or load. */
1548 for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1550 tree type = TREE_TYPE (DR_REF (dr));
1552 /* The memset, memcpy and memmove library calls are only
1553 able to deal with generic address space. */
1554 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1555 return false;
1557 if (DR_IS_READ (dr))
1559 if (single_ld != NULL)
1560 return false;
1561 single_ld = dr;
1563 else
1565 if (single_st != NULL)
1566 return false;
1567 single_st = dr;
1572 if (!single_ld && !single_st)
1573 return false;
1575 basic_block bb_ld = NULL;
1576 basic_block bb_st = NULL;
1578 if (single_ld)
1580 /* Bail out if this is a bitfield memory reference. */
1581 if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1582 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1583 return false;
1585 /* Data reference must be executed exactly once per iteration of each
1586 loop in the loop nest. We only need to check dominance information
1587 against the outermost one in a perfect loop nest because a bb can't
1588 dominate outermost loop's latch without dominating inner loop's. */
1589 bb_ld = gimple_bb (DR_STMT (single_ld));
1590 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1591 return false;
1594 if (single_st)
1596 /* Bail out if this is a bitfield memory reference. */
1597 if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1598 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1599 return false;
1601 /* Data reference must be executed exactly once per iteration.
1602 Same as single_ld, we only need to check against the outermost
1603 loop. */
1604 bb_st = gimple_bb (DR_STMT (single_st));
1605 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1606 return false;
1609 if (single_ld && single_st)
1611 /* Load and store must be in the same loop nest. */
1612 if (bb_st->loop_father != bb_ld->loop_father)
1613 return false;
1615 edge e = single_exit (bb_st->loop_father);
1616 bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1617 bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1618 if (dom_ld != dom_st)
1619 return false;
1622 *src_dr = single_ld;
1623 *dst_dr = single_st;
1624 return true;
1627	/* Given data reference DR in LOOP_NEST, this function checks the enclosing
1628	   loops from inner to outer to see if the loop's step equals the access size
1629	   at each level of the loop nest.  Return 2 if we can prove this at all loop
1630	   levels; record the access base and size in BASE and SIZE; save the loop's
1631	   step at each level in STEPS if it is not null.  For example:
1633 int arr[100][100][100];
1634 for (i = 0; i < 100; i++) ;steps[2] = 40000
1635 for (j = 100; j > 0; j--) ;steps[1] = -400
1636 for (k = 0; k < 100; k++) ;steps[0] = 4
1637 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1639	   Return 1 if we can prove the equality at the innermost loop, but not at
1640	   all loop levels.  In this case, no information is recorded.
1642	   Return 0 if no equality can be proven at any loop level.  */
1644 static int
1645 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1646 tree *size, vec<tree> *steps = NULL)
1648 location_t loc = gimple_location (DR_STMT (dr));
1649 basic_block bb = gimple_bb (DR_STMT (dr));
1650 class loop *loop = bb->loop_father;
1651 tree ref = DR_REF (dr);
1652 tree access_base = build_fold_addr_expr (ref);
1653 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1654 int res = 0;
1656 do {
1657 tree scev_fn = analyze_scalar_evolution (loop, access_base);
1658 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1659 return res;
1661 access_base = CHREC_LEFT (scev_fn);
1662 if (tree_contains_chrecs (access_base, NULL))
1663 return res;
1665 tree scev_step = CHREC_RIGHT (scev_fn);
1666 /* Only support constant steps. */
1667 if (TREE_CODE (scev_step) != INTEGER_CST)
1668 return res;
1670 enum ev_direction access_dir = scev_direction (scev_fn);
1671 if (access_dir == EV_DIR_UNKNOWN)
1672 return res;
1674 if (steps != NULL)
1675 steps->safe_push (scev_step);
1677 scev_step = fold_convert_loc (loc, sizetype, scev_step);
1678 /* Compute absolute value of scev step. */
1679 if (access_dir == EV_DIR_DECREASES)
1680 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1682	    /* At each loop level, the scev step must equal the access size.  In other
1683	       words, DR must access consecutive memory between loop iterations.  */
1684 if (!operand_equal_p (scev_step, access_size, 0))
1685 return res;
1687 /* Access stride can be computed for data reference at least for the
1688 innermost loop. */
1689 res = 1;
1691 /* Compute DR's execution times in loop. */
1692 tree niters = number_of_latch_executions (loop);
1693 niters = fold_convert_loc (loc, sizetype, niters);
1694 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1695 niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1697 /* Compute DR's overall access size in loop. */
1698 access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1699 niters, scev_step);
1700 /* Adjust base address in case of negative step. */
1701 if (access_dir == EV_DIR_DECREASES)
1703 tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1704 scev_step, access_size);
1705 access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1707 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1709 *base = access_base;
1710 *size = access_size;
1711	  /* The access stride can be computed for the data reference at each loop level.  */
1712 return 2;
1715 /* Allocate and return builtin struct. Record information like DST_DR,
1716 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1718 static struct builtin_info *
1719 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1720 tree dst_base, tree src_base, tree size)
1722 struct builtin_info *builtin = XNEW (struct builtin_info);
1723 builtin->dst_dr = dst_dr;
1724 builtin->src_dr = src_dr;
1725 builtin->dst_base = dst_base;
1726 builtin->src_base = src_base;
1727 builtin->size = size;
1728 return builtin;
1731 /* Given data reference DR in loop nest LOOP, classify if it forms builtin
1732 memset call. */
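/* For instance (an illustrative sketch, hypothetical example), a store like

     for (i = 0; i < n; i++)
       a[i] = 0;

   or one storing a loop-invariant value whose memory bytes are all the same
   (or a char-sized invariant value) over a consecutive memory range is a
   memset candidate.  */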
1734 static void
1735 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1737 gimple *stmt = DR_STMT (dr);
1738 tree base, size, rhs = gimple_assign_rhs1 (stmt);
1740 if (const_with_all_bytes_same (rhs) == -1
1741 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1742 || (TYPE_MODE (TREE_TYPE (rhs))
1743 != TYPE_MODE (unsigned_char_type_node))))
1744 return;
1746 if (TREE_CODE (rhs) == SSA_NAME
1747 && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1748 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1749 return;
1751 int res = compute_access_range (loop, dr, &base, &size);
1752 if (res == 0)
1753 return;
1754 if (res == 1)
1756 partition->kind = PKIND_PARTIAL_MEMSET;
1757 return;
1760 tree base_offset;
1761 tree base_base;
1762 split_constant_offset (base, &base_base, &base_offset);
1763 if (!cst_and_fits_in_hwi (base_offset))
1764 return;
1765 unsigned HOST_WIDE_INT const_base_offset = int_cst_value (base_offset);
1767 struct builtin_info *builtin;
1768 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1769 builtin->dst_base_base = base_base;
1770 builtin->dst_base_offset = const_base_offset;
1771 partition->builtin = builtin;
1772 partition->kind = PKIND_MEMSET;
1775 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1776 if it forms builtin memcpy or memmove call. */
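/* An illustrative sketch (not from the surrounding sources):

       for (i = 0; i < n; i++)
	 a[i] = b[i];

   is a memcpy/memmove candidate, whereas

       for (i = 0; i < n; i++)
	 a[i + 1] = a[i];

   is not, because the load reads a value written by an earlier iteration's
   store, which a single bulk copy would not reproduce.  */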
1778 void
1779 loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg,
1780 partition *partition,
1781 data_reference_p dst_dr,
1782 data_reference_p src_dr)
1784 tree base, size, src_base, src_size;
1785 auto_vec<tree> dst_steps, src_steps;
1787 /* Compute access range of both load and store. */
1788 int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1789 if (res != 2)
1790 return;
1791 res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1792 if (res != 2)
1793 return;
1795 /* They must have the same access size. */
1796 if (!operand_equal_p (size, src_size, 0))
1797 return;
1799 /* They must have the same storage order. */
1800 if (reverse_storage_order_for_component_p (DR_REF (dst_dr))
1801 != reverse_storage_order_for_component_p (DR_REF (src_dr)))
1802 return;
1804	  /* The load and store in the loop nest must access memory in the same way,
1805	     i.e., they must have the same steps in each loop of the nest.  */
1806 if (dst_steps.length () != src_steps.length ())
1807 return;
1808 for (unsigned i = 0; i < dst_steps.length (); ++i)
1809 if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1810 return;
1812	  /* Now check whether there is a dependence.  */
1813 ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1815 /* Classify as memmove if no dependence between load and store. */
1816 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1818 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1819 partition->kind = PKIND_MEMMOVE;
1820 return;
1823 /* Can't do memmove in case of unknown dependence or dependence without
1824 classical distance vector. */
1825 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1826 || DDR_NUM_DIST_VECTS (ddr) == 0)
1827 return;
1829 unsigned i;
1830 lambda_vector dist_v;
1831 int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1832 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1834 unsigned dep_lev = dependence_level (dist_v, num_lev);
1835 /* Can't do memmove if load depends on store. */
1836 if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1837 return;
1840 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1841 partition->kind = PKIND_MEMMOVE;
1842 return;
1845 bool
1846 loop_distribution::classify_partition (loop_p loop,
1847 struct graph *rdg, partition *partition,
1848 bitmap stmt_in_all_partitions)
1850 bitmap_iterator bi;
1851 unsigned i;
1852 data_reference_p single_ld = NULL, single_st = NULL;
1853 bool volatiles_p = false, has_reduction = false;
1855 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1857 gimple *stmt = RDG_STMT (rdg, i);
1859 if (gimple_has_volatile_ops (stmt))
1860 volatiles_p = true;
1862 /* If the stmt is not included by all partitions and there are uses
1863 outside of the loop, then mark the partition as a reduction. */
1864 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1866 /* Due to limitations in the transform phase we have to fuse all
1867 reduction partitions. As a result, this could cancel valid
1868 loop distribution, especially for loops whose induction variable
1869 is used outside of the loop. To work around this issue, we skip
1870 marking the partition as a reduction if the reduction stmt belongs
1871 to all partitions. In that case, the reduction will be computed
1872 correctly no matter how partitions are fused/distributed. */
1873 if (!bitmap_bit_p (stmt_in_all_partitions, i))
1874 partition->reduction_p = true;
1875 else
1876 has_reduction = true;
1880 /* Simple workaround to prevent classifying the partition as a builtin
1881 if it contains any use outside of the loop. For the case where all
1882 partitions have the reduction this simple workaround is delayed
1883 to only affect the last partition. */
1884 if (partition->reduction_p)
1885 return has_reduction;
1887 /* Perform general partition disqualification for builtins. */
1888 if (volatiles_p
1889 || !flag_tree_loop_distribute_patterns)
1890 return has_reduction;
1892 /* Find single load/store data references for builtin partition. */
1893 if (!find_single_drs (loop, rdg, partition->stmts, &single_st, &single_ld)
1894 || !single_st)
1895 return has_reduction;
1897 if (single_ld && single_st)
1899 gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1900 /* Direct aggregate copy or via an SSA name temporary. */
1901 if (load != store
1902 && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1903 return has_reduction;
1906 partition->loc = gimple_location (DR_STMT (single_st));
1908 /* Classify the builtin kind. */
1909 if (single_ld == NULL)
1910 classify_builtin_st (loop, partition, single_st);
1911 else
1912 classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1913 return has_reduction;
1916 bool
1917 loop_distribution::share_memory_accesses (struct graph *rdg,
1918 partition *partition1, partition *partition2)
1920 unsigned i, j;
1921 bitmap_iterator bi, bj;
1922 data_reference_p dr1, dr2;
1924 /* First check whether the intersection of the two partitions contains
1925 any loads or stores. Common loads are the situation that happens
1926 most often. */
1927 EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1928 if (RDG_MEM_WRITE_STMT (rdg, i)
1929 || RDG_MEM_READS_STMT (rdg, i))
1930 return true;
1932 /* Then check whether the two partitions access the same memory object. */
1933 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1935 dr1 = datarefs_vec[i];
1937 if (!DR_BASE_ADDRESS (dr1)
1938 || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1939 continue;
1941 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1943 dr2 = datarefs_vec[j];
1945 if (!DR_BASE_ADDRESS (dr2)
1946 || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1947 continue;
1949 if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1950 && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1951 && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1952 && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1953 return true;
1957 return false;
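/* As an illustrative example (not from the sources): partitions seeded by

     a[i] = b[i] + 1;     and     c[i] = a[i] * 2;

   both reference a[i] with identical base, offset, init and step, so this
   predicate returns true and the simple cost model later fuses them with
   FUSE_SHARE_REF rather than streaming array A in two separate loops.  */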
1960 /* For each seed statement in STARTING_STMTS, this function builds
1961 a partition for it by adding dependent statements according to the RDG.
1962 All partitions are recorded in PARTITIONS. */
1964 void
1965 loop_distribution::rdg_build_partitions (struct graph *rdg,
1966 vec<gimple *> starting_stmts,
1967 vec<partition *> *partitions)
1969 auto_bitmap processed;
1970 int i;
1971 gimple *stmt;
1973 FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1975 int v = rdg_vertex_for_stmt (rdg, stmt);
1977 if (dump_file && (dump_flags & TDF_DETAILS))
1978 fprintf (dump_file,
1979 "ldist asked to generate code for vertex %d\n", v);
1981 /* If the vertex is already contained in another partition, so
1982 is the partition rooted at it. */
1983 if (bitmap_bit_p (processed, v))
1984 continue;
1986 partition *partition = build_rdg_partition_for_vertex (rdg, v);
1987 bitmap_ior_into (processed, partition->stmts);
1989 if (dump_file && (dump_flags & TDF_DETAILS))
1991 fprintf (dump_file, "ldist creates useful %s partition:\n",
1992 partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1993 bitmap_print (dump_file, partition->stmts, " ", "\n");
1996 partitions->safe_push (partition);
1999 /* All vertices should have been assigned to at least one partition now,
2000 other than vertices belonging to dead code. */
2003 /* Dump to FILE the PARTITIONS. */
2005 static void
2006 dump_rdg_partitions (FILE *file, const vec<partition *> &partitions)
2008 int i;
2009 partition *partition;
2011 FOR_EACH_VEC_ELT (partitions, i, partition)
2012 debug_bitmap_file (file, partition->stmts);
2015 /* Debug PARTITIONS. */
2016 extern void debug_rdg_partitions (const vec<partition *> &);
2018 DEBUG_FUNCTION void
2019 debug_rdg_partitions (const vec<partition *> &partitions)
2021 dump_rdg_partitions (stderr, partitions);
2024 /* Returns the number of read and write operations in the RDG. */
2026 static int
2027 number_of_rw_in_rdg (struct graph *rdg)
2029 int i, res = 0;
2031 for (i = 0; i < rdg->n_vertices; i++)
2033 if (RDG_MEM_WRITE_STMT (rdg, i))
2034 ++res;
2036 if (RDG_MEM_READS_STMT (rdg, i))
2037 ++res;
2040 return res;
2043 /* Returns the number of read and write operations in a PARTITION of
2044 the RDG. */
2046 static int
2047 number_of_rw_in_partition (struct graph *rdg, partition *partition)
2049 int res = 0;
2050 unsigned i;
2051 bitmap_iterator ii;
2053 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
2055 if (RDG_MEM_WRITE_STMT (rdg, i))
2056 ++res;
2058 if (RDG_MEM_READS_STMT (rdg, i))
2059 ++res;
2062 return res;
2065 /* Returns true when one of the PARTITIONS contains all the read or
2066 write operations of RDG. */
2068 static bool
2069 partition_contains_all_rw (struct graph *rdg,
2070 const vec<partition *> &partitions)
2072 int i;
2073 partition *partition;
2074 int nrw = number_of_rw_in_rdg (rdg);
2076 FOR_EACH_VEC_ELT (partitions, i, partition)
2077 if (nrw == number_of_rw_in_partition (rdg, partition))
2078 return true;
2080 return false;
2084 loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir,
2085 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
2087 unsigned i, j;
2088 bitmap_iterator bi, bj;
2089 data_reference_p dr1, dr2, saved_dr1;
2091 /* dependence direction - 0 is no dependence, -1 is back,
2092 1 is forth, 2 is both (we can stop then, merging will occur). */
2093 EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
2095 dr1 = datarefs_vec[i];
2097 EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
2099 int res, this_dir = 1;
2100 ddr_p ddr;
2102 dr2 = datarefs_vec[j];
2104 /* Skip all <read, read> data dependences. */
2105 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
2106 continue;
2108 saved_dr1 = dr1;
2109 /* Re-shuffle data-refs to be in topological order. */
2110 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
2111 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
2113 std::swap (dr1, dr2);
2114 this_dir = -this_dir;
2116 ddr = get_data_dependence (rdg, dr1, dr2);
2117 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2119 this_dir = 0;
2120 res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
2121 DR_BASE_ADDRESS (dr2));
2122 /* Be conservative. If data references are not well analyzed,
2123 or the two data references have the same base address and
2124 offset, add a dependence and consider the references as aliasing
2125 each other. In other words, the dependence cannot be resolved by
2126 a runtime alias check. */
2127 if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
2128 || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
2129 || !DR_INIT (dr1) || !DR_INIT (dr2)
2130 || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
2131 || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
2132 || res == 0)
2133 this_dir = 2;
2134 /* Data dependence could be resolved by runtime alias check,
2135 record it in ALIAS_DDRS. */
2136 else if (alias_ddrs != NULL)
2137 alias_ddrs->safe_push (ddr);
2138 /* Or simply ignore it. */
2140 else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
2142 if (DDR_REVERSED_P (ddr))
2143 this_dir = -this_dir;
2145 /* Known dependences can still be unordered throughout the
2146 iteration space, see gcc.dg/tree-ssa/ldist-16.c and
2147 gcc.dg/tree-ssa/pr94969.c. */
2148 if (DDR_NUM_DIST_VECTS (ddr) != 1)
2149 this_dir = 2;
2150 /* If the overlap is exact preserve stmt order. */
2151 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
2152 DDR_NB_LOOPS (ddr)))
2154 /* Else, as the distance vector is lexicographically positive, swap
2155 the dependence direction. */
2156 else
2157 this_dir = -this_dir;
2159 else
2160 this_dir = 0;
2161 if (this_dir == 2)
2162 return 2;
2163 else if (dir == 0)
2164 dir = this_dir;
2165 else if (this_dir != 0 && dir != this_dir)
2166 return 2;
2167 /* Shuffle "back" dr1. */
2168 dr1 = saved_dr1;
2171 return dir;
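/* A hedged illustration of the direction convention (see also the caller in
   build_partition_graph below): a return of 1 leads to an edge partition1 ->
   partition2, i.e. partition1 is ordered before partition2, -1 to the
   reverse edge, and 2 to edges in both directions so the partitions end up
   in one SCC and have to be merged.  For example, partitions built from

     P1:  a[i] = b[i];        P2:  b[i] = a[i + 1];

   constrain each other in both directions (P1 reads B, which P2 overwrites,
   and P2 reads A ahead of where P1 writes it), so a result of 2 would be
   expected here.  */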
2174 /* Compare postorder number of the partition graph vertices V1 and V2. */
2176 static int
2177 pgcmp (const void *v1_, const void *v2_)
2179 const vertex *v1 = (const vertex *)v1_;
2180 const vertex *v2 = (const vertex *)v2_;
2181 return v2->post - v1->post;
2184 /* Data attached to vertices of partition dependence graph. */
2185 struct pg_vdata
2187 /* ID of the corresponding partition. */
2188 int id;
2189 /* The partition. */
2190 struct partition *partition;
2193 /* Data attached to edges of partition dependence graph. */
2194 struct pg_edata
2196 /* If the dependence edge can be resolved by runtime alias check,
2197 this vector contains data dependence relations for runtime alias
2198 check. On the other hand, if the dependence edge is introduced
2199 because of compilation time known data dependence, this vector
2200 contains nothing. */
2201 vec<ddr_p> alias_ddrs;
2204 /* Callback data for traversing edges in graph. */
2205 struct pg_edge_callback_data
2207 /* Bitmap of strongly connected components that should be merged. */
2208 bitmap sccs_to_merge;
2209 /* Array containing component information for all vertices. */
2210 int *vertices_component;
2211 /* Vector to record all data dependence relations which are needed
2212 to break strongly connected components by runtime alias checks. */
2213 vec<ddr_p> *alias_ddrs;
2216 /* Initialize vertices' data for partition dependence graph PG with
2217 PARTITIONS. */
2219 static void
2220 init_partition_graph_vertices (struct graph *pg,
2221 vec<struct partition *> *partitions)
2223 int i;
2224 partition *partition;
2225 struct pg_vdata *data;
2227 for (i = 0; partitions->iterate (i, &partition); ++i)
2229 data = new pg_vdata;
2230 pg->vertices[i].data = data;
2231 data->id = i;
2232 data->partition = partition;
2236 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
2237 dependence relations to the EDGE if DDRS isn't NULL. */
2239 static void
2240 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2242 struct graph_edge *e = add_edge (pg, i, j);
2244 /* If the edge has data dependence relations attached, it means this
2245 dependence edge can be resolved by runtime alias checks. */
2246 if (ddrs != NULL)
2248 struct pg_edata *data = new pg_edata;
2250 gcc_assert (ddrs->length () > 0);
2251 e->data = data;
2252 data->alias_ddrs = vNULL;
2253 data->alias_ddrs.safe_splice (*ddrs);
2257 /* Callback function for the graph traversal algorithm. It returns true
2258 if edge E should be skipped when traversing the graph. */
2260 static bool
2261 pg_skip_alias_edge (struct graph_edge *e)
2263 struct pg_edata *data = (struct pg_edata *)e->data;
2264 return (data != NULL && data->alias_ddrs.length () > 0);
2267 /* Callback function freeing data attached to edge E of graph. */
2269 static void
2270 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2272 if (e->data != NULL)
2274 struct pg_edata *data = (struct pg_edata *)e->data;
2275 data->alias_ddrs.release ();
2276 delete data;
2280 /* Free data attached to vertices of partition dependence graph PG. */
2282 static void
2283 free_partition_graph_vdata (struct graph *pg)
2285 int i;
2286 struct pg_vdata *data;
2288 for (i = 0; i < pg->n_vertices; ++i)
2290 data = (struct pg_vdata *)pg->vertices[i].data;
2291 delete data;
2295 /* Build and return partition dependence graph for PARTITIONS. RDG is
2296 reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P
2297 is true, data dependence caused by possible alias between references
2298 is ignored, as if it doesn't exist at all; otherwise all dependences
2299 are considered. */
2301 struct graph *
2302 loop_distribution::build_partition_graph (struct graph *rdg,
2303 vec<struct partition *> *partitions,
2304 bool ignore_alias_p)
2306 int i, j;
2307 struct partition *partition1, *partition2;
2308 graph *pg = new_graph (partitions->length ());
2309 auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2311 alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2313 init_partition_graph_vertices (pg, partitions);
2315 for (i = 0; partitions->iterate (i, &partition1); ++i)
2317 for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2319 /* dependence direction - 0 is no dependence, -1 is back,
2320 1 is forth, 2 is both (we can stop then, merging will occur). */
2321 int dir = 0;
2323 /* If the first partition has reduction, add back edge; if the
2324 second partition has reduction, add forth edge. This makes
2325 sure that reduction partition will be sorted as the last one. */
2326 if (partition_reduction_p (partition1))
2327 dir = -1;
2328 else if (partition_reduction_p (partition2))
2329 dir = 1;
2331 /* Cleanup the temporary vector. */
2332 alias_ddrs.truncate (0);
2334 dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2335 partition2->datarefs, alias_ddrs_p);
2337 /* Add an edge to the partition graph if a dependence exists. There
2338 are two types of edges. One type is caused by compilation
2339 time known dependence; this type cannot be resolved by a runtime
2340 alias check. The other type can be resolved by a runtime alias
2341 check. */
2342 if (dir == 1 || dir == 2
2343 || alias_ddrs.length () > 0)
2345 /* Attach data dependence relations to edge that can be resolved
2346 by runtime alias check. */
2347 bool alias_edge_p = (dir != 1 && dir != 2);
2348 add_partition_graph_edge (pg, i, j,
2349 (alias_edge_p) ? &alias_ddrs : NULL);
2351 if (dir == -1 || dir == 2
2352 || alias_ddrs.length () > 0)
2354 /* Attach data dependence relations to edge that can be resolved
2355 by runtime alias check. */
2356 bool alias_edge_p = (dir != -1 && dir != 2);
2357 add_partition_graph_edge (pg, j, i,
2358 (alias_edge_p) ? &alias_ddrs : NULL);
2362 return pg;
2365 /* Sort partitions in PG in descending post order and store them in
2366 PARTITIONS. */
2368 static void
2369 sort_partitions_by_post_order (struct graph *pg,
2370 vec<struct partition *> *partitions)
2372 int i;
2373 struct pg_vdata *data;
2375 /* Now order the remaining nodes in descending postorder. */
2376 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2377 partitions->truncate (0);
2378 for (i = 0; i < pg->n_vertices; ++i)
2380 data = (struct pg_vdata *)pg->vertices[i].data;
2381 if (data->partition)
2382 partitions->safe_push (data->partition);
2386 void
2387 loop_distribution::merge_dep_scc_partitions (struct graph *rdg,
2388 vec<struct partition *> *partitions,
2389 bool ignore_alias_p)
2391 struct partition *partition1, *partition2;
2392 struct pg_vdata *data;
2393 graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2394 int i, j, num_sccs = graphds_scc (pg, NULL);
2396 /* Strongly connected components mean dependence cycles; we cannot distribute
2397 them, so fuse them together. */
2398 if ((unsigned) num_sccs < partitions->length ())
2400 for (i = 0; i < num_sccs; ++i)
2402 for (j = 0; partitions->iterate (j, &partition1); ++j)
2403 if (pg->vertices[j].component == i)
2404 break;
2405 for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2406 if (pg->vertices[j].component == i)
2408 partition_merge_into (NULL, partition1,
2409 partition2, FUSE_SAME_SCC);
2410 partition1->type = PTYPE_SEQUENTIAL;
2411 (*partitions)[j] = NULL;
2412 partition_free (partition2);
2413 data = (struct pg_vdata *)pg->vertices[j].data;
2414 data->partition = NULL;
2419 sort_partitions_by_post_order (pg, partitions);
2420 gcc_assert (partitions->length () == (unsigned)num_sccs);
2421 free_partition_graph_vdata (pg);
2422 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2423 free_graph (pg);
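/* As an illustration (hedged, not from the sources): partitions seeded by

     a[i] = b[i];        and        b[i + 1] = a[i];

   depend on each other in both directions, so they form one SCC in the
   partition graph and are fused above into a single PTYPE_SEQUENTIAL
   partition instead of being distributed into two loops.  */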
2426 /* Callback function for traversing edge E in graph G. DATA is private
2427 callback data. */
2429 static void
2430 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2432 int i, j, component;
2433 struct pg_edge_callback_data *cbdata;
2434 struct pg_edata *edata = (struct pg_edata *) e->data;
2436 /* If the edge doesn't have attached data dependences, it represents
2437 compilation time known dependences. This type of dependence cannot
2438 be resolved by a runtime alias check. */
2439 if (edata == NULL || edata->alias_ddrs.length () == 0)
2440 return;
2442 cbdata = (struct pg_edge_callback_data *) data;
2443 i = e->src;
2444 j = e->dest;
2445 component = cbdata->vertices_component[i];
2446 /* Vertices are topologically sorted according to compilation time
2447 known dependences, so we can break strongly connected components
2448 by removing edges of the opposite direction, i.e., edges pointing
2449 from a vertex with a smaller post number to a vertex with a bigger post
2450 number. */
2451 if (g->vertices[i].post < g->vertices[j].post
2452 /* We only need to remove edges connecting vertices in the same
2453 strong connected component to break it. */
2454 && component == cbdata->vertices_component[j]
2455 /* Check if we want to break the strong connected component or not. */
2456 && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2457 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2460 /* Callback function for traversing edge E. DATA is private
2461 callback data. */
2463 static void
2464 pg_unmark_merged_alias_ddrs (struct graph *, struct graph_edge *e, void *data)
2466 int i, j, component;
2467 struct pg_edge_callback_data *cbdata;
2468 struct pg_edata *edata = (struct pg_edata *) e->data;
2470 if (edata == NULL || edata->alias_ddrs.length () == 0)
2471 return;
2473 cbdata = (struct pg_edge_callback_data *) data;
2474 i = e->src;
2475 j = e->dest;
2476 component = cbdata->vertices_component[i];
2477 /* Make sure to not skip vertices inside SCCs we are going to merge. */
2478 if (component == cbdata->vertices_component[j]
2479 && bitmap_bit_p (cbdata->sccs_to_merge, component))
2481 edata->alias_ddrs.release ();
2482 delete edata;
2483 e->data = NULL;
2487 /* This is the main function breaking strongly connected components in
2488 PARTITIONS, given reduced dependence graph RDG. Store data dependence
2489 relations for runtime alias checks in ALIAS_DDRS. */
2490 void
2491 loop_distribution::break_alias_scc_partitions (struct graph *rdg,
2492 vec<struct partition *> *partitions,
2493 vec<ddr_p> *alias_ddrs)
2495 int i, j, k, num_sccs, num_sccs_no_alias = 0;
2496 /* Build partition dependence graph. */
2497 graph *pg = build_partition_graph (rdg, partitions, false);
2499 alias_ddrs->truncate (0);
2500 /* Find strongly connected components in the graph, with all dependence edges
2501 considered. */
2502 num_sccs = graphds_scc (pg, NULL);
2503 /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2504 compilation time known dependences are merged before this function. */
2505 if ((unsigned) num_sccs < partitions->length ())
2507 struct pg_edge_callback_data cbdata;
2508 auto_bitmap sccs_to_merge;
2509 auto_vec<enum partition_type> scc_types;
2510 struct partition *partition, *first;
2512 /* If all partitions in an SCC have the same type, we can simply merge the
2513 SCC. This loop finds such SCCs and records them in the bitmap. */
2514 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2515 for (i = 0; i < num_sccs; ++i)
2517 for (j = 0; partitions->iterate (j, &first); ++j)
2518 if (pg->vertices[j].component == i)
2519 break;
2521 bool same_type = true, all_builtins = partition_builtin_p (first);
2522 for (++j; partitions->iterate (j, &partition); ++j)
2524 if (pg->vertices[j].component != i)
2525 continue;
2527 if (first->type != partition->type)
2529 same_type = false;
2530 break;
2532 all_builtins &= partition_builtin_p (partition);
2534 /* Merge the SCC if all partitions in the SCC have the same type, even though
2535 the resulting partition is sequential, because the vectorizer can do a better
2536 runtime alias check. One exception is when all partitions in the SCC are
2537 builtins. */
2538 if (!same_type || all_builtins)
2539 bitmap_clear_bit (sccs_to_merge, i);
2542 /* Initialize callback data for traversing. */
2543 cbdata.sccs_to_merge = sccs_to_merge;
2544 cbdata.alias_ddrs = alias_ddrs;
2545 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2546 /* Record the component information, which will be corrupted by the next
2547 graph SCC finding call. */
2548 for (i = 0; i < pg->n_vertices; ++i)
2549 cbdata.vertices_component[i] = pg->vertices[i].component;
2551 /* Collect data dependences for runtime alias checks to break SCCs. */
2552 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2554 /* For SCCs we want to merge clear all alias_ddrs for edges
2555 inside the component. */
2556 for_each_edge (pg, pg_unmark_merged_alias_ddrs, &cbdata);
2558 /* Run SCC finding algorithm again, with alias dependence edges
2559 skipped. This is to topologically sort partitions according to
2560 compilation time known dependence. Note the topological order
2561 is stored in the form of pg's post order number. */
2562 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2563 /* We cannot assert partitions->length () == num_sccs_no_alias
2564 since we are not ignoring alias edges in cycles we are
2565 going to merge. That's required to compute correct postorder. */
2566 /* With topological order, we can construct two subgraphs L and R.
2567 L contains edge <x, y> where x < y in terms of post order, while
2568 R contains edge <x, y> where x > y. Edges for compilation time
2569 known dependences all fall in R, so we break SCCs by removing all
2570 (alias) edges in subgraph L. */
2571 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2574 /* For SCCs that don't need to be broken, merge them. */
2575 for (i = 0; i < num_sccs; ++i)
2577 if (!bitmap_bit_p (sccs_to_merge, i))
2578 continue;
2580 for (j = 0; partitions->iterate (j, &first); ++j)
2581 if (cbdata.vertices_component[j] == i)
2582 break;
2583 for (k = j + 1; partitions->iterate (k, &partition); ++k)
2585 struct pg_vdata *data;
2587 if (cbdata.vertices_component[k] != i)
2588 continue;
2590 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2591 (*partitions)[k] = NULL;
2592 partition_free (partition);
2593 data = (struct pg_vdata *)pg->vertices[k].data;
2594 gcc_assert (data->id == k);
2595 data->partition = NULL;
2596 /* The result partition of merged SCC must be sequential. */
2597 first->type = PTYPE_SEQUENTIAL;
2600 /* If the reduction partition's SCC is broken by runtime alias checks,
2601 we force a negative post order on it, making sure it will be scheduled
2602 last. */
2603 if (num_sccs_no_alias > 0)
2605 j = -1;
2606 for (i = 0; i < pg->n_vertices; ++i)
2608 struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data;
2609 if (data->partition && partition_reduction_p (data->partition))
2611 gcc_assert (j == -1);
2612 j = i;
2615 if (j >= 0)
2616 pg->vertices[j].post = -1;
2619 free (cbdata.vertices_component);
2622 sort_partitions_by_post_order (pg, partitions);
2623 free_partition_graph_vdata (pg);
2624 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2625 free_graph (pg);
2627 if (dump_file && (dump_flags & TDF_DETAILS))
2629 fprintf (dump_file, "Possible alias data dependence to break:\n");
2630 dump_data_dependence_relations (dump_file, *alias_ddrs);
2634 /* Compute and return an expression whose value is the segment length which
2635 will be accessed by DR in NITERS iterations. */
2637 static tree
2638 data_ref_segment_size (struct data_reference *dr, tree niters)
2640 niters = size_binop (MINUS_EXPR,
2641 fold_convert (sizetype, niters),
2642 size_one_node);
2643 return size_binop (MULT_EXPR,
2644 fold_convert (sizetype, DR_STEP (dr)),
2645 fold_convert (sizetype, niters));
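/* For instance (illustrative numbers): for a data reference with a step of
   4 bytes and NITERS equal to 10, the segment length computed above is
   4 * (10 - 1) = 36 bytes, i.e. the distance between the first and the last
   access start in those iterations.  */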
2648 /* Return true if LOOP's latch is dominated by statement for data reference
2649 DR. */
2651 static inline bool
2652 latch_dominated_by_data_ref (class loop *loop, data_reference *dr)
2654 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2655 gimple_bb (DR_STMT (dr)));
2658 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2659 data dependence relations ALIAS_DDRS. */
2661 static void
2662 compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs,
2663 vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2665 unsigned int i;
2666 unsigned HOST_WIDE_INT factor = 1;
2667 tree niters_plus_one, niters = number_of_latch_executions (loop);
2669 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2670 niters = fold_convert (sizetype, niters);
2671 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2673 if (dump_file && (dump_flags & TDF_DETAILS))
2674 fprintf (dump_file, "Creating alias check pairs:\n");
2676 /* Iterate all data dependence relations and compute alias check pairs. */
2677 for (i = 0; i < alias_ddrs->length (); i++)
2679 ddr_p ddr = (*alias_ddrs)[i];
2680 struct data_reference *dr_a = DDR_A (ddr);
2681 struct data_reference *dr_b = DDR_B (ddr);
2682 tree seg_length_a, seg_length_b;
2684 if (latch_dominated_by_data_ref (loop, dr_a))
2685 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2686 else
2687 seg_length_a = data_ref_segment_size (dr_a, niters);
2689 if (latch_dominated_by_data_ref (loop, dr_b))
2690 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2691 else
2692 seg_length_b = data_ref_segment_size (dr_b, niters);
2694 unsigned HOST_WIDE_INT access_size_a
2695 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2696 unsigned HOST_WIDE_INT access_size_b
2697 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2698 unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2699 unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2701 dr_with_seg_len_pair_t dr_with_seg_len_pair
2702 (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2703 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b),
2704 /* ??? Would WELL_ORDERED be safe? */
2705 dr_with_seg_len_pair_t::REORDERED);
2707 comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2710 if (tree_fits_uhwi_p (niters))
2711 factor = tree_to_uhwi (niters);
2713 /* Prune alias check pairs. */
2714 prune_runtime_alias_test_list (comp_alias_pairs, factor);
2715 if (dump_file && (dump_flags & TDF_DETAILS))
2716 fprintf (dump_file,
2717 "Improved number of alias checks from %d to %d\n",
2718 alias_ddrs->length (), comp_alias_pairs->length ());
2721 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2722 checks and version LOOP under condition of these runtime alias checks. */
2724 static void
2725 version_loop_by_alias_check (vec<struct partition *> *partitions,
2726 class loop *loop, vec<ddr_p> *alias_ddrs)
2728 profile_probability prob;
2729 basic_block cond_bb;
2730 class loop *nloop;
2731 tree lhs, arg0, cond_expr = NULL_TREE;
2732 gimple_seq cond_stmts = NULL;
2733 gimple *call_stmt = NULL;
2734 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2736 /* Generate code for runtime alias checks if necessary. */
2737 gcc_assert (alias_ddrs->length () > 0);
2739 if (dump_file && (dump_flags & TDF_DETAILS))
2740 fprintf (dump_file,
2741 "Version loop <%d> with runtime alias check\n", loop->num);
2743 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2744 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2745 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2746 is_gimple_val, NULL_TREE);
2748 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2749 bool cancelable_p = flag_tree_loop_vectorize;
2750 if (cancelable_p)
2752 unsigned i = 0;
2753 struct partition *partition;
2754 for (; partitions->iterate (i, &partition); ++i)
2755 if (!partition_builtin_p (partition))
2756 break;
2758 /* If all partitions are builtins, distributing the loop would be profitable
2759 and we don't want to cancel the runtime alias checks. */
2760 if (i == partitions->length ())
2761 cancelable_p = false;
2764 /* Generate internal function call for loop distribution alias check if the
2765 runtime alias check should be cancelable. */
2766 if (cancelable_p)
2768 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2769 2, NULL_TREE, cond_expr);
2770 lhs = make_ssa_name (boolean_type_node);
2771 gimple_call_set_lhs (call_stmt, lhs);
2773 else
2774 lhs = cond_expr;
2776 prob = profile_probability::guessed_always ().apply_scale (9, 10);
2777 initialize_original_copy_tables ();
2778 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2779 prob, prob.invert (), true);
2780 free_original_copy_tables ();
2781 /* Record the original loop number in newly generated loops. In case of
2782 distribution, the original loop will be distributed and the new loop
2783 is kept. */
2784 loop->orig_loop_num = nloop->num;
2785 nloop->orig_loop_num = nloop->num;
2786 nloop->dont_vectorize = true;
2787 nloop->force_vectorize = false;
2789 if (call_stmt)
2791 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2792 loop could be destroyed. */
2793 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2794 gimple_call_set_arg (call_stmt, 0, arg0);
2795 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2798 if (cond_stmts)
2800 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2801 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2803 update_ssa (TODO_update_ssa_no_phi);
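/* A rough sketch of the result (illustrative only, assuming the check is
   cancelable, i.e. flag_tree_loop_vectorize is on and not all partitions
   are builtins):

     _cond = <runtime alias checks>;
     _t = .LOOP_DIST_ALIAS (<num of copied loop>, _cond);
     if (_t)
       <original loop, subsequently distributed>;
     else
       <copied loop, marked dont_vectorize>;

   The vectorizer is expected to fold the IFN_LOOP_DIST_ALIAS call later so
   that only one version survives.  */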
2806 /* Return true if loop versioning is needed to distribute PARTITIONS.
2807 ALIAS_DDRS are data dependence relations for runtime alias check. */
2809 static inline bool
2810 version_for_distribution_p (vec<struct partition *> *partitions,
2811 vec<ddr_p> *alias_ddrs)
2813 /* No need to version loop if we have only one partition. */
2814 if (partitions->length () == 1)
2815 return false;
2817 /* Need to version loop if runtime alias check is necessary. */
2818 return (alias_ddrs->length () > 0);
2821 /* Compare base offset of builtin mem* partitions P1 and P2. */
2823 static int
2824 offset_cmp (const void *vp1, const void *vp2)
2826 struct partition *p1 = *(struct partition *const *) vp1;
2827 struct partition *p2 = *(struct partition *const *) vp2;
2828 unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
2829 unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
2830 return (o2 < o1) - (o1 < o2);
2833 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special
2834 case optimization transforming below code:
2836 __builtin_memset (&obj, 0, 100);
2837 _1 = &obj + 100;
2838 __builtin_memset (_1, 0, 200);
2839 _2 = &obj + 300;
2840 __builtin_memset (_2, 0, 100);
2842 into:
2844 __builtin_memset (&obj, 0, 400);
2846 Note that we don't have dependence information between different partitions
2847 at this point; as a result, we can't handle nonadjacent memset builtin
2848 partitions since a dependence might be broken. */
2850 static void
2851 fuse_memset_builtins (vec<struct partition *> *partitions)
2853 unsigned i, j;
2854 struct partition *part1, *part2;
2855 tree rhs1, rhs2;
2857 for (i = 0; partitions->iterate (i, &part1);)
2859 if (part1->kind != PKIND_MEMSET)
2861 i++;
2862 continue;
2865 /* Find sub-array of memset builtins of the same base. Index range
2866 of the sub-array is [i, j) with "j > i". */
2867 for (j = i + 1; partitions->iterate (j, &part2); ++j)
2869 if (part2->kind != PKIND_MEMSET
2870 || !operand_equal_p (part1->builtin->dst_base_base,
2871 part2->builtin->dst_base_base, 0))
2872 break;
2874 /* Memset calls setting different values can't be merged. */
2875 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2876 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2877 if (!operand_equal_p (rhs1, rhs2, 0))
2878 break;
2881 /* Stable sort is required in order to avoid breaking dependence. */
2882 gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
2883 offset_cmp);
2884 /* Continue with next partition. */
2885 i = j;
2888 /* Merge all consecutive memset builtin partitions. */
2889 for (i = 0; i < partitions->length () - 1;)
2891 part1 = (*partitions)[i];
2892 if (part1->kind != PKIND_MEMSET)
2894 i++;
2895 continue;
2898 part2 = (*partitions)[i + 1];
2899 /* Only merge memset partitions of the same base and with constant
2900 access sizes. */
2901 if (part2->kind != PKIND_MEMSET
2902 || TREE_CODE (part1->builtin->size) != INTEGER_CST
2903 || TREE_CODE (part2->builtin->size) != INTEGER_CST
2904 || !operand_equal_p (part1->builtin->dst_base_base,
2905 part2->builtin->dst_base_base, 0))
2907 i++;
2908 continue;
2910 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2911 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2912 int bytev1 = const_with_all_bytes_same (rhs1);
2913 int bytev2 = const_with_all_bytes_same (rhs2);
2914 /* Only merge memset partitions of the same value. */
2915 if (bytev1 != bytev2 || bytev1 == -1)
2917 i++;
2918 continue;
2920 wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2921 wi::to_wide (part1->builtin->size));
2922 /* Only merge adjacent memset partitions. */
2923 if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2925 i++;
2926 continue;
2928 /* Merge partitions[i] and partitions[i+1]. */
2929 part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2930 part1->builtin->size,
2931 part2->builtin->size);
2932 partition_free (part2);
2933 partitions->ordered_remove (i + 1);
2937 void
2938 loop_distribution::finalize_partitions (class loop *loop,
2939 vec<struct partition *> *partitions,
2940 vec<ddr_p> *alias_ddrs)
2942 unsigned i;
2943 struct partition *partition, *a;
2945 if (partitions->length () == 1
2946 || alias_ddrs->length () > 0)
2947 return;
2949 unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2950 bool same_type_p = true;
2951 enum partition_type type = ((*partitions)[0])->type;
2952 for (i = 0; partitions->iterate (i, &partition); ++i)
2954 same_type_p &= (type == partition->type);
2955 if (partition_builtin_p (partition))
2957 num_builtin++;
2958 continue;
2960 num_normal++;
2961 if (partition->kind == PKIND_PARTIAL_MEMSET)
2962 num_partial_memset++;
2965 /* Don't distribute the current loop into too many loops given we don't have
2966 a memory stream cost model. Be even more conservative in case of loop
2967 nest distribution. */
2968 if ((same_type_p && num_builtin == 0
2969 && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2970 || (loop->inner != NULL
2971 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2972 || (loop->inner == NULL
2973 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2975 a = (*partitions)[0];
2976 for (i = 1; partitions->iterate (i, &partition); ++i)
2978 partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2979 partition_free (partition);
2981 partitions->truncate (1);
2984 /* Fuse memset builtins if possible. */
2985 if (partitions->length () > 1)
2986 fuse_memset_builtins (partitions);
2989 /* Distributes the code from LOOP in such a way that producer statements
2990 are placed before consumer statements. Tries to separate only the
2991 statements from STMTS into separate loops. Returns the number of
2992 distributed loops. Set NB_CALLS to number of generated builtin calls.
2993 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2996 loop_distribution::distribute_loop (class loop *loop,
2997 const vec<gimple *> &stmts,
2998 control_dependences *cd, int *nb_calls, bool *destroy_p,
2999 bool only_patterns_p)
3001 ddrs_table = new hash_table<ddr_hasher> (389);
3002 struct graph *rdg;
3003 partition *partition;
3004 int i, nbp;
3006 *destroy_p = false;
3007 *nb_calls = 0;
3008 loop_nest.create (0);
3009 if (!find_loop_nest (loop, &loop_nest))
3011 loop_nest.release ();
3012 delete ddrs_table;
3013 return 0;
3016 datarefs_vec.create (20);
3017 has_nonaddressable_dataref_p = false;
3018 rdg = build_rdg (loop, cd);
3019 if (!rdg)
3021 if (dump_file && (dump_flags & TDF_DETAILS))
3022 fprintf (dump_file,
3023 "Loop %d not distributed: failed to build the RDG.\n",
3024 loop->num);
3026 loop_nest.release ();
3027 free_data_refs (datarefs_vec);
3028 delete ddrs_table;
3029 return 0;
3032 if (datarefs_vec.length () > MAX_DATAREFS_NUM)
3034 if (dump_file && (dump_flags & TDF_DETAILS))
3035 fprintf (dump_file,
3036 "Loop %d not distributed: too many memory references.\n",
3037 loop->num);
3039 free_rdg (rdg);
3040 loop_nest.release ();
3041 free_data_refs (datarefs_vec);
3042 delete ddrs_table;
3043 return 0;
3046 data_reference_p dref;
3047 for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
3048 dref->aux = (void *) (uintptr_t) i;
3050 if (dump_file && (dump_flags & TDF_DETAILS))
3051 dump_rdg (dump_file, rdg);
3053 auto_vec<struct partition *, 3> partitions;
3054 rdg_build_partitions (rdg, stmts, &partitions);
3056 auto_vec<ddr_p> alias_ddrs;
3058 auto_bitmap stmt_in_all_partitions;
3059 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
3060 for (i = 1; partitions.iterate (i, &partition); ++i)
3061 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
3063 bool any_builtin = false;
3064 bool reduction_in_all = false;
3065 int reduction_partition_num = -1;
3066 FOR_EACH_VEC_ELT (partitions, i, partition)
3068 reduction_in_all
3069 |= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
3070 any_builtin |= partition_builtin_p (partition);
3073 /* If we are only distributing patterns but did not detect any,
3074 simply bail out. */
3075 if (only_patterns_p
3076 && !any_builtin)
3078 nbp = 0;
3079 goto ldist_done;
3082 /* If we are only distributing patterns, fuse all partitions that
3083 were not classified as builtins. This also avoids chopping
3084 a loop into pieces separated by builtin calls. That is, we
3085 only want no loop body or a single loop body remaining. */
3086 struct partition *into;
3087 if (only_patterns_p)
3089 for (i = 0; partitions.iterate (i, &into); ++i)
3090 if (!partition_builtin_p (into))
3091 break;
3092 for (++i; partitions.iterate (i, &partition); ++i)
3093 if (!partition_builtin_p (partition))
3095 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
3096 partitions.unordered_remove (i);
3097 partition_free (partition);
3098 i--;
3102 /* Due to limitations in the transform phase we have to fuse all
3103 reduction partitions into the last partition so the existing
3104 loop will contain all loop-closed PHI nodes. */
3105 for (i = 0; partitions.iterate (i, &into); ++i)
3106 if (partition_reduction_p (into))
3107 break;
3108 for (i = i + 1; partitions.iterate (i, &partition); ++i)
3109 if (partition_reduction_p (partition))
3111 partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
3112 partitions.unordered_remove (i);
3113 partition_free (partition);
3114 i--;
3117 /* Apply our simple cost model - fuse partitions with similar
3118 memory accesses. */
3119 for (i = 0; partitions.iterate (i, &into); ++i)
3121 bool changed = false;
3122 for (int j = i + 1; partitions.iterate (j, &partition); ++j)
3124 if (share_memory_accesses (rdg, into, partition))
3126 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
3127 partitions.unordered_remove (j);
3128 partition_free (partition);
3129 j--;
3130 changed = true;
3133 /* If in step 1 we fused 0 1 2 into "0,2 1" because 0 and 2 have similar
3134 accesses, and 1 and 2 have similar accesses but 0 and 1 do not,
3135 then in the next iteration we will fail to consider merging
3136 1 into 0,2. So try again if we did any merging into 0. */
3137 if (changed)
3138 i--;
3141 /* Put a non-builtin partition last if we need to preserve a reduction.
3142 In most cases this helps to keep a normal partition last, avoiding
3143 spilling a reduction result across builtin calls.
3144 ??? The proper way would be to use dependences to see whether we
3145 can move builtin partitions earlier during merge_dep_scc_partitions
3146 and its sort_partitions_by_post_order. Especially when the
3147 dependence graph is composed of multiple independent subgraphs the
3148 heuristic does not work reliably. */
3149 if (reduction_in_all
3150 && partition_builtin_p (partitions.last()))
3151 FOR_EACH_VEC_ELT (partitions, i, partition)
3152 if (!partition_builtin_p (partition))
3154 partitions.unordered_remove (i);
3155 partitions.quick_push (partition);
3156 break;
3159 /* Build the partition dependency graph and fuse partitions in strongly
3160 connected components. */
3161 if (partitions.length () > 1)
3163 /* Don't support loop nest distribution under runtime alias checks
3164 since it's not likely to enable many vectorization opportunities.
3165 The same applies if the loop has any data reference which may not be
3166 addressable, since the alias check needs to take and compare the address of the object. */
3167 if (loop->inner || has_nonaddressable_dataref_p)
3168 merge_dep_scc_partitions (rdg, &partitions, false);
3169 else
3171 merge_dep_scc_partitions (rdg, &partitions, true);
3172 if (partitions.length () > 1)
3173 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
3177 finalize_partitions (loop, &partitions, &alias_ddrs);
3179 /* If there is a reduction in all partitions make sure the last
3180 non-builtin partition provides the LC PHI defs. */
3181 if (reduction_in_all)
3183 FOR_EACH_VEC_ELT (partitions, i, partition)
3184 if (!partition_builtin_p (partition))
3185 reduction_partition_num = i;
3186 if (reduction_partition_num == -1)
3188 /* If all partitions are builtins, force the last one to
3189 be code generated as a normal partition. */
3190 partition = partitions.last ();
3191 partition->kind = PKIND_NORMAL;
3195 nbp = partitions.length ();
3196 if (nbp == 0
3197 || (nbp == 1 && !partition_builtin_p (partitions[0]))
3198 || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
3200 nbp = 0;
3201 goto ldist_done;
3204 if (version_for_distribution_p (&partitions, &alias_ddrs))
3205 version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
3207 if (dump_file && (dump_flags & TDF_DETAILS))
3209 fprintf (dump_file,
3210 "distribute loop <%d> into partitions:\n", loop->num);
3211 dump_rdg_partitions (dump_file, partitions);
3214 FOR_EACH_VEC_ELT (partitions, i, partition)
3216 if (partition_builtin_p (partition))
3217 (*nb_calls)++;
3218 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1,
3219 i == reduction_partition_num);
3222 ldist_done:
3223 loop_nest.release ();
3224 free_data_refs (datarefs_vec);
3225 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
3226 iter != ddrs_table->end (); ++iter)
3228 free_dependence_relation (*iter);
3229 *iter = NULL;
3231 delete ddrs_table;
3233 FOR_EACH_VEC_ELT (partitions, i, partition)
3234 partition_free (partition);
3236 free_rdg (rdg);
3237 return nbp - *nb_calls;
3241 void loop_distribution::bb_top_order_init (void)
3243 int rpo_num;
3244 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS);
3245 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3246 bitmap exit_bbs = BITMAP_ALLOC (NULL);
3248 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3249 bb_top_order_index_size = last_basic_block_for_fn (cfun);
3251 entry->flags &= ~EDGE_DFS_BACK;
3252 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
3253 rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true,
3254 rpo, NULL);
3255 BITMAP_FREE (exit_bbs);
3257 for (int i = 0; i < rpo_num; i++)
3258 bb_top_order_index[rpo[i]] = i;
3260 free (rpo);
3263 void loop_distribution::bb_top_order_destroy ()
3265 free (bb_top_order_index);
3266 bb_top_order_index = NULL;
3267 bb_top_order_index_size = 0;
3271 /* Given LOOP, this function records seed statements for distribution in
3272 WORK_LIST. Return false if there is nothing for distribution. */
3274 static bool
3275 find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list)
3277 basic_block *bbs = get_loop_body_in_dom_order (loop);
3279 /* Initialize the worklist with stmts we seed the partitions with. */
3280 for (unsigned i = 0; i < loop->num_nodes; ++i)
3282 /* In irreducible sub-regions we don't know how to redirect
3283 conditions, so fail. See PR100492. */
3284 if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
3286 if (dump_file && (dump_flags & TDF_DETAILS))
3287 fprintf (dump_file, "loop %d contains an irreducible region.\n",
3288 loop->num);
3289 work_list->truncate (0);
3290 break;
3292 for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
3293 !gsi_end_p (gsi); gsi_next (&gsi))
3295 gphi *phi = gsi.phi ();
3296 if (virtual_operand_p (gimple_phi_result (phi)))
3297 continue;
3298 /* Distribute stmts which have defs that are used outside of
3299 the loop. */
3300 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
3301 continue;
3302 work_list->safe_push (phi);
3304 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3305 !gsi_end_p (gsi); gsi_next (&gsi))
3307 gimple *stmt = gsi_stmt (gsi);
3309 /* Ignore clobbers, they do not have true side effects. */
3310 if (gimple_clobber_p (stmt))
3311 continue;
3313 /* If there is a stmt with side-effects bail out - we
3314 cannot and should not distribute this loop. */
3315 if (gimple_has_side_effects (stmt))
3317 free (bbs);
3318 return false;
3321 /* Distribute stmts which have defs that are used outside of
3322 the loop. */
3323 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3325 /* Otherwise only distribute stores for now. */
3326 else if (!gimple_vdef (stmt))
3327 continue;
3329 work_list->safe_push (stmt);
3332 bool res = work_list->length () > 0;
3333 if (res && !can_copy_bbs_p (bbs, loop->num_nodes))
3335 if (dump_file && (dump_flags & TDF_DETAILS))
3336 fprintf (dump_file, "cannot copy loop %d.\n", loop->num);
3337 res = false;
3339 free (bbs);
3340 return res;
3343 /* A helper function for generate_{rawmemchr,strlen}_builtin functions in order
3344 to place new statements SEQ before LOOP and replace the old reduction
3345 variable with the new one. */
3347 static void
3348 generate_reduction_builtin_1 (loop_p loop, gimple_seq &seq,
3349 tree reduction_var_old, tree reduction_var_new,
3350 const char *info, machine_mode load_mode)
3352 gcc_assert (flag_tree_loop_distribute_patterns);
3354 /* Place new statements before LOOP. */
3355 gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
3356 gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING);
3358 /* Replace old reduction variable with new one. */
3359 imm_use_iterator iter;
3360 gimple *stmt;
3361 use_operand_p use_p;
3362 FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var_old)
3364 FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
3365 SET_USE (use_p, reduction_var_new);
3367 update_stmt (stmt);
3370 if (dump_file && (dump_flags & TDF_DETAILS))
3371 fprintf (dump_file, info, GET_MODE_NAME (load_mode));
3374 /* Generate a call to rawmemchr and place it before LOOP. REDUCTION_VAR is
3375 replaced with a fresh SSA name representing the result of the call. */
3377 static void
3378 generate_rawmemchr_builtin (loop_p loop, tree reduction_var,
3379 data_reference_p store_dr, tree base, tree pattern,
3380 location_t loc)
3382 gimple_seq seq = NULL;
3384 tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
3385 gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern);
3386 tree reduction_var_new = copy_ssa_name (reduction_var);
3387 gimple_call_set_lhs (fn_call, reduction_var_new);
3388 gimple_set_location (fn_call, loc);
3389 gimple_seq_add_stmt (&seq, fn_call);
3391 if (store_dr)
3393 gassign *g = gimple_build_assign (DR_REF (store_dr), reduction_var_new);
3394 gimple_seq_add_stmt (&seq, g);
3397 generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new,
3398 "generated rawmemchr%s\n",
3399 TYPE_MODE (TREE_TYPE (TREE_TYPE (base))));
3402 /* Helper function for generate_strlen_builtin(,_using_rawmemchr) */
3404 static void
3405 generate_strlen_builtin_1 (loop_p loop, gimple_seq &seq,
3406 tree reduction_var_old, tree reduction_var_new,
3407 machine_mode mode, tree start_len)
3409 /* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be
3410 converted if the types of the old and new reduction variables are not compatible. */
3411 reduction_var_new = gimple_convert (&seq, TREE_TYPE (reduction_var_old),
3412 reduction_var_new);
3414 /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start
3415 length. */
3416 if (!integer_zerop (start_len))
3418 tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new));
3419 gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new,
3420 start_len);
3421 gimple_seq_add_stmt (&seq, g);
3422 reduction_var_new = lhs;
3425 generate_reduction_builtin_1 (loop, seq, reduction_var_old, reduction_var_new,
3426 "generated strlen%s\n", mode);
3429 /* Generate a call to strlen and place it before LOOP. REDUCTION_VAR is
3430 replaced with a fresh SSA name representing the result of the call. */
3432 static void
3433 generate_strlen_builtin (loop_p loop, tree reduction_var, tree base,
3434 tree start_len, location_t loc)
3436 gimple_seq seq = NULL;
3438 tree reduction_var_new = make_ssa_name (size_type_node);
3440 tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
3441 tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN));
3442 gimple *fn_call = gimple_build_call (fn, 1, mem);
3443 gimple_call_set_lhs (fn_call, reduction_var_new);
3444 gimple_set_location (fn_call, loc);
3445 gimple_seq_add_stmt (&seq, fn_call);
3447 generate_strlen_builtin_1 (loop, seq, reduction_var, reduction_var_new,
3448 QImode, start_len);
3451 /* Generate code in order to mimic the behaviour of strlen but this time over
3452 an array of elements with a mode different from QI. REDUCTION_VAR is replaced
3453 with a fresh SSA name representing the result, i.e., the length. */
3455 static void
3456 generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var,
3457 tree base, tree load_type,
3458 tree start_len, location_t loc)
3460 gimple_seq seq = NULL;
3462 tree start = force_gimple_operand (base, &seq, true, NULL_TREE);
3463 tree zero = build_zero_cst (load_type);
3464 gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, start, zero);
3465 tree end = make_ssa_name (TREE_TYPE (base));
3466 gimple_call_set_lhs (fn_call, end);
3467 gimple_set_location (fn_call, loc);
3468 gimple_seq_add_stmt (&seq, fn_call);
3470 /* Determine the number of elements between START and END by
3471 evaluating (END - START) / sizeof (*START). */
3472 tree diff = make_ssa_name (ptrdiff_type_node);
3473 gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, start);
3474 gimple_seq_add_stmt (&seq, diff_stmt);
3475 /* Let SIZE be the size of each character. */
3476 tree size = gimple_convert (&seq, ptrdiff_type_node,
3477 TYPE_SIZE_UNIT (load_type));
3478 tree count = make_ssa_name (ptrdiff_type_node);
3479 gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, size);
3480 gimple_seq_add_stmt (&seq, count_stmt);
3482 generate_strlen_builtin_1 (loop, seq, reduction_var, count,
3483 TYPE_MODE (load_type),
3484 start_len);
3487 /* Return true if we can count at least as many characters by taking pointer
3488 difference as we can count via reduction_var without an overflow. Thus
3489 compute 2^n < (2^(m-1) / s) where n = TYPE_PRECISION (reduction_var_type),
3490 m = TYPE_PRECISION (ptrdiff_type_node), and s = size of each character. */
3491 static bool
3492 reduction_var_overflows_first (tree reduction_var_type, tree load_type)
3494 widest_int n2 = wi::lshift (1, TYPE_PRECISION (reduction_var_type));
3495 widest_int m2 = wi::lshift (1, TYPE_PRECISION (ptrdiff_type_node) - 1);
3496 widest_int s = wi::to_widest (TYPE_SIZE_UNIT (load_type));
3497 return wi::ltu_p (n2, wi::udiv_trunc (m2, s));
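/* Worked example (illustrative): with a 32-bit reduction variable, a 64-bit
   ptrdiff_type_node and 4-byte characters, the test above checks
   2^32 < 2^63 / 4 = 2^61, which holds, so counting via pointer difference
   cannot overflow before the reduction variable does.  */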
3500 static gimple *
3501 determine_reduction_stmt_1 (const loop_p loop, const basic_block *bbs)
3503 gimple *reduction_stmt = NULL;
3505 for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i)
3507 basic_block bb = bbs[i];
3509 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
3510 gsi_next_nondebug (&bsi))
3512 gphi *phi = bsi.phi ();
3513 if (virtual_operand_p (gimple_phi_result (phi)))
3514 continue;
3515 if (stmt_has_scalar_dependences_outside_loop (loop, phi))
3517 if (reduction_stmt)
3518 return NULL;
3519 reduction_stmt = phi;
3523 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
3524 gsi_next_nondebug (&bsi), ++ninsns)
3526 /* Bail out early for loops which are unlikely to match. */
3527 if (ninsns > 16)
3528 return NULL;
3529 gimple *stmt = gsi_stmt (bsi);
3530 if (gimple_clobber_p (stmt))
3531 continue;
3532 if (gimple_code (stmt) == GIMPLE_LABEL)
3533 continue;
3534 if (gimple_has_volatile_ops (stmt))
3535 return NULL;
3536 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3538 if (reduction_stmt)
3539 return NULL;
3540 reduction_stmt = stmt;
3545 return reduction_stmt;
3548 /* If LOOP has a single non-volatile reduction statement, then return a pointer
3549 to it. Otherwise return NULL. */
3550 static gimple *
3551 determine_reduction_stmt (const loop_p loop)
3553 basic_block *bbs = get_loop_body (loop);
3554 gimple *reduction_stmt = determine_reduction_stmt_1 (loop, bbs);
3555 XDELETEVEC (bbs);
3556 return reduction_stmt;
3559 /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
3560 replace them accordingly. For example, a loop of the form
3562 for (; *p != 42; ++p);
3564 is replaced by
3566 p = rawmemchr<MODE> (p, 42);
3568 under the assumption that rawmemchr is available for a particular MODE.
3569 Another example is
3571 int i;
3572 for (i = 42; s[i]; ++i);
3574 which is replaced by
3576 i = (int)strlen (&s[42]) + 42;
3578 for some character array S. In case array S is not of type character array
3579 we end up with
3581 i = (int)(rawmemchr<MODE> (&s[42], 0) - &s[42]) + 42;
3583 assuming that rawmemchr is available for a particular MODE. */
3585 bool
3586 loop_distribution::transform_reduction_loop (loop_p loop)
3588 gimple *reduction_stmt;
3589 data_reference_p load_dr = NULL, store_dr = NULL;
3591 edge e = single_exit (loop);
3592 gcond *cond = safe_dyn_cast <gcond *> (*gsi_last_bb (e->src));
3593 if (!cond)
3594 return false;
3595 /* Ensure loop condition is an (in)equality test and loop is exited either if
3596 the inequality test fails or the equality test succeeds. */
3597 if (!(e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == NE_EXPR)
3598 && !(e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == EQ_EXPR))
3599 return false;
3600 /* A limitation of the current implementation is that we only support
3601 constant patterns in (in)equality tests. */
3602 tree pattern = gimple_cond_rhs (cond);
3603 if (TREE_CODE (pattern) != INTEGER_CST)
3604 return false;
3606 reduction_stmt = determine_reduction_stmt (loop);
3608 /* A limitation of the current implementation is that we require a reduction
3609 statement. Therefore, loops without a reduction statement as in the
3610 following are not recognized:
3611 int *p;
3612 void foo (void) { for (; *p; ++p); } */
3613 if (reduction_stmt == NULL)
3614 return false;
3616 /* Reduction variables are guaranteed to be SSA names. */
3617 tree reduction_var;
3618 switch (gimple_code (reduction_stmt))
3620 case GIMPLE_ASSIGN:
3621 case GIMPLE_PHI:
3622 reduction_var = gimple_get_lhs (reduction_stmt);
3623 break;
3624 default:
3625 /* Bail out e.g. for GIMPLE_CALL. */
3626 return false;
3629 struct graph *rdg = build_rdg (loop, NULL);
3630 if (rdg == NULL)
3632 if (dump_file && (dump_flags & TDF_DETAILS))
3633 fprintf (dump_file,
3634 "Loop %d not transformed: failed to build the RDG.\n",
3635 loop->num);
3637 return false;
3639 auto_bitmap partition_stmts;
3640 bitmap_set_range (partition_stmts, 0, rdg->n_vertices);
3641 find_single_drs (loop, rdg, partition_stmts, &store_dr, &load_dr);
3642 free_rdg (rdg);
3644 /* Bail out if there is no single load. */
3645 if (load_dr == NULL)
3646 return false;

  /* Reaching this point we have a loop with a single reduction variable,
     a single load, and an optional single store.  */

  tree load_ref = DR_REF (load_dr);
  tree load_type = TREE_TYPE (load_ref);
  tree load_access_base = build_fold_addr_expr (load_ref);
  tree load_access_size = TYPE_SIZE_UNIT (load_type);
  affine_iv load_iv, reduction_iv;

  if (!INTEGRAL_TYPE_P (load_type)
      || !type_has_mode_precision_p (load_type))
    return false;

  /* We already ensured that the loop condition tests for (in)equality where
     the rhs is a constant pattern.  Now ensure that the lhs is the result of
     the load.  */
  if (gimple_cond_lhs (cond) != gimple_assign_lhs (DR_STMT (load_dr)))
    return false;

  /* Bail out if no affine induction variable with constant step can be
     determined.  */
  if (!simple_iv (loop, loop, load_access_base, &load_iv, false))
    return false;

  /* Bail out if memory accesses are not consecutive or not growing.  */
  if (!operand_equal_p (load_iv.step, load_access_size, 0))
    return false;
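  /* In other words, the access pointer must advance by exactly one element
     per iteration.  A strided loop such as

       for (; *p != 42; p += 2);

     has a step of 2 but an element size of 1, so it is not recognized.  */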

  if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false))
    return false;

  /* Handle rawmemchr like loops.  */
  if (operand_equal_p (load_iv.base, reduction_iv.base)
      && operand_equal_p (load_iv.step, reduction_iv.step))
    {
      if (store_dr)
	{
	  /* Ensure that we store to X and load from X+I where I>0.  */
	  if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR
	      || !integer_onep (TREE_OPERAND (load_iv.base, 1)))
	    return false;
	  tree ptr_base = TREE_OPERAND (load_iv.base, 0);
	  if (TREE_CODE (ptr_base) != SSA_NAME)
	    return false;
	  gimple *def = SSA_NAME_DEF_STMT (ptr_base);
	  if (!gimple_assign_single_p (def)
	      || gimple_assign_rhs1 (def) != DR_REF (store_dr))
	    return false;
	  /* Ensure that the reduction value is stored.  */
	  if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var)
	    return false;
	}
      /* Bail out if the target does not provide rawmemchr for a certain
	 mode.  */
      machine_mode mode = TYPE_MODE (load_type);
      if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing)
	return false;
      location_t loc = gimple_location (DR_STMT (load_dr));
      generate_rawmemchr_builtin (loop, reduction_var, store_dr, load_iv.base,
				  pattern, loc);
      return true;
    }

  /* Handle strlen like loops.  */
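  /* The checks below match the shape produced by a source loop such as

       for (i = 42; s[i]; ++i);

     i.e. the reduction variable starts from an integer constant, is
     incremented by one each iteration, the searched-for pattern is zero, and
     nothing is stored back to memory.  */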
  if (store_dr == NULL
      && integer_zerop (pattern)
      && INTEGRAL_TYPE_P (TREE_TYPE (reduction_var))
      && TREE_CODE (reduction_iv.base) == INTEGER_CST
      && TREE_CODE (reduction_iv.step) == INTEGER_CST
      && integer_onep (reduction_iv.step))
    {
      location_t loc = gimple_location (DR_STMT (load_dr));
      tree reduction_var_type = TREE_TYPE (reduction_var);
      /* While determining the length of a string an overflow might occur.
	 If an overflow only occurs in the loop implementation and not in the
	 strlen implementation, then either the overflow is undefined or the
	 truncated result of strlen equals the one of the loop.  Otherwise, if
	 an overflow may also occur in the strlen implementation, then
	 replacing a loop by a call to strlen is sound whenever we ensure that
	 if an overflow occurs in the strlen implementation, then an overflow
	 also occurs in the loop implementation, which is undefined.  It seems
	 reasonable to relax this and assume that the strlen implementation
	 cannot overflow in case sizetype is big enough in the sense that an
	 overflow can only happen for string objects which are bigger than
	 half of the address space; at least for 32-bit targets and up.

	 For strlen which makes use of rawmemchr the maximal length of a
	 string which can be determined without an overflow is PTRDIFF_MAX / S
	 where each character has size S.  Since an overflow for the ptrdiff
	 type is undefined, we have to make sure that if an overflow occurs,
	 then an overflow occurs in the loop implementation, too, and this is
	 undefined, too.  As before, we relax this and assume that no string
	 object is larger than half of the address space; at least for 32-bit
	 targets and up.  */
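      /* As a concrete illustration of the above: with 8-bit characters and a
	 64-bit ptrdiff type, the rawmemchr-based length computation can only
	 overflow for strings longer than PTRDIFF_MAX bytes, i.e. for objects
	 covering more than half of the address space, which the relaxation
	 above rules out.  */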
      if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node)
	  && TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node)
	  && ((TYPE_PRECISION (sizetype) >= TYPE_PRECISION (ptr_type_node) - 1
	       && TYPE_PRECISION (ptr_type_node) >= 32)
	      || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
		  && (TYPE_PRECISION (reduction_var_type)
		      <= TYPE_PRECISION (sizetype))))
	  && builtin_decl_implicit (BUILT_IN_STRLEN))
	generate_strlen_builtin (loop, reduction_var, load_iv.base,
				 reduction_iv.base, loc);
      else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type))
	       != CODE_FOR_nothing
	       && ((TYPE_PRECISION (ptrdiff_type_node)
		      == TYPE_PRECISION (ptr_type_node)
		    && TYPE_PRECISION (ptrdiff_type_node) >= 32)
		   || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
		       && reduction_var_overflows_first (reduction_var_type,
							 load_type))))
	generate_strlen_builtin_using_rawmemchr (loop, reduction_var,
						 load_iv.base,
						 load_type,
						 reduction_iv.base, loc);
      else
	return false;
      return true;
    }

  return false;
}

/* Given innermost LOOP, return the outermost enclosing loop that forms a
   perfect loop nest.  */
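/* For instance, given

     for (i = 0; i < n; ++i)
       for (j = 0; j < m; ++j)
	 a[i][j] = 0;

   calling this on the innermost J loop returns the enclosing I loop, provided
   the outer loop has a single exit and a computable number of latch
   executions.  */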
static class loop *
prepare_perfect_loop_nest (class loop *loop)
{
  class loop *outer = loop_outer (loop);
  tree niters = number_of_latch_executions (loop);

  /* TODO: We only support the innermost 3-level loop nest distribution
     because of compilation time issues for now.  This should be relaxed
     in the future.  Note we only allow 3-level loop nest distribution
     when parallelizing loops.  */
  while ((loop->inner == NULL
	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
	 && loop_outer (outer)
	 && outer->inner == loop && loop->next == NULL
	 && single_exit (outer)
	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
	 && niters != chrec_dont_know)
    {
      loop = outer;
      outer = loop_outer (loop);
    }

  return loop;
}

unsigned int
loop_distribution::execute (function *fun)
{
  bool changed = false;
  basic_block bb;
  control_dependences *cd = NULL;
  auto_vec<loop_p> loops_to_be_destroyed;

  if (number_of_loops (fun) <= 1)
    return 0;

  bb_top_order_init ();

  FOR_ALL_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
    }

  /* We can at the moment only distribute non-nested loops, thus restrict
     walking to innermost loops.  */
  for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
    {
      /* Don't distribute loops with multiple exit edges, or cold loops when
	 not doing pattern detection.  */
      if (!single_exit (loop)
	  || (!flag_tree_loop_distribute_patterns
	      && !optimize_loop_for_speed_p (loop)))
	continue;

      /* If niters is unknown, don't distribute the loop but rather try to
	 transform it into a call to a builtin.  */
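      /* A strlen-like loop such as "for (i = 0; s[i]; ++i);" is a typical
	 example: its exit is data dependent, so the latch execution count is
	 unknown, yet it may still be recognized by transform_reduction_loop
	 below.  */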
      tree niters = number_of_latch_executions (loop);
      if (niters == NULL_TREE || niters == chrec_dont_know)
	{
	  datarefs_vec.create (20);
	  if (flag_tree_loop_distribute_patterns
	      && transform_reduction_loop (loop))
	    {
	      changed = true;
	      loops_to_be_destroyed.safe_push (loop);
	      if (dump_enabled_p ())
		{
		  dump_user_location_t loc = find_loop_location (loop);
		  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				   loc, "Loop %d transformed into a builtin.\n",
				   loop->num);
		}
	    }
	  free_data_refs (datarefs_vec);
	  continue;
	}

      /* Get the perfect loop nest for distribution.  */
      loop = prepare_perfect_loop_nest (loop);
      for (; loop; loop = loop->inner)
	{
	  auto_vec<gimple *> work_list;
	  if (!find_seed_stmts_for_distribution (loop, &work_list))
	    continue;

	  const char *str = loop->inner ? " nest" : "";
	  dump_user_location_t loc = find_loop_location (loop);
	  if (!cd)
	    {
	      calculate_dominance_info (CDI_DOMINATORS);
	      calculate_dominance_info (CDI_POST_DOMINATORS);
	      cd = new control_dependences ();
	      free_dominance_info (CDI_POST_DOMINATORS);
	    }

	  bool destroy_p;
	  int nb_generated_loops, nb_generated_calls;
	  bool only_patterns = !optimize_loop_for_speed_p (loop)
			       || !flag_tree_loop_distribution;
	  /* Do not try to distribute loops that are not expected to
	     iterate.  */
	  if (!only_patterns)
	    {
	      HOST_WIDE_INT iterations = estimated_loop_iterations_int (loop);
	      if (iterations < 0)
		iterations = likely_max_loop_iterations_int (loop);
	      if (!iterations)
		only_patterns = true;
	    }

	  nb_generated_loops
	    = distribute_loop (loop, work_list, cd, &nb_generated_calls,
			       &destroy_p, only_patterns);
	  if (destroy_p)
	    loops_to_be_destroyed.safe_push (loop);

	  if (nb_generated_loops + nb_generated_calls > 0)
	    {
	      changed = true;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				 loc, "Loop%s %d distributed: split to %d loops "
				 "and %d library calls.\n", str, loop->num,
				 nb_generated_loops, nb_generated_calls);

	      break;
	    }

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
	}
    }

  if (cd)
    delete cd;

  if (bb_top_order_index != NULL)
    bb_top_order_destroy ();

  if (changed)
    {
      /* Destroy loop bodies that could not be reused.  Do this late as we
	 otherwise can end up referring to stale data in control
	 dependences.  */
      unsigned i;
      class loop *loop;
      FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
	destroy_loop (loop);

      /* Cached scalar evolutions may now refer to wrong or non-existing
	 loops.  */
      scev_reset ();
      mark_virtual_operands_for_renaming (fun);
      rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
    }

  checking_verify_loop_structure ();

  return changed ? TODO_cleanup_cfg : 0;
}

/* Distribute all loops in the current function.  */

namespace {

const pass_data pass_data_loop_distribution =
{
  GIMPLE_PASS, /* type */
  "ldist", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_distribution : public gimple_opt_pass
{
public:
  pass_loop_distribution (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_distribution, ctxt)
  {}

  /* opt_pass methods: */
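  /* The pass is gated on either of the options corresponding to
     flag_tree_loop_distribution (-ftree-loop-distribution) or
     flag_tree_loop_distribute_patterns (-ftree-loop-distribute-patterns)
     being enabled.  */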
  bool gate (function *) final override
    {
      return flag_tree_loop_distribution
	|| flag_tree_loop_distribute_patterns;
    }

  unsigned int execute (function *) final override;

}; // class pass_loop_distribution

unsigned int
pass_loop_distribution::execute (function *fun)
{
  return loop_distribution ().execute (fun);
}

} // anon namespace

gimple_opt_pass *
make_pass_loop_distribution (gcc::context *ctxt)
{
  return new pass_loop_distribution (ctxt);
}