gcc/tree-loop-distribution.c
/* Loop distribution.
   Copyright (C) 2006-2013 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |    A(I) = B(I) + C
   |    D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |   A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |   D(I) = A(I-1)*E
   |ENDDO

   This pass uses an RDG (Reduced Dependence Graph), built on top of
   the data dependence relations.  The RDG is then topologically
   sorted to obtain a map of information producers/consumers, based
   on which it generates the new loops.  */
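
/* As an editorial illustration (not from the original sources), the C
   equivalent of the transformation above is:

   | for (i = 1; i < n; i++)
   |   {
   |     a[i] = b[i] + c;
   |     d[i] = a[i - 1] * e;
   |   }

   which is distributed into two independent loops:

   | for (i = 1; i < n; i++)
   |   a[i] = b[i] + c;
   |
   | for (i = 1; i < n; i++)
   |   d[i] = a[i - 1] * e;

   In addition, with -ftree-loop-distribute-patterns, a partition that
   matches a known idiom is replaced by a builtin call: for example a
   loop that only zeroes an array becomes a call to memset (see
   classify_partition and generate_memset_builtin below).  */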
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tree-flow.h"
#include "cfgloop.h"
#include "tree-chrec.h"
#include "tree-data-ref.h"
#include "tree-scalar-evolution.h"
#include "tree-pass.h"
enum partition_kind { PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY };

typedef struct partition_s
{
  bitmap stmts;
  bool has_writes;
  enum partition_kind kind;
  /* Data references a kind != PKIND_NORMAL partition is about.  */
  data_reference_p main_dr;
  data_reference_p secondary_dr;
} *partition_t;
/* Allocate and initialize a partition from BITMAP.  */

static partition_t
partition_alloc (bitmap stmts)
{
  partition_t partition = XCNEW (struct partition_s);
  partition->stmts = stmts ? stmts : BITMAP_ALLOC (NULL);
  partition->has_writes = false;
  partition->kind = PKIND_NORMAL;
  return partition;
}
/* Free PARTITION.  */

static void
partition_free (partition_t partition)
{
  BITMAP_FREE (partition->stmts);
  free (partition);
}
/* Returns true if the partition can be generated as a builtin.  */

static bool
partition_builtin_p (partition_t partition)
{
  return partition->kind != PKIND_NORMAL;
}
/* Returns true if the partition has any memory writes.  */

static bool
partition_has_writes (partition_t partition)
{
  return partition->has_writes;
}
/* If bit I is not set, it means that this node represents an
   operation that has already been performed, and that should not be
   performed again.  This is the subgraph of remaining important
   computations that is passed to the DFS algorithm, to avoid
   including the same stores several times in different loops.  */
static bitmap remaining_stmts;

/* A node of the RDG is marked in this bitmap when it has as a
   predecessor a node that writes to memory.  */
static bitmap upstream_mem_writes;
/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
   the LOOP.  */

static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (!is_gimple_debug (use_stmt)
          && loop != loop_containing_stmt (use_stmt))
        return true;
    }

  return false;
}
/* Returns true when STMT defines a scalar variable used after the
   loop LOOP.  */

static bool
stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple stmt)
{
  def_operand_p def_p;
  ssa_op_iter op_iter;

  if (gimple_code (stmt) == GIMPLE_PHI)
    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);

  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
      return true;

  return false;
}
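
/* For instance (an editorial example), in

   | s = 0;
   | for (i = 0; i < n; i++)
   |   s += a[i];
   | use (s);

   the statement "s += a[i]" (and the loop-closing PHI for S) defines
   a scalar that is used after the loop, so this predicate holds for
   it.  Such statements disqualify a partition from being turned into
   a builtin in classify_partition below.  */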
/* Return a copy of LOOP, placed on the preheader edge immediately
   before LOOP itself.  */

static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
  gcc_assert (res != NULL);
  free_original_copy_tables ();
  delete_update_ssa ();

  return res;
}
/* Creates an empty basic block after LOOP.  */

static void
create_bb_after_loop (struct loop *loop)
{
  edge exit = single_exit (loop);

  if (!exit)
    return;

  split_edge (exit);
}
/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  */

static void
generate_loops_for_partition (struct loop *loop, partition_t partition,
                              bool copy_p)
{
  unsigned i, x;
  gimple_stmt_iterator bsi;
  basic_block *bbs;

  if (copy_p)
    {
      loop = copy_loop_before (loop);
      gcc_assert (loop != NULL);
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }

  /* Remove stmts not in the PARTITION bitmap.  The order in which we
     visit the phi nodes and the statements is exactly as in
     stmts_from_loop.  */
  bbs = get_loop_body_in_dom_order (loop);

  if (MAY_HAVE_DEBUG_STMTS)
    for (x = 0, i = 0; i < loop->num_nodes; i++)
      {
        basic_block bb = bbs[i];

        for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi); gsi_next (&bsi))
          if (!bitmap_bit_p (partition->stmts, x++))
            reset_debug_uses (gsi_stmt (bsi));

        for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
          {
            gimple stmt = gsi_stmt (bsi);
            if (gimple_code (stmt) != GIMPLE_LABEL
                && !is_gimple_debug (stmt)
                && !bitmap_bit_p (partition->stmts, x++))
              reset_debug_uses (stmt);
          }
      }

  for (x = 0, i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
        if (!bitmap_bit_p (partition->stmts, x++))
          {
            gimple phi = gsi_stmt (bsi);
            if (virtual_operand_p (gimple_phi_result (phi)))
              mark_virtual_phi_result_for_renaming (phi);
            remove_phi_node (&bsi, true);
          }
        else
          gsi_next (&bsi);

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
        {
          gimple stmt = gsi_stmt (bsi);
          if (gimple_code (stmt) != GIMPLE_LABEL
              && !is_gimple_debug (stmt)
              && !bitmap_bit_p (partition->stmts, x++))
            {
              unlink_stmt_vdef (stmt);
              gsi_remove (&bsi, true);
              release_defs (stmt);
            }
          else
            gsi_next (&bsi);
        }
    }

  free (bbs);
}
/* Build the size argument for a memory operation call.  */

static tree
build_size_arg_loc (location_t loc, data_reference_p dr, tree nb_iter)
{
  tree size;
  size = fold_build2_loc (loc, MULT_EXPR, sizetype,
                          fold_convert_loc (loc, sizetype, nb_iter),
                          TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
  return fold_convert_loc (loc, size_type_node, size);
}
/* Build an address argument for a memory operation call.  */

static tree
build_addr_arg_loc (location_t loc, data_reference_p dr, tree nb_bytes)
{
  tree addr_base;

  addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
  addr_base = fold_convert_loc (loc, sizetype, addr_base);

  /* Test for a negative stride, iterating over every element.  */
  if (tree_int_cst_sgn (DR_STEP (dr)) == -1)
    {
      addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
                                  fold_convert_loc (loc, sizetype, nb_bytes));
      addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
                                  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
    }

  return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base);
}
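
/* Editorial note on the negative-stride case above: the loop then
   walks the memory region downwards, so the lowest touched address is
   first-access-address - NB_BYTES + element-size.  For example,
   assuming 4-byte ints,

   | for (i = n - 1; i >= 0; i--)
   |   a[i] = 0;

   first accesses &a[n-1] and touches NB_BYTES = 4 * n bytes, so the
   region for memset starts at &a[n-1] - 4*n + 4 == &a[0].  */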
/* Generate a call to memset for PARTITION in LOOP.  */

static void
generate_memset_builtin (struct loop *loop, partition_t partition)
{
  gimple_stmt_iterator gsi;
  gimple stmt, fn_call;
  tree nb_iter, mem, fn, nb_bytes;
  location_t loc;
  tree val;

  stmt = DR_STMT (partition->main_dr);
  loc = gimple_location (stmt);
  if (gimple_bb (stmt) == loop->latch)
    nb_iter = number_of_latch_executions (loop);
  else
    nb_iter = number_of_exit_cond_executions (loop);

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = build_size_arg_loc (loc, partition->main_dr, nb_iter);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
                                       false, GSI_CONTINUE_LINKING);
  mem = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
                                  false, GSI_CONTINUE_LINKING);

  /* This exactly matches the pattern recognition in classify_partition.  */
  val = gimple_assign_rhs1 (stmt);
  if (integer_zerop (val)
      || real_zerop (val)
      || TREE_CODE (val) == CONSTRUCTOR)
    val = integer_zero_node;
  else if (integer_all_onesp (val))
    val = build_int_cst (integer_type_node, -1);
  else
    {
      if (TREE_CODE (val) == INTEGER_CST)
        val = fold_convert (integer_type_node, val);
      else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
        {
          gimple cstmt;
          tree tem = make_ssa_name (integer_type_node, NULL);
          cstmt = gimple_build_assign_with_ops (NOP_EXPR, tem, val, NULL_TREE);
          gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
          val = tem;
        }
    }

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "generated memset");
      if (integer_zerop (val))
        fprintf (dump_file, " zero\n");
      else if (integer_all_onesp (val))
        fprintf (dump_file, " minus one\n");
      else
        fprintf (dump_file, "\n");
    }
}
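
/* Editorial usage sketch: for a loop like

   | for (i = 0; i < n; i++)
   |   a[i] = 0;

   with 4-byte elements, the code above emits in the preheader
   something equivalent to

   | _nb_bytes = (size_t) n * 4;
   | memset (&a[0], 0, _nb_bytes);

   where _nb_bytes stands for the SSA temporary created by
   force_gimple_operand_gsi; the name is for exposition only.  */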
/* Generate a call to memcpy for PARTITION in LOOP.  */

static void
generate_memcpy_builtin (struct loop *loop, partition_t partition)
{
  gimple_stmt_iterator gsi;
  gimple stmt, fn_call;
  tree nb_iter, dest, src, fn, nb_bytes;
  location_t loc;
  enum built_in_function kind;

  stmt = DR_STMT (partition->main_dr);
  loc = gimple_location (stmt);
  if (gimple_bb (stmt) == loop->latch)
    nb_iter = number_of_latch_executions (loop);
  else
    nb_iter = number_of_exit_cond_executions (loop);

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = build_size_arg_loc (loc, partition->main_dr, nb_iter);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
                                       false, GSI_CONTINUE_LINKING);
  dest = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
  src = build_addr_arg_loc (loc, partition->secondary_dr, nb_bytes);
  if (ptr_derefs_may_alias_p (dest, src))
    kind = BUILT_IN_MEMMOVE;
  else
    kind = BUILT_IN_MEMCPY;

  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
                                   false, GSI_CONTINUE_LINKING);
  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
                                  false, GSI_CONTINUE_LINKING);
  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (kind == BUILT_IN_MEMCPY)
        fprintf (dump_file, "generated memcpy\n");
      else
        fprintf (dump_file, "generated memmove\n");
    }
}
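
/* Editorial usage sketch: an element-wise copy such as

   | for (i = 0; i < n; i++)
   |   a[i] = b[i];

   becomes a single call to memcpy, or to memmove when the
   ptr_derefs_may_alias_p check above cannot rule out that the source
   and destination regions overlap.  */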
/* Remove and destroy the loop LOOP.  */

static void
destroy_loop (struct loop *loop)
{
  unsigned nbbs = loop->num_nodes;
  edge exit = single_exit (loop);
  basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
  basic_block *bbs;
  unsigned i;

  bbs = get_loop_body_in_dom_order (loop);

  redirect_edge_pred (exit, src);
  exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
  exit->flags |= EDGE_FALLTHRU;
  cancel_loop_tree (loop);
  rescan_loop_exit (exit, false, true);

  for (i = 0; i < nbbs; i++)
    {
      /* We have made sure to not leave any dangling uses of SSA
         names defined in the loop.  With the exception of virtuals.
         Make sure we replace all uses of virtual defs that will remain
         outside of the loop with the bare symbol as delete_basic_block
         will release them.  */
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gimple phi = gsi_stmt (gsi);
          if (virtual_operand_p (gimple_phi_result (phi)))
            mark_virtual_phi_result_for_renaming (phi);
        }
      for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gimple stmt = gsi_stmt (gsi);
          tree vdef = gimple_vdef (stmt);
          if (vdef && TREE_CODE (vdef) == SSA_NAME)
            mark_virtual_operand_for_renaming (vdef);
        }
      delete_basic_block (bbs[i]);
    }
  free (bbs);

  set_immediate_dominator (CDI_DOMINATORS, dest,
                           recompute_dominator (CDI_DOMINATORS, dest));
}
/* Generates code for PARTITION.  */

static void
generate_code_for_partition (struct loop *loop,
                             partition_t partition, bool copy_p)
{
  switch (partition->kind)
    {
    case PKIND_MEMSET:
      generate_memset_builtin (loop, partition);
      /* If this is the last partition for which we generate code, we have
         to destroy the loop.  */
      if (!copy_p)
        destroy_loop (loop);
      break;

    case PKIND_MEMCPY:
      generate_memcpy_builtin (loop, partition);
      /* If this is the last partition for which we generate code, we have
         to destroy the loop.  */
      if (!copy_p)
        destroy_loop (loop);
      break;

    case PKIND_NORMAL:
      generate_loops_for_partition (loop, partition, copy_p);
      break;

    default:
      gcc_unreachable ();
    }
}
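
/* Editorial note: the caller, ldist_gen, passes COPY_P = true for
   every partition except the last one, so the original LOOP is only
   reused (or, for builtin partitions, destroyed) when emitting the
   final partition.  */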
/* Returns true if the node V of RDG cannot be recomputed.  */

static bool
rdg_cannot_recompute_vertex_p (struct graph *rdg, int v)
{
  if (RDG_MEM_WRITE_STMT (rdg, v))
    return true;

  return false;
}
/* Returns true when the vertex V has already been generated in the
   current partition (V is in PROCESSED), or when V belongs to another
   partition and cannot be recomputed (V is not in REMAINING_STMTS).  */

static inline bool
already_processed_vertex_p (bitmap processed, int v)
{
  return (bitmap_bit_p (processed, v)
          || !bitmap_bit_p (remaining_stmts, v));
}
/* Returns NULL when there is no anti-dependence among the successors
   of vertex V, otherwise returns the edge with the anti-dep.  */

static struct graph_edge *
has_anti_dependence (struct vertex *v)
{
  struct graph_edge *e;

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      if (RDGE_TYPE (e) == anti_dd)
        return e;

  return NULL;
}
/* Returns true when one of the predecessors of V in RDG has upstream
   memory writes, ignoring pure flow channels (a write to memory
   followed by a read from memory).  */

static bool
predecessor_has_mem_write (struct graph *rdg, struct vertex *v)
{
  struct graph_edge *e;

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      if (bitmap_bit_p (upstream_mem_writes, e->src)
          /* Don't consider flow channels: a write to memory followed
             by a read from memory.  These channels allow the split of
             the RDG in different partitions.  */
          && !RDG_MEM_WRITE_STMT (rdg, e->src))
        return true;

  return false;
}
/* Initializes the upstream_mem_writes bitmap following the
   information from RDG.  */

static void
mark_nodes_having_upstream_mem_writes (struct graph *rdg)
{
  int v, x;
  bitmap seen = BITMAP_ALLOC (NULL);

  for (v = rdg->n_vertices - 1; v >= 0; v--)
    if (!bitmap_bit_p (seen, v))
      {
        unsigned i;
        vec<int> nodes;
        nodes.create (3);

        graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

        FOR_EACH_VEC_ELT (nodes, i, x)
          {
            if (!bitmap_set_bit (seen, x))
              continue;

            if (RDG_MEM_WRITE_STMT (rdg, x)
                || predecessor_has_mem_write (rdg, &(rdg->vertices[x]))
                /* In anti dependences the read should occur before
                   the write, this is why both the read and the write
                   should be placed in the same partition.  */
                || has_anti_dependence (&(rdg->vertices[x])))
              {
                bitmap_set_bit (upstream_mem_writes, x);
              }
          }

        nodes.release ();
      }
}
/* Returns true when vertex U has a memory write node as a predecessor
   in RDG.  */

static bool
has_upstream_mem_writes (int u)
{
  return bitmap_bit_p (upstream_mem_writes, u);
}

static void rdg_flag_vertex_and_dependent (struct graph *, int, partition_t,
                                           bitmap, bitmap);
/* Flag the uses of U, stopping the traversal according to the
   information in upstream_mem_writes.  */

static void
rdg_flag_uses (struct graph *rdg, int u, partition_t partition, bitmap loops,
               bitmap processed)
{
  use_operand_p use_p;
  struct vertex *x = &(rdg->vertices[u]);
  gimple stmt = RDGV_STMT (x);
  struct graph_edge *anti_dep = has_anti_dependence (x);

  /* Keep in the same partition the destination of an antidependence,
     because this is a store to the exact same location.  Putting this
     in another partition is bad for cache locality.  */
  if (anti_dep)
    {
      int v = anti_dep->dest;

      if (!already_processed_vertex_p (processed, v))
        rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
                                       processed);
    }

  if (gimple_code (stmt) != GIMPLE_PHI)
    {
      if ((use_p = gimple_vuse_op (stmt)) != NULL_USE_OPERAND_P)
        {
          tree use = USE_FROM_PTR (use_p);

          if (TREE_CODE (use) == SSA_NAME)
            {
              gimple def_stmt = SSA_NAME_DEF_STMT (use);
              int v = rdg_vertex_for_stmt (rdg, def_stmt);

              if (v >= 0
                  && !already_processed_vertex_p (processed, v))
                rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
                                               processed);
            }
        }
    }

  if (is_gimple_assign (stmt) && has_upstream_mem_writes (u))
    {
      tree op0 = gimple_assign_lhs (stmt);

      /* Scalar channels don't have enough space for transmitting data
         between tasks, unless we add more storage by privatizing.  */
      if (is_gimple_reg (op0))
        {
          use_operand_p use_p;
          imm_use_iterator iter;

          FOR_EACH_IMM_USE_FAST (use_p, iter, op0)
            {
              int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p));

              if (!already_processed_vertex_p (processed, v))
                rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
                                               processed);
            }
        }
    }
}
/* Flag V from RDG as part of PARTITION, and also flag its loop number
   in LOOPS.  */

static void
rdg_flag_vertex (struct graph *rdg, int v, partition_t partition, bitmap loops)
{
  struct loop *loop;

  if (!bitmap_set_bit (partition->stmts, v))
    return;

  loop = loop_containing_stmt (RDG_STMT (rdg, v));
  bitmap_set_bit (loops, loop->num);

  if (rdg_cannot_recompute_vertex_p (rdg, v))
    {
      partition->has_writes = true;
      bitmap_clear_bit (remaining_stmts, v);
    }
}
/* Flag in the bitmap PARTITION the vertex V and all its predecessors.
   Also flag their loop number in LOOPS.  */

static void
rdg_flag_vertex_and_dependent (struct graph *rdg, int v, partition_t partition,
                               bitmap loops, bitmap processed)
{
  unsigned i;
  vec<int> nodes;
  nodes.create (3);
  int x;

  bitmap_set_bit (processed, v);
  rdg_flag_uses (rdg, v, partition, loops, processed);
  graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts);
  rdg_flag_vertex (rdg, v, partition, loops);

  FOR_EACH_VEC_ELT (nodes, i, x)
    if (!already_processed_vertex_p (processed, x))
      rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed);

  nodes.release ();
}
/* Initialize CONDS with all the condition statements from the basic
   blocks of LOOP.  */

static void
collect_condition_stmts (struct loop *loop, vec<gimple> *conds)
{
  unsigned i;
  edge e;
  vec<edge> exits = get_loop_exit_edges (loop);

  FOR_EACH_VEC_ELT (exits, i, e)
    {
      gimple cond = last_stmt (e->src);

      if (cond)
        conds->safe_push (cond);
    }

  exits.release ();
}
/* Add to PARTITION all the exit condition statements for LOOPS
   together with all their dependent statements determined from
   RDG.  */

static void
rdg_flag_loop_exits (struct graph *rdg, bitmap loops, partition_t partition,
                     bitmap processed)
{
  unsigned i;
  bitmap_iterator bi;
  vec<gimple> conds;
  conds.create (3);

  EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi)
    collect_condition_stmts (get_loop (i), &conds);

  while (!conds.is_empty ())
    {
      gimple cond = conds.pop ();
      int v = rdg_vertex_for_stmt (rdg, cond);
      bitmap new_loops = BITMAP_ALLOC (NULL);

      if (!already_processed_vertex_p (processed, v))
        rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed);

      EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi)
        if (bitmap_set_bit (loops, i))
          collect_condition_stmts (get_loop (i), &conds);

      BITMAP_FREE (new_loops);
    }

  conds.release ();
}
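
/* Editorial note: the worklist above iterates to a fixpoint because
   flagging the dependences of an exit condition can pull statements
   of further loops into the partition (recorded in NEW_LOOPS), and
   the exit conditions of those loops must then be flagged as well.  */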
/* Returns a partition in which all the statements needed for computing
   the strongly connected component C of the RDG are flagged, also
   including the loop exit conditions.  */

static partition_t
build_rdg_partition_for_component (struct graph *rdg, rdgc c)
{
  int i, v;
  partition_t partition = partition_alloc (NULL);
  bitmap loops = BITMAP_ALLOC (NULL);
  bitmap processed = BITMAP_ALLOC (NULL);

  FOR_EACH_VEC_ELT (c->vertices, i, v)
    if (!already_processed_vertex_p (processed, v))
      rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed);

  rdg_flag_loop_exits (rdg, loops, partition, processed);

  BITMAP_FREE (processed);
  BITMAP_FREE (loops);
  return partition;
}
/* Free memory for COMPONENTS.  */

static void
free_rdg_components (vec<rdgc> components)
{
  int i;
  rdgc x;

  FOR_EACH_VEC_ELT (components, i, x)
    {
      x->vertices.release ();
      free (x);
    }

  components.release ();
}
/* Build the COMPONENTS vector with the strongly connected components
   of RDG in which the STARTING_VERTICES occur.  */

static void
rdg_build_components (struct graph *rdg, vec<int> starting_vertices,
                      vec<rdgc> *components)
{
  int i, v;
  bitmap saved_components = BITMAP_ALLOC (NULL);
  int n_components = graphds_scc (rdg, NULL);
  /* ??? Macros cannot process template types with more than one
     argument, so we need this typedef.  */
  typedef vec<int> vec_int_heap;
  vec<int> *all_components = XNEWVEC (vec_int_heap, n_components);

  for (i = 0; i < n_components; i++)
    all_components[i].create (3);

  for (i = 0; i < rdg->n_vertices; i++)
    all_components[rdg->vertices[i].component].safe_push (i);

  FOR_EACH_VEC_ELT (starting_vertices, i, v)
    {
      int c = rdg->vertices[v].component;

      if (bitmap_set_bit (saved_components, c))
        {
          rdgc x = XCNEW (struct rdg_component);
          x->num = c;
          x->vertices = all_components[c];

          components->safe_push (x);
        }
    }

  for (i = 0; i < n_components; i++)
    if (!bitmap_bit_p (saved_components, i))
      all_components[i].release ();

  free (all_components);
  BITMAP_FREE (saved_components);
}
/* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
   The patterns detected below are memset (of zero, of all-ones, and of
   a byte-sized value) and memcpy/memmove.  */

static void
classify_partition (loop_p loop, struct graph *rdg, partition_t partition)
{
  bitmap_iterator bi;
  unsigned i;
  tree nb_iter;
  data_reference_p single_load, single_store;

  partition->kind = PKIND_NORMAL;
  partition->main_dr = NULL;
  partition->secondary_dr = NULL;

  if (!flag_tree_loop_distribute_patterns)
    return;

  /* Perform general partition disqualification for builtins.  */
  nb_iter = number_of_exit_cond_executions (loop);
  if (!nb_iter || nb_iter == chrec_dont_know)
    return;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple stmt = RDG_STMT (rdg, i);

      if (gimple_has_volatile_ops (stmt))
        return;

      /* If the stmt has uses outside of the loop fail.
         ??? If the stmt is generated in another partition that
         is not created as builtin we can ignore this.  */
      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file, "not generating builtin, partition has "
                     "scalar uses outside of the loop\n");
          return;
        }
    }

  /* Detect memset and memcpy.  */
  single_load = NULL;
  single_store = NULL;
  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple stmt = RDG_STMT (rdg, i);
      data_reference_p dr;
      unsigned j;

      if (gimple_code (stmt) == GIMPLE_PHI)
        continue;

      /* Any scalar stmts are ok.  */
      if (!gimple_vuse (stmt))
        continue;

      /* Otherwise just regular loads/stores.  */
      if (!gimple_assign_single_p (stmt))
        return;

      /* But exactly one store and/or load.  */
      for (j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
        {
          if (DR_IS_READ (dr))
            {
              if (single_load != NULL)
                return;
              single_load = dr;
            }
          else
            {
              if (single_store != NULL)
                return;
              single_store = dr;
            }
        }
    }

  if (single_store && !single_load)
    {
      gimple stmt = DR_STMT (single_store);
      tree rhs = gimple_assign_rhs1 (stmt);
      if (!(integer_zerop (rhs)
            || integer_all_onesp (rhs)
            || real_zerop (rhs)
            || (TREE_CODE (rhs) == CONSTRUCTOR
                && !TREE_CLOBBER_P (rhs))
            || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
                && (TYPE_MODE (TREE_TYPE (gimple_assign_lhs (stmt)))
                    == TYPE_MODE (unsigned_char_type_node)))))
        return;
      if (TREE_CODE (rhs) == SSA_NAME
          && !SSA_NAME_IS_DEFAULT_DEF (rhs)
          && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
        return;
      if (!adjacent_dr_p (single_store))
        return;
      partition->kind = PKIND_MEMSET;
      partition->main_dr = single_store;
    }
  else if (single_store && single_load)
    {
      gimple store = DR_STMT (single_store);
      gimple load = DR_STMT (single_load);
      /* Direct aggregate copy or via an SSA name temporary.  */
      if (load != store
          && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
        return;
      if (!adjacent_dr_p (single_store)
          || !adjacent_dr_p (single_load)
          || !operand_equal_p (DR_STEP (single_store),
                               DR_STEP (single_load), 0))
        return;
      /* Now check that if there is a dependence this dependence is
         of a suitable form for memmove.  */
      vec<loop_p> loops = vNULL;
      ddr_p ddr;
      loops.safe_push (loop);
      ddr = initialize_data_dependence_relation (single_load, single_store,
                                                 loops);
      compute_affine_dependence (ddr, loop);
      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
        {
          free_dependence_relation (ddr);
          loops.release ();
          return;
        }
      if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
        {
          if (DDR_NUM_DIST_VECTS (ddr) == 0)
            {
              free_dependence_relation (ddr);
              loops.release ();
              return;
            }
          lambda_vector dist_v;
          FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
            {
              int dist = dist_v[index_in_loop_nest (loop->num,
                                                    DDR_LOOP_NEST (ddr))];
              if (dist > 0 && !DDR_REVERSED_P (ddr))
                {
                  free_dependence_relation (ddr);
                  loops.release ();
                  return;
                }
            }
        }
      free_dependence_relation (ddr);
      loops.release ();
      partition->kind = PKIND_MEMCPY;
      partition->main_dr = single_store;
      partition->secondary_dr = single_load;
    }
}
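
/* Editorial examples of the classification above, assuming an int
   array A and a loop with a computable iteration count:

   | for (i = 0; i < n; i++)   // PKIND_MEMSET: single adjacent store
   |   a[i] = 0;               // of a memset-able value

   | for (i = 0; i < n; i++)   // PKIND_MEMCPY: single adjacent load
   |   a[i] = b[i];            // feeding a single adjacent store

   | for (i = 0; i < n; i++)   // stays PKIND_NORMAL: the stored value
   |   a[i] = i;               // is defined inside the loop

   When the load and store may overlap, the partition is classified as
   PKIND_MEMCPY only if the dependence has a form memmove can honor;
   generate_memcpy_builtin then chooses between memcpy and memmove.  */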
/* For a data reference DR, return the declaration of its base
   address or NULL_TREE if the base is not determined.  */

static tree
ref_base_address (data_reference_p dr)
{
  tree base_address = DR_BASE_ADDRESS (dr);
  if (base_address
      && TREE_CODE (base_address) == ADDR_EXPR)
    return TREE_OPERAND (base_address, 0);

  return base_address;
}
/* Returns true when PARTITION1 and PARTITION2 have similar memory
   accesses in RDG.  */

static bool
similar_memory_accesses (struct graph *rdg, partition_t partition1,
                         partition_t partition2)
{
  unsigned i, j, k, l;
  bitmap_iterator bi, bj;
  data_reference_p ref1, ref2;

  /* First check whether in the intersection of the two partitions are
     any loads or stores.  Common loads are the situation that happens
     most often.  */
  EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
    if (RDG_MEM_WRITE_STMT (rdg, i)
        || RDG_MEM_READS_STMT (rdg, i))
      return true;

  /* Then check all data-references against each other.  */
  EXECUTE_IF_SET_IN_BITMAP (partition1->stmts, 0, i, bi)
    if (RDG_MEM_WRITE_STMT (rdg, i)
        || RDG_MEM_READS_STMT (rdg, i))
      EXECUTE_IF_SET_IN_BITMAP (partition2->stmts, 0, j, bj)
        if (RDG_MEM_WRITE_STMT (rdg, j)
            || RDG_MEM_READS_STMT (rdg, j))
          {
            FOR_EACH_VEC_ELT (RDG_DATAREFS (rdg, i), k, ref1)
              {
                tree base1 = ref_base_address (ref1);
                if (base1)
                  FOR_EACH_VEC_ELT (RDG_DATAREFS (rdg, j), l, ref2)
                    if (base1 == ref_base_address (ref2))
                      return true;
              }
          }

  return false;
}
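
/* Editorial example: a partition containing "a[i] = ..." and another
   containing "... = a[i] + b[i]" share the base address of A, so the
   two are reported as similar; ldist_gen then fuses them instead of
   splitting accesses to A across distinct loops.  */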
/* Aggregate several components into a useful partition that is
   registered in the PARTITIONS vector.  Partitions will be
   distributed in different loops.  */

static void
rdg_build_partitions (struct graph *rdg, vec<rdgc> components,
                      vec<int> *other_stores,
                      vec<partition_t> *partitions, bitmap processed)
{
  int i;
  rdgc x;
  partition_t partition = partition_alloc (NULL);

  FOR_EACH_VEC_ELT (components, i, x)
    {
      partition_t np;
      int v = x->vertices[0];

      if (bitmap_bit_p (processed, v))
        continue;

      np = build_rdg_partition_for_component (rdg, x);
      bitmap_ior_into (partition->stmts, np->stmts);
      partition->has_writes = partition_has_writes (np);
      bitmap_ior_into (processed, np->stmts);
      partition_free (np);

      if (partition_has_writes (partition))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            {
              fprintf (dump_file, "ldist useful partition:\n");
              dump_bitmap (dump_file, partition->stmts);
            }

          partitions->safe_push (partition);
          partition = partition_alloc (NULL);
        }
    }

  /* Add the nodes from the RDG that were not marked as processed, and
     that are used outside the current loop.  These are scalar
     computations that are not yet part of previous partitions.  */
  for (i = 0; i < rdg->n_vertices; i++)
    if (!bitmap_bit_p (processed, i)
        && rdg_defs_used_in_other_loops_p (rdg, i))
      other_stores->safe_push (i);

  /* If there are still statements left in the OTHER_STORES array,
     create other components and partitions with these stores and
     their dependences.  */
  if (other_stores->length () > 0)
    {
      vec<rdgc> comps;
      comps.create (3);
      vec<int> foo;
      foo.create (3);

      rdg_build_components (rdg, *other_stores, &comps);
      rdg_build_partitions (rdg, comps, &foo, partitions, processed);

      foo.release ();
      free_rdg_components (comps);
    }

  /* If there is something left in the last partition, save it.  */
  if (bitmap_count_bits (partition->stmts) > 0)
    partitions->safe_push (partition);
  else
    partition_free (partition);
}
/* Dump to FILE the PARTITIONS.  */

static void
dump_rdg_partitions (FILE *file, vec<partition_t> partitions)
{
  int i;
  partition_t partition;

  FOR_EACH_VEC_ELT (partitions, i, partition)
    debug_bitmap_file (file, partition->stmts);
}
/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (vec<partition_t>);

DEBUG_FUNCTION void
debug_rdg_partitions (vec<partition_t> partitions)
{
  dump_rdg_partitions (stderr, partitions);
}
/* Returns the number of read and write operations in the RDG.  */

static int
number_of_rw_in_rdg (struct graph *rdg)
{
  int i, res = 0;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
        ++res;

      if (RDG_MEM_READS_STMT (rdg, i))
        ++res;
    }

  return res;
}
/* Returns the number of read and write operations in a PARTITION of
   the RDG.  */

static int
number_of_rw_in_partition (struct graph *rdg, partition_t partition)
{
  int res = 0;
  unsigned i;
  bitmap_iterator ii;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
        ++res;

      if (RDG_MEM_READS_STMT (rdg, i))
        ++res;
    }

  return res;
}
/* Returns true when one of the PARTITIONS contains all the read or
   write operations of RDG.  */

static bool
partition_contains_all_rw (struct graph *rdg,
                           vec<partition_t> partitions)
{
  int i;
  partition_t partition;
  int nrw = number_of_rw_in_rdg (rdg);

  FOR_EACH_VEC_ELT (partitions, i, partition)
    if (nrw == number_of_rw_in_partition (rdg, partition))
      return true;

  return false;
}
/* Generate code from STARTING_VERTICES in RDG.  Returns the number of
   distributed loops.  */

static int
ldist_gen (struct loop *loop, struct graph *rdg,
           vec<int> starting_vertices)
{
  int i, nbp;
  vec<rdgc> components;
  components.create (3);
  vec<partition_t> partitions;
  partitions.create (3);
  vec<int> other_stores;
  other_stores.create (3);
  partition_t partition;
  bitmap processed = BITMAP_ALLOC (NULL);
  bool any_builtin;

  remaining_stmts = BITMAP_ALLOC (NULL);
  upstream_mem_writes = BITMAP_ALLOC (NULL);

  for (i = 0; i < rdg->n_vertices; i++)
    {
      bitmap_set_bit (remaining_stmts, i);

      /* Save in OTHER_STORES all the memory writes that are not in
         STARTING_VERTICES.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
        {
          int v;
          unsigned j;
          bool found = false;

          FOR_EACH_VEC_ELT (starting_vertices, j, v)
            if (i == v)
              {
                found = true;
                break;
              }

          if (!found)
            other_stores.safe_push (i);
        }
    }

  mark_nodes_having_upstream_mem_writes (rdg);
  rdg_build_components (rdg, starting_vertices, &components);
  rdg_build_partitions (rdg, components, &other_stores, &partitions,
                        processed);
  BITMAP_FREE (processed);

  any_builtin = false;
  FOR_EACH_VEC_ELT (partitions, i, partition)
    {
      classify_partition (loop, rdg, partition);
      any_builtin |= partition_builtin_p (partition);
    }

  /* If we are only distributing patterns fuse all partitions that
     were not properly classified as builtins.  Else fuse partitions
     with similar memory accesses.  */
  if (!flag_tree_loop_distribution)
    {
      partition_t into;
      /* If we did not detect any builtin simply bail out.  */
      if (!any_builtin)
        {
          nbp = 0;
          goto ldist_done;
        }
      /* Only fuse adjacent non-builtin partitions, see PR53616.
         ??? Use dependence information to improve partition ordering.  */
      i = 0;
      do
        {
          for (; partitions.iterate (i, &into); ++i)
            if (!partition_builtin_p (into))
              break;
          for (++i; partitions.iterate (i, &partition); ++i)
            if (!partition_builtin_p (partition))
              {
                bitmap_ior_into (into->stmts, partition->stmts);
                partitions.ordered_remove (i);
                i--;
              }
            else
              break;
        }
      while ((unsigned) i < partitions.length ());
    }
  else
    {
      partition_t into;
      int j;
      for (i = 0; partitions.iterate (i, &into); ++i)
        {
          if (partition_builtin_p (into))
            continue;
          for (j = i + 1;
               partitions.iterate (j, &partition); ++j)
            {
              if (!partition_builtin_p (partition)
                  /* ??? The following is horribly inefficient,
                     we are re-computing and analyzing data-references
                     of the stmts in the partitions all the time.  */
                  && similar_memory_accesses (rdg, into, partition))
                {
                  if (dump_file && (dump_flags & TDF_DETAILS))
                    {
                      fprintf (dump_file, "fusing partitions\n");
                      dump_bitmap (dump_file, into->stmts);
                      dump_bitmap (dump_file, partition->stmts);
                      fprintf (dump_file, "because they have similar "
                               "memory accesses\n");
                    }
                  bitmap_ior_into (into->stmts, partition->stmts);
                  partitions.ordered_remove (j);
                  j--;
                }
            }
        }
    }

  nbp = partitions.length ();
  if (nbp == 0
      || (nbp == 1 && !partition_builtin_p (partitions[0]))
      || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
    {
      nbp = 0;
      goto ldist_done;
    }

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg_partitions (dump_file, partitions);

  FOR_EACH_VEC_ELT (partitions, i, partition)
    generate_code_for_partition (loop, partition, i < nbp - 1);

 ldist_done:

  BITMAP_FREE (remaining_stmts);
  BITMAP_FREE (upstream_mem_writes);

  FOR_EACH_VEC_ELT (partitions, i, partition)
    partition_free (partition);

  other_stores.release ();
  partitions.release ();
  free_rdg_components (components);
  return nbp;
}
/* Distributes the code from LOOP in such a way that producer
   statements are placed before consumer statements.  When STMTS is
   NULL, performs the maximal distribution, if STMTS is not NULL,
   tries to separate only these statements from the LOOP's body.
   Returns the number of distributed loops.  */

static int
distribute_loop (struct loop *loop, vec<gimple> stmts)
{
  int res = 0;
  struct graph *rdg;
  gimple s;
  unsigned i;
  vec<int> vertices;
  vec<ddr_p> dependence_relations;
  vec<data_reference_p> datarefs;
  vec<loop_p> loop_nest;

  datarefs.create (10);
  dependence_relations.create (100);
  loop_nest.create (3);
  rdg = build_rdg (loop, &loop_nest, &dependence_relations, &datarefs);

  if (!rdg)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "FIXME: Loop %d not distributed: failed to build the RDG.\n",
                 loop->num);

      free_dependence_relations (dependence_relations);
      free_data_refs (datarefs);
      loop_nest.release ();
      return res;
    }

  vertices.create (3);

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg (dump_file, rdg);

  FOR_EACH_VEC_ELT (stmts, i, s)
    {
      int v = rdg_vertex_for_stmt (rdg, s);

      if (v >= 0)
        {
          vertices.safe_push (v);

          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file,
                     "ldist asked to generate code for vertex %d\n", v);
        }
    }

  res = ldist_gen (loop, rdg, vertices);
  vertices.release ();
  free_rdg (rdg);
  free_dependence_relations (dependence_relations);
  free_data_refs (datarefs);
  loop_nest.release ();
  return res;
}
/* Distribute all loops in the current function.  */

static unsigned int
tree_loop_distribution (void)
{
  struct loop *loop;
  loop_iterator li;
  bool changed = false;
  basic_block bb;

  FOR_ALL_BB (bb)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
        gimple_set_uid (gsi_stmt (gsi), -1);
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
        gimple_set_uid (gsi_stmt (gsi), -1);
    }

  /* We can at the moment only distribute non-nested loops, thus restrict
     walking to innermost loops.  */
  FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
    {
      vec<gimple> work_list = vNULL;
      basic_block *bbs;
      int num = loop->num;
      int nb_generated_loops = 0;
      unsigned int i;

      /* If the loop doesn't have a single exit we will fail anyway,
         so do that early.  */
      if (!single_exit (loop))
        continue;

      /* Only optimize hot loops.  */
      if (!optimize_loop_for_speed_p (loop))
        continue;

      /* Only distribute loops with a header and latch for now.  */
      if (loop->num_nodes > 2)
        continue;

      /* Initialize the worklist with stmts we seed the partitions with.  */
      bbs = get_loop_body_in_dom_order (loop);
      for (i = 0; i < loop->num_nodes; ++i)
        {
          gimple_stmt_iterator gsi;
          for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
            {
              gimple stmt = gsi_stmt (gsi);
              /* Only distribute stores for now.
                 ??? We should also try to distribute scalar reductions,
                 thus SSA defs that have scalar uses outside of the loop.  */
              if (!gimple_assign_single_p (stmt)
                  || is_gimple_reg (gimple_assign_lhs (stmt)))
                continue;

              work_list.safe_push (stmt);
            }
        }
      free (bbs);

      if (work_list.length () > 0)
        nb_generated_loops = distribute_loop (loop, work_list);

      if (nb_generated_loops > 0)
        changed = true;

      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          if (nb_generated_loops > 1)
            fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
                     num, nb_generated_loops);
          else
            fprintf (dump_file, "Loop %d is the same.\n", num);
        }

      work_list.release ();
    }

  if (changed)
    {
      mark_virtual_operands_for_renaming (cfun);
      rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
    }

#ifdef ENABLE_CHECKING
  verify_loop_structure ();
#endif

  return 0;
}
static bool
gate_tree_loop_distribution (void)
{
  return flag_tree_loop_distribution
    || flag_tree_loop_distribute_patterns;
}
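
/* Editorial note: the pass thus runs under either
   -ftree-loop-distribution (full distribution) or
   -ftree-loop-distribute-patterns (pattern replacement only; in the
   latter case ldist_gen fuses all partitions that were not
   classified as builtins).  */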
struct gimple_opt_pass pass_loop_distribution =
{
 {
  GIMPLE_PASS,
  "ldist",                      /* name */
  OPTGROUP_LOOP,                /* optinfo_flags */
  gate_tree_loop_distribution,  /* gate */
  tree_loop_distribution,       /* execute */
  NULL,                         /* sub */
  NULL,                         /* next */
  0,                            /* static_pass_number */
  TV_TREE_LOOP_DISTRIBUTION,    /* tv_id */
  PROP_cfg | PROP_ssa,          /* properties_required */
  0,                            /* properties_provided */
  0,                            /* properties_destroyed */
  0,                            /* todo_flags_start */
  TODO_ggc_collect
  | TODO_verify_ssa             /* todo_flags_finish */
 }
};