2009-01-19 Iain Sandoe <iain.sandoe@sandoe-acoustics.co.uk>
[official-gcc.git] / gcc / tree-loop-distribution.c
blob745957fcea29ca1d6d2bd971712b1d143e318871
1 /* Loop distribution.
2 Copyright (C) 2006, 2007, 2008 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 /* This pass performs loop distribution: for example, the loop
24 |DO I = 2, N
25 | A(I) = B(I) + C
26 | D(I) = A(I-1)*E
27 |ENDDO
29 is transformed to
31 |DOALL I = 2, N
32 | A(I) = B(I) + C
33 |ENDDO
35 |DOALL I = 2, N
36 | D(I) = A(I-1)*E
37 |ENDDO
39 This pass uses an RDG, Reduced Dependence Graph built on top of the
40 data dependence relations. The RDG is then topologically sorted to
41 obtain a map of information producers/consumers based on which it
42 generates the new loops. */
44 #include "config.h"
45 #include "system.h"
46 #include "coretypes.h"
47 #include "tm.h"
48 #include "ggc.h"
49 #include "tree.h"
50 #include "target.h"
52 #include "rtl.h"
53 #include "basic-block.h"
54 #include "diagnostic.h"
55 #include "tree-flow.h"
56 #include "tree-dump.h"
57 #include "timevar.h"
58 #include "cfgloop.h"
59 #include "expr.h"
60 #include "optabs.h"
61 #include "tree-chrec.h"
62 #include "tree-data-ref.h"
63 #include "tree-scalar-evolution.h"
64 #include "tree-pass.h"
65 #include "lambda.h"
66 #include "langhooks.h"
67 #include "tree-vectorizer.h"
69 /* If bit I is not set, it means that this node represents an
70 operation that has already been performed, and that should not be
71 performed again. This is the subgraph of remaining important
72 computations that is passed to the DFS algorithm for avoiding to
73 include several times the same stores in different loops. */
74 static bitmap remaining_stmts;
76 /* A node of the RDG is marked in this bitmap when it has as a
77 predecessor a node that writes to memory. */
78 static bitmap upstream_mem_writes;
80 /* Update the PHI nodes of NEW_LOOP. NEW_LOOP is a duplicate of
81 ORIG_LOOP. */
83 static void
84 update_phis_for_loop_copy (struct loop *orig_loop, struct loop *new_loop)
86 tree new_ssa_name;
87 gimple_stmt_iterator si_new, si_orig;
88 edge orig_loop_latch = loop_latch_edge (orig_loop);
89 edge orig_entry_e = loop_preheader_edge (orig_loop);
90 edge new_loop_entry_e = loop_preheader_edge (new_loop);
92 /* Scan the phis in the headers of the old and new loops
93 (they are organized in exactly the same order). */
94 for (si_new = gsi_start_phis (new_loop->header),
95 si_orig = gsi_start_phis (orig_loop->header);
96 !gsi_end_p (si_new) && !gsi_end_p (si_orig);
97 gsi_next (&si_new), gsi_next (&si_orig))
99 tree def;
100 gimple phi_new = gsi_stmt (si_new);
101 gimple phi_orig = gsi_stmt (si_orig);
103 /* Add the first phi argument for the phi in NEW_LOOP (the one
104 associated with the entry of NEW_LOOP) */
105 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_entry_e);
106 add_phi_arg (phi_new, def, new_loop_entry_e);
108 /* Add the second phi argument for the phi in NEW_LOOP (the one
109 associated with the latch of NEW_LOOP) */
110 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
112 if (TREE_CODE (def) == SSA_NAME)
114 new_ssa_name = get_current_def (def);
116 if (!new_ssa_name)
117 /* This only happens if there are no definitions inside the
118 loop. Use the phi_result in this case. */
119 new_ssa_name = PHI_RESULT (phi_new);
121 else
122 /* Could be an integer. */
123 new_ssa_name = def;
125 add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
129 /* Return a copy of LOOP placed before LOOP. */
131 static struct loop *
132 copy_loop_before (struct loop *loop)
134 struct loop *res;
135 edge preheader = loop_preheader_edge (loop);
137 if (!single_exit (loop))
138 return NULL;
140 initialize_original_copy_tables ();
141 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
142 free_original_copy_tables ();
144 if (!res)
145 return NULL;
147 update_phis_for_loop_copy (loop, res);
148 rename_variables_in_loop (res);
150 return res;
153 /* Creates an empty basic block after LOOP. */
155 static void
156 create_bb_after_loop (struct loop *loop)
158 edge exit = single_exit (loop);
160 if (!exit)
161 return;
163 split_edge (exit);
166 /* Generate code for PARTITION from the code in LOOP. The loop is
167 copied when COPY_P is true. All the statements not flagged in the
168 PARTITION bitmap are removed from the loop or from its copy. The
169 statements are indexed in sequence inside a basic block, and the
170 basic blocks of a loop are taken in dom order. Returns true when
171 the code gen succeeded. */
173 static bool
174 generate_loops_for_partition (struct loop *loop, bitmap partition, bool copy_p)
176 unsigned i, x;
177 gimple_stmt_iterator bsi;
178 basic_block *bbs;
180 if (copy_p)
182 loop = copy_loop_before (loop);
183 create_preheader (loop, CP_SIMPLE_PREHEADERS);
184 create_bb_after_loop (loop);
187 if (loop == NULL)
188 return false;
190 /* Remove stmts not in the PARTITION bitmap. The order in which we
191 visit the phi nodes and the statements is exactly as in
192 stmts_from_loop. */
193 bbs = get_loop_body_in_dom_order (loop);
195 for (x = 0, i = 0; i < loop->num_nodes; i++)
197 basic_block bb = bbs[i];
199 for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
200 if (!bitmap_bit_p (partition, x++))
201 remove_phi_node (&bsi, true);
202 else
203 gsi_next (&bsi);
205 for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
206 if (gimple_code (gsi_stmt (bsi)) != GIMPLE_LABEL
207 && !bitmap_bit_p (partition, x++))
208 gsi_remove (&bsi, false);
209 else
210 gsi_next (&bsi);
212 mark_virtual_ops_in_bb (bb);
215 free (bbs);
216 return true;
219 /* Build size argument. */
221 static inline tree
222 build_size_arg (tree nb_iter, tree op, gimple_seq* stmt_list)
224 tree nb_bytes;
225 gimple_seq stmts = NULL;
227 nb_bytes = fold_build2 (MULT_EXPR, TREE_TYPE (nb_iter),
228 nb_iter, TYPE_SIZE_UNIT (TREE_TYPE (op)));
229 nb_bytes = force_gimple_operand (nb_bytes, &stmts, true, NULL);
230 gimple_seq_add_seq (stmt_list, stmts);
232 return nb_bytes;
235 /* Generate a call to memset. Return true when the operation succeeded. */
237 static bool
238 generate_memset_zero (gimple stmt, tree op0, tree nb_iter,
239 gimple_stmt_iterator bsi)
241 tree t, addr_base;
242 tree nb_bytes = NULL;
243 bool res = false;
244 gimple_seq stmts = NULL, stmt_list = NULL;
245 gimple fn_call;
246 tree mem, fndecl, fntype, fn;
247 gimple_stmt_iterator i;
248 ssa_op_iter iter;
249 struct data_reference *dr = XCNEW (struct data_reference);
251 DR_STMT (dr) = stmt;
252 DR_REF (dr) = op0;
253 if (!dr_analyze_innermost (dr))
254 goto end;
256 /* Test for a positive stride, iterating over every element. */
257 if (integer_zerop (fold_build2 (MINUS_EXPR, integer_type_node, DR_STEP (dr),
258 TYPE_SIZE_UNIT (TREE_TYPE (op0)))))
259 addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_BASE_ADDRESS (dr)),
260 DR_BASE_ADDRESS (dr),
261 size_binop (PLUS_EXPR,
262 DR_OFFSET (dr), DR_INIT (dr)));
264 /* Test for a negative stride, iterating over every element. */
265 else if (integer_zerop (fold_build2 (PLUS_EXPR, integer_type_node,
266 TYPE_SIZE_UNIT (TREE_TYPE (op0)),
267 DR_STEP (dr))))
269 nb_bytes = build_size_arg (nb_iter, op0, &stmt_list);
270 addr_base = size_binop (PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
271 addr_base = fold_build2 (MINUS_EXPR, sizetype, addr_base, nb_bytes);
272 addr_base = force_gimple_operand (addr_base, &stmts, true, NULL);
273 gimple_seq_add_seq (&stmt_list, stmts);
275 addr_base = fold_build2 (POINTER_PLUS_EXPR,
276 TREE_TYPE (DR_BASE_ADDRESS (dr)),
277 DR_BASE_ADDRESS (dr), addr_base);
279 else
280 goto end;
282 mem = force_gimple_operand (addr_base, &stmts, true, NULL);
283 gimple_seq_add_seq (&stmt_list, stmts);
285 fndecl = implicit_built_in_decls [BUILT_IN_MEMSET];
286 fntype = TREE_TYPE (fndecl);
287 fn = build1 (ADDR_EXPR, build_pointer_type (fntype), fndecl);
289 if (!nb_bytes)
290 nb_bytes = build_size_arg (nb_iter, op0, &stmt_list);
291 fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes);
292 gimple_seq_add_stmt (&stmt_list, fn_call);
294 for (i = gsi_start (stmt_list); !gsi_end_p (i); gsi_next (&i))
296 gimple s = gsi_stmt (i);
297 update_stmt_if_modified (s);
299 FOR_EACH_SSA_TREE_OPERAND (t, s, iter, SSA_OP_VIRTUAL_DEFS)
301 if (TREE_CODE (t) == SSA_NAME)
302 t = SSA_NAME_VAR (t);
303 mark_sym_for_renaming (t);
307 /* Mark also the uses of the VDEFS of STMT to be renamed. */
308 FOR_EACH_SSA_TREE_OPERAND (t, stmt, iter, SSA_OP_VIRTUAL_DEFS)
310 if (TREE_CODE (t) == SSA_NAME)
312 gimple s;
313 imm_use_iterator imm_iter;
315 FOR_EACH_IMM_USE_STMT (s, imm_iter, t)
316 update_stmt (s);
318 t = SSA_NAME_VAR (t);
320 mark_sym_for_renaming (t);
323 gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING);
324 res = true;
326 if (dump_file && (dump_flags & TDF_DETAILS))
327 fprintf (dump_file, "generated memset zero\n");
329 end:
330 free_data_ref (dr);
331 return res;
334 /* Propagate phis in BB b to their uses and remove them. */
336 static void
337 prop_phis (basic_block b)
339 gimple_stmt_iterator psi;
340 gimple_seq phis = phi_nodes (b);
342 for (psi = gsi_start (phis); !gsi_end_p (psi); )
344 gimple phi = gsi_stmt (psi);
345 tree def = gimple_phi_result (phi), use = gimple_phi_arg_def (phi, 0);
347 gcc_assert (gimple_phi_num_args (phi) == 1);
349 if (!is_gimple_reg (def))
351 imm_use_iterator iter;
352 use_operand_p use_p;
353 gimple stmt;
355 FOR_EACH_IMM_USE_STMT (stmt, iter, def)
356 FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
357 SET_USE (use_p, use);
359 else
360 replace_uses_by (def, use);
362 remove_phi_node (&psi, true);
366 /* Tries to generate a builtin function for the instructions of LOOP
367 pointed to by the bits set in PARTITION. Returns true when the
368 operation succeeded. */
370 static bool
371 generate_builtin (struct loop *loop, bitmap partition, bool copy_p)
373 bool res = false;
374 unsigned i, x = 0;
375 basic_block *bbs;
376 gimple write = NULL;
377 tree op0, op1;
378 gimple_stmt_iterator bsi;
379 tree nb_iter = number_of_exit_cond_executions (loop);
381 if (!nb_iter || nb_iter == chrec_dont_know)
382 return false;
384 bbs = get_loop_body_in_dom_order (loop);
386 for (i = 0; i < loop->num_nodes; i++)
388 basic_block bb = bbs[i];
390 for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi); gsi_next (&bsi))
391 x++;
393 for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
395 gimple stmt = gsi_stmt (bsi);
397 if (bitmap_bit_p (partition, x++)
398 && is_gimple_assign (stmt)
399 && !is_gimple_reg (gimple_assign_lhs (stmt)))
401 /* Don't generate the builtins when there are more than
402 one memory write. */
403 if (write != NULL)
404 goto end;
406 write = stmt;
411 if (!write)
412 goto end;
414 op0 = gimple_assign_lhs (write);
415 op1 = gimple_assign_rhs1 (write);
417 if (!(TREE_CODE (op0) == ARRAY_REF
418 || TREE_CODE (op0) == INDIRECT_REF))
419 goto end;
421 /* The new statements will be placed before LOOP. */
422 bsi = gsi_last_bb (loop_preheader_edge (loop)->src);
424 if (gimple_assign_rhs_code (write) == INTEGER_CST
425 && (integer_zerop (op1) || real_zerop (op1)))
426 res = generate_memset_zero (write, op0, nb_iter, bsi);
428 /* If this is the last partition for which we generate code, we have
429 to destroy the loop. */
430 if (res && !copy_p)
432 unsigned nbbs = loop->num_nodes;
433 basic_block src = loop_preheader_edge (loop)->src;
434 basic_block dest = single_exit (loop)->dest;
435 prop_phis (dest);
436 make_edge (src, dest, EDGE_FALLTHRU);
437 set_immediate_dominator (CDI_DOMINATORS, dest, src);
438 cancel_loop_tree (loop);
440 for (i = 0; i < nbbs; i++)
441 delete_basic_block (bbs[i]);
444 end:
445 free (bbs);
446 return res;
449 /* Generates code for PARTITION. For simple loops, this function can
450 generate a built-in. */
452 static bool
453 generate_code_for_partition (struct loop *loop, bitmap partition, bool copy_p)
455 if (generate_builtin (loop, partition, copy_p))
456 return true;
458 return generate_loops_for_partition (loop, partition, copy_p);
462 /* Returns true if the node V of RDG cannot be recomputed. */
464 static bool
465 rdg_cannot_recompute_vertex_p (struct graph *rdg, int v)
467 if (RDG_MEM_WRITE_STMT (rdg, v))
468 return true;
470 return false;
473 /* Returns true when the vertex V has already been generated in the
474 current partition (V is in PROCESSED), or when V belongs to another
475 partition and cannot be recomputed (V is not in REMAINING_STMTS). */
477 static inline bool
478 already_processed_vertex_p (bitmap processed, int v)
480 return (bitmap_bit_p (processed, v)
481 || !bitmap_bit_p (remaining_stmts, v));
484 /* Returns NULL when there is no anti-dependence among the successors
485 of vertex V, otherwise returns the edge with the anti-dep. */
487 static struct graph_edge *
488 has_anti_dependence (struct vertex *v)
490 struct graph_edge *e;
492 if (v->succ)
493 for (e = v->succ; e; e = e->succ_next)
494 if (RDGE_TYPE (e) == anti_dd)
495 return e;
497 return NULL;
500 /* Returns true when V has an anti-dependence edge among its successors. */
502 static bool
503 predecessor_has_mem_write (struct graph *rdg, struct vertex *v)
505 struct graph_edge *e;
507 if (v->pred)
508 for (e = v->pred; e; e = e->pred_next)
509 if (bitmap_bit_p (upstream_mem_writes, e->src)
510 /* Don't consider flow channels: a write to memory followed
511 by a read from memory. These channels allow the split of
512 the RDG in different partitions. */
513 && !RDG_MEM_WRITE_STMT (rdg, e->src))
514 return true;
516 return false;
519 /* Initializes the upstream_mem_writes bitmap following the
520 information from RDG. */
522 static void
523 mark_nodes_having_upstream_mem_writes (struct graph *rdg)
525 int v, x;
526 bitmap seen = BITMAP_ALLOC (NULL);
528 for (v = rdg->n_vertices - 1; v >= 0; v--)
529 if (!bitmap_bit_p (seen, v))
531 unsigned i;
532 VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
533 bool has_upstream_mem_write_p = false;
535 graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
537 for (i = 0; VEC_iterate (int, nodes, i, x); i++)
539 if (bitmap_bit_p (seen, x))
540 continue;
542 bitmap_set_bit (seen, x);
544 if (RDG_MEM_WRITE_STMT (rdg, x)
545 || predecessor_has_mem_write (rdg, &(rdg->vertices[x]))
546 /* In anti dependences the read should occur before
547 the write, this is why both the read and the write
548 should be placed in the same partition. */
549 || has_anti_dependence (&(rdg->vertices[x])))
551 has_upstream_mem_write_p = true;
552 bitmap_set_bit (upstream_mem_writes, x);
556 VEC_free (int, heap, nodes);
560 /* Returns true when vertex u has a memory write node as a predecessor
561 in RDG. */
563 static bool
564 has_upstream_mem_writes (int u)
566 return bitmap_bit_p (upstream_mem_writes, u);
569 static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap,
570 bitmap, bool *);
572 /* Flag all the uses of U. */
574 static void
575 rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
576 bitmap processed, bool *part_has_writes)
578 struct graph_edge *e;
580 for (e = rdg->vertices[u].succ; e; e = e->succ_next)
581 if (!bitmap_bit_p (processed, e->dest))
583 rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops,
584 processed, part_has_writes);
585 rdg_flag_all_uses (rdg, e->dest, partition, loops, processed,
586 part_has_writes);
590 /* Flag the uses of U stopping following the information from
591 upstream_mem_writes. */
593 static void
594 rdg_flag_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
595 bitmap processed, bool *part_has_writes)
597 ssa_op_iter iter;
598 use_operand_p use_p;
599 struct vertex *x = &(rdg->vertices[u]);
600 gimple stmt = RDGV_STMT (x);
601 struct graph_edge *anti_dep = has_anti_dependence (x);
603 /* Keep in the same partition the destination of an antidependence,
604 because this is a store to the exact same location. Putting this
605 in another partition is bad for cache locality. */
606 if (anti_dep)
608 int v = anti_dep->dest;
610 if (!already_processed_vertex_p (processed, v))
611 rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
612 processed, part_has_writes);
615 if (gimple_code (stmt) != GIMPLE_PHI)
617 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VIRTUAL_USES)
619 tree use = USE_FROM_PTR (use_p);
621 if (TREE_CODE (use) == SSA_NAME)
623 gimple def_stmt = SSA_NAME_DEF_STMT (use);
624 int v = rdg_vertex_for_stmt (rdg, def_stmt);
626 if (v >= 0
627 && !already_processed_vertex_p (processed, v))
628 rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
629 processed, part_has_writes);
634 if (is_gimple_assign (stmt) && has_upstream_mem_writes (u))
636 tree op0 = gimple_assign_lhs (stmt);
638 /* Scalar channels don't have enough space for transmitting data
639 between tasks, unless we add more storage by privatizing. */
640 if (is_gimple_reg (op0))
642 use_operand_p use_p;
643 imm_use_iterator iter;
645 FOR_EACH_IMM_USE_FAST (use_p, iter, op0)
647 int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p));
649 if (!already_processed_vertex_p (processed, v))
650 rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
651 processed, part_has_writes);
657 /* Flag V from RDG as part of PARTITION, and also flag its loop number
658 in LOOPS. */
660 static void
661 rdg_flag_vertex (struct graph *rdg, int v, bitmap partition, bitmap loops,
662 bool *part_has_writes)
664 struct loop *loop;
666 if (bitmap_bit_p (partition, v))
667 return;
669 loop = loop_containing_stmt (RDG_STMT (rdg, v));
670 bitmap_set_bit (loops, loop->num);
671 bitmap_set_bit (partition, v);
673 if (rdg_cannot_recompute_vertex_p (rdg, v))
675 *part_has_writes = true;
676 bitmap_clear_bit (remaining_stmts, v);
680 /* Flag in the bitmap PARTITION the vertex V and all its predecessors.
681 Also flag their loop number in LOOPS. */
683 static void
684 rdg_flag_vertex_and_dependent (struct graph *rdg, int v, bitmap partition,
685 bitmap loops, bitmap processed,
686 bool *part_has_writes)
688 unsigned i;
689 VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
690 int x;
692 bitmap_set_bit (processed, v);
693 rdg_flag_uses (rdg, v, partition, loops, processed, part_has_writes);
694 graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts);
695 rdg_flag_vertex (rdg, v, partition, loops, part_has_writes);
697 for (i = 0; VEC_iterate (int, nodes, i, x); i++)
698 if (!already_processed_vertex_p (processed, x))
699 rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed,
700 part_has_writes);
702 VEC_free (int, heap, nodes);
705 /* Initialize CONDS with all the condition statements from the basic
706 blocks of LOOP. */
708 static void
709 collect_condition_stmts (struct loop *loop, VEC (gimple, heap) **conds)
711 unsigned i;
712 edge e;
713 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
715 for (i = 0; VEC_iterate (edge, exits, i, e); i++)
717 gimple cond = last_stmt (e->src);
719 if (cond)
720 VEC_safe_push (gimple, heap, *conds, cond);
723 VEC_free (edge, heap, exits);
726 /* Add to PARTITION all the exit condition statements for LOOPS
727 together with all their dependent statements determined from
728 RDG. */
730 static void
731 rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition,
732 bitmap processed, bool *part_has_writes)
734 unsigned i;
735 bitmap_iterator bi;
736 VEC (gimple, heap) *conds = VEC_alloc (gimple, heap, 3);
738 EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi)
739 collect_condition_stmts (get_loop (i), &conds);
741 while (!VEC_empty (gimple, conds))
743 gimple cond = VEC_pop (gimple, conds);
744 int v = rdg_vertex_for_stmt (rdg, cond);
745 bitmap new_loops = BITMAP_ALLOC (NULL);
747 if (!already_processed_vertex_p (processed, v))
748 rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed,
749 part_has_writes);
751 EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi)
752 if (!bitmap_bit_p (loops, i))
754 bitmap_set_bit (loops, i);
755 collect_condition_stmts (get_loop (i), &conds);
758 BITMAP_FREE (new_loops);
762 /* Flag all the nodes of RDG containing memory accesses that could
763 potentially belong to arrays already accessed in the current
764 PARTITION. */
766 static void
767 rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition,
768 bitmap loops, bitmap processed,
769 VEC (int, heap) **other_stores)
771 bool foo;
772 unsigned i, n;
773 int j, k, kk;
774 bitmap_iterator ii;
775 struct graph_edge *e;
777 EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
778 if (RDG_MEM_WRITE_STMT (rdg, i)
779 || RDG_MEM_READS_STMT (rdg, i))
781 for (j = 0; j < rdg->n_vertices; j++)
782 if (!bitmap_bit_p (processed, j)
783 && (RDG_MEM_WRITE_STMT (rdg, j)
784 || RDG_MEM_READS_STMT (rdg, j))
785 && rdg_has_similar_memory_accesses (rdg, i, j))
787 /* Flag first the node J itself, and all the nodes that
788 are needed to compute J. */
789 rdg_flag_vertex_and_dependent (rdg, j, partition, loops,
790 processed, &foo);
792 /* When J is a read, we want to coalesce in the same
793 PARTITION all the nodes that are using J: this is
794 needed for better cache locality. */
795 rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo);
797 /* Remove from OTHER_STORES the vertex that we flagged. */
798 if (RDG_MEM_WRITE_STMT (rdg, j))
799 for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++)
800 if (kk == j)
802 VEC_unordered_remove (int, *other_stores, k);
803 break;
807 /* If the node I has two uses, then keep these together in the
808 same PARTITION. */
809 for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++);
811 if (n > 1)
812 rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo);
816 /* Returns a bitmap in which all the statements needed for computing
817 the strongly connected component C of the RDG are flagged, also
818 including the loop exit conditions. */
820 static bitmap
821 build_rdg_partition_for_component (struct graph *rdg, rdgc c,
822 bool *part_has_writes,
823 VEC (int, heap) **other_stores)
825 int i, v;
826 bitmap partition = BITMAP_ALLOC (NULL);
827 bitmap loops = BITMAP_ALLOC (NULL);
828 bitmap processed = BITMAP_ALLOC (NULL);
830 for (i = 0; VEC_iterate (int, c->vertices, i, v); i++)
831 if (!already_processed_vertex_p (processed, v))
832 rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed,
833 part_has_writes);
835 /* Also iterate on the array of stores not in the starting vertices,
836 and determine those vertices that have some memory affinity with
837 the current nodes in the component: these are stores to the same
838 arrays, i.e. we're taking care of cache locality. */
839 rdg_flag_similar_memory_accesses (rdg, partition, loops, processed,
840 other_stores);
842 rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes);
844 BITMAP_FREE (processed);
845 BITMAP_FREE (loops);
846 return partition;
849 /* Free memory for COMPONENTS. */
851 static void
852 free_rdg_components (VEC (rdgc, heap) *components)
854 int i;
855 rdgc x;
857 for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
859 VEC_free (int, heap, x->vertices);
860 free (x);
864 /* Build the COMPONENTS vector with the strongly connected components
865 of RDG in which the STARTING_VERTICES occur. */
867 static void
868 rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices,
869 VEC (rdgc, heap) **components)
871 int i, v;
872 bitmap saved_components = BITMAP_ALLOC (NULL);
873 int n_components = graphds_scc (rdg, NULL);
874 VEC (int, heap) **all_components = XNEWVEC (VEC (int, heap) *, n_components);
876 for (i = 0; i < n_components; i++)
877 all_components[i] = VEC_alloc (int, heap, 3);
879 for (i = 0; i < rdg->n_vertices; i++)
880 VEC_safe_push (int, heap, all_components[rdg->vertices[i].component], i);
882 for (i = 0; VEC_iterate (int, starting_vertices, i, v); i++)
884 int c = rdg->vertices[v].component;
886 if (!bitmap_bit_p (saved_components, c))
888 rdgc x = XCNEW (struct rdg_component);
889 x->num = c;
890 x->vertices = all_components[c];
892 VEC_safe_push (rdgc, heap, *components, x);
893 bitmap_set_bit (saved_components, c);
897 for (i = 0; i < n_components; i++)
898 if (!bitmap_bit_p (saved_components, i))
899 VEC_free (int, heap, all_components[i]);
901 free (all_components);
902 BITMAP_FREE (saved_components);
905 /* Aggregate several components into a useful partition that is
906 registered in the PARTITIONS vector. Partitions will be
907 distributed in different loops. */
909 static void
910 rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
911 VEC (int, heap) **other_stores,
912 VEC (bitmap, heap) **partitions, bitmap processed)
914 int i;
915 rdgc x;
916 bitmap partition = BITMAP_ALLOC (NULL);
918 for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
920 bitmap np;
921 bool part_has_writes = false;
922 int v = VEC_index (int, x->vertices, 0);
924 if (bitmap_bit_p (processed, v))
925 continue;
927 np = build_rdg_partition_for_component (rdg, x, &part_has_writes,
928 other_stores);
929 bitmap_ior_into (partition, np);
930 bitmap_ior_into (processed, np);
931 BITMAP_FREE (np);
933 if (part_has_writes)
935 if (dump_file && (dump_flags & TDF_DETAILS))
937 fprintf (dump_file, "ldist useful partition:\n");
938 dump_bitmap (dump_file, partition);
941 VEC_safe_push (bitmap, heap, *partitions, partition);
942 partition = BITMAP_ALLOC (NULL);
946 /* Add the nodes from the RDG that were not marked as processed, and
947 that are used outside the current loop. These are scalar
948 computations that are not yet part of previous partitions. */
949 for (i = 0; i < rdg->n_vertices; i++)
950 if (!bitmap_bit_p (processed, i)
951 && rdg_defs_used_in_other_loops_p (rdg, i))
952 VEC_safe_push (int, heap, *other_stores, i);
954 /* If there are still statements left in the OTHER_STORES array,
955 create other components and partitions with these stores and
956 their dependences. */
957 if (VEC_length (int, *other_stores) > 0)
959 VEC (rdgc, heap) *comps = VEC_alloc (rdgc, heap, 3);
960 VEC (int, heap) *foo = VEC_alloc (int, heap, 3);
962 rdg_build_components (rdg, *other_stores, &comps);
963 rdg_build_partitions (rdg, comps, &foo, partitions, processed);
965 VEC_free (int, heap, foo);
966 free_rdg_components (comps);
969 /* If there is something left in the last partition, save it. */
970 if (bitmap_count_bits (partition) > 0)
971 VEC_safe_push (bitmap, heap, *partitions, partition);
972 else
973 BITMAP_FREE (partition);
976 /* Dump to FILE the PARTITIONS. */
978 static void
979 dump_rdg_partitions (FILE *file, VEC (bitmap, heap) *partitions)
981 int i;
982 bitmap partition;
984 for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
985 debug_bitmap_file (file, partition);
988 /* Debug PARTITIONS. */
989 extern void debug_rdg_partitions (VEC (bitmap, heap) *);
991 void
992 debug_rdg_partitions (VEC (bitmap, heap) *partitions)
994 dump_rdg_partitions (stderr, partitions);
997 /* Returns the number of read and write operations in the RDG. */
999 static int
1000 number_of_rw_in_rdg (struct graph *rdg)
1002 int i, res = 0;
1004 for (i = 0; i < rdg->n_vertices; i++)
1006 if (RDG_MEM_WRITE_STMT (rdg, i))
1007 ++res;
1009 if (RDG_MEM_READS_STMT (rdg, i))
1010 ++res;
1013 return res;
1016 /* Returns the number of read and write operations in a PARTITION of
1017 the RDG. */
1019 static int
1020 number_of_rw_in_partition (struct graph *rdg, bitmap partition)
1022 int res = 0;
1023 unsigned i;
1024 bitmap_iterator ii;
1026 EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
1028 if (RDG_MEM_WRITE_STMT (rdg, i))
1029 ++res;
1031 if (RDG_MEM_READS_STMT (rdg, i))
1032 ++res;
1035 return res;
1038 /* Returns true when one of the PARTITIONS contains all the read or
1039 write operations of RDG. */
1041 static bool
1042 partition_contains_all_rw (struct graph *rdg, VEC (bitmap, heap) *partitions)
1044 int i;
1045 bitmap partition;
1046 int nrw = number_of_rw_in_rdg (rdg);
1048 for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
1049 if (nrw == number_of_rw_in_partition (rdg, partition))
1050 return true;
1052 return false;
1055 /* Generate code from STARTING_VERTICES in RDG. Returns the number of
1056 distributed loops. */
1058 static int
1059 ldist_gen (struct loop *loop, struct graph *rdg,
1060 VEC (int, heap) *starting_vertices)
1062 int i, nbp;
1063 VEC (rdgc, heap) *components = VEC_alloc (rdgc, heap, 3);
1064 VEC (bitmap, heap) *partitions = VEC_alloc (bitmap, heap, 3);
1065 VEC (int, heap) *other_stores = VEC_alloc (int, heap, 3);
1066 bitmap partition, processed = BITMAP_ALLOC (NULL);
1068 remaining_stmts = BITMAP_ALLOC (NULL);
1069 upstream_mem_writes = BITMAP_ALLOC (NULL);
1071 for (i = 0; i < rdg->n_vertices; i++)
1073 bitmap_set_bit (remaining_stmts, i);
1075 /* Save in OTHER_STORES all the memory writes that are not in
1076 STARTING_VERTICES. */
1077 if (RDG_MEM_WRITE_STMT (rdg, i))
1079 int v;
1080 unsigned j;
1081 bool found = false;
1083 for (j = 0; VEC_iterate (int, starting_vertices, j, v); j++)
1084 if (i == v)
1086 found = true;
1087 break;
1090 if (!found)
1091 VEC_safe_push (int, heap, other_stores, i);
1095 mark_nodes_having_upstream_mem_writes (rdg);
1096 rdg_build_components (rdg, starting_vertices, &components);
1097 rdg_build_partitions (rdg, components, &other_stores, &partitions,
1098 processed);
1099 BITMAP_FREE (processed);
1100 nbp = VEC_length (bitmap, partitions);
1102 if (nbp <= 1
1103 || partition_contains_all_rw (rdg, partitions))
1104 goto ldist_done;
1106 if (dump_file && (dump_flags & TDF_DETAILS))
1107 dump_rdg_partitions (dump_file, partitions);
1109 for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
1110 if (!generate_code_for_partition (loop, partition, i < nbp - 1))
1111 goto ldist_done;
1113 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
1114 update_ssa (TODO_update_ssa_only_virtuals | TODO_update_ssa);
1116 ldist_done:
1118 BITMAP_FREE (remaining_stmts);
1119 BITMAP_FREE (upstream_mem_writes);
1121 for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
1122 BITMAP_FREE (partition);
1124 VEC_free (int, heap, other_stores);
1125 VEC_free (bitmap, heap, partitions);
1126 free_rdg_components (components);
1127 return nbp;
1130 /* Distributes the code from LOOP in such a way that producer
1131 statements are placed before consumer statements. When STMTS is
1132 NULL, performs the maximal distribution, if STMTS is not NULL,
1133 tries to separate only these statements from the LOOP's body.
1134 Returns the number of distributed loops. */
1136 static int
1137 distribute_loop (struct loop *loop, VEC (gimple, heap) *stmts)
1139 bool res = false;
1140 struct graph *rdg;
1141 gimple s;
1142 unsigned i;
1143 VEC (int, heap) *vertices;
1145 if (loop->num_nodes > 2)
1147 if (dump_file && (dump_flags & TDF_DETAILS))
1148 fprintf (dump_file,
1149 "FIXME: Loop %d not distributed: it has more than two basic blocks.\n",
1150 loop->num);
1152 return res;
1155 rdg = build_rdg (loop);
1157 if (!rdg)
1159 if (dump_file && (dump_flags & TDF_DETAILS))
1160 fprintf (dump_file,
1161 "FIXME: Loop %d not distributed: failed to build the RDG.\n",
1162 loop->num);
1164 return res;
1167 vertices = VEC_alloc (int, heap, 3);
1169 if (dump_file && (dump_flags & TDF_DETAILS))
1170 dump_rdg (dump_file, rdg);
1172 for (i = 0; VEC_iterate (gimple, stmts, i, s); i++)
1174 int v = rdg_vertex_for_stmt (rdg, s);
1176 if (v >= 0)
1178 VEC_safe_push (int, heap, vertices, v);
1180 if (dump_file && (dump_flags & TDF_DETAILS))
1181 fprintf (dump_file,
1182 "ldist asked to generate code for vertex %d\n", v);
1186 res = ldist_gen (loop, rdg, vertices);
1187 VEC_free (int, heap, vertices);
1188 free_rdg (rdg);
1190 return res;
1193 /* Distribute all loops in the current function. */
1195 static unsigned int
1196 tree_loop_distribution (void)
1198 struct loop *loop;
1199 loop_iterator li;
1200 int nb_generated_loops = 0;
1202 FOR_EACH_LOOP (li, loop, 0)
1204 VEC (gimple, heap) *work_list = VEC_alloc (gimple, heap, 3);
1206 /* With the following working list, we're asking distribute_loop
1207 to separate the stores of the loop: when dependences allow,
1208 it will end on having one store per loop. */
1209 stores_from_loop (loop, &work_list);
1211 /* A simple heuristic for cache locality is to not split stores
1212 to the same array. Without this call, an unrolled loop would
1213 be split into as many loops as unroll factor, each loop
1214 storing in the same array. */
1215 remove_similar_memory_refs (&work_list);
1217 nb_generated_loops = distribute_loop (loop, work_list);
1219 if (dump_file && (dump_flags & TDF_DETAILS))
1221 if (nb_generated_loops > 1)
1222 fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
1223 loop->num, nb_generated_loops);
1224 else
1225 fprintf (dump_file, "Loop %d is the same.\n", loop->num);
1228 verify_loop_structure ();
1230 VEC_free (gimple, heap, work_list);
1233 return 0;
1236 static bool
1237 gate_tree_loop_distribution (void)
1239 return flag_tree_loop_distribution != 0;
1242 struct gimple_opt_pass pass_loop_distribution =
1245 GIMPLE_PASS,
1246 "ldist", /* name */
1247 gate_tree_loop_distribution, /* gate */
1248 tree_loop_distribution, /* execute */
1249 NULL, /* sub */
1250 NULL, /* next */
1251 0, /* static_pass_number */
1252 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
1253 PROP_cfg | PROP_ssa, /* properties_required */
1254 0, /* properties_provided */
1255 0, /* properties_destroyed */
1256 0, /* todo_flags_start */
1257 TODO_dump_func | TODO_verify_loops /* todo_flags_finish */