/* gcc/tree-loop-distribution.c — extracted from official-gcc.git
   (commit "Check in tree-dce enh to trunk",
   blob c380854eab59170dc7fe27283b36d5b21f8c3fa6).  */
1 /* Loop distribution.
2 Copyright (C) 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 /* This pass performs loop distribution: for example, the loop
24 |DO I = 2, N
25 | A(I) = B(I) + C
26 | D(I) = A(I-1)*E
27 |ENDDO
29 is transformed to
31 |DOALL I = 2, N
32 | A(I) = B(I) + C
33 |ENDDO
35 |DOALL I = 2, N
36 | D(I) = A(I-1)*E
37 |ENDDO
39 This pass uses an RDG, Reduced Dependence Graph built on top of the
40 data dependence relations. The RDG is then topologically sorted to
41 obtain a map of information producers/consumers based on which it
42 generates the new loops. */
44 #include "config.h"
45 #include "system.h"
46 #include "coretypes.h"
47 #include "tm.h"
48 #include "ggc.h"
49 #include "tree.h"
50 #include "target.h"
52 #include "rtl.h"
53 #include "basic-block.h"
54 #include "diagnostic.h"
55 #include "tree-flow.h"
56 #include "tree-dump.h"
57 #include "timevar.h"
58 #include "cfgloop.h"
59 #include "expr.h"
60 #include "optabs.h"
61 #include "tree-chrec.h"
62 #include "tree-data-ref.h"
63 #include "tree-scalar-evolution.h"
64 #include "tree-pass.h"
65 #include "lambda.h"
66 #include "langhooks.h"
67 #include "tree-vectorizer.h"
/* Statement indices (RDG vertex numbers) still to be generated.  If
   bit I is not set, it means that this node represents an operation
   that has already been performed, and that should not be performed
   again.  This is the subgraph of remaining important computations
   that is passed to the DFS algorithm for avoiding to include several
   times the same stores in different loops.  */
static bitmap remaining_stmts;

/* A node of the RDG is marked in this bitmap when it has as a
   predecessor a node that writes to memory.  */
static bitmap upstream_mem_writes;
80 /* Update the PHI nodes of NEW_LOOP. NEW_LOOP is a duplicate of
81 ORIG_LOOP. */
83 static void
84 update_phis_for_loop_copy (struct loop *orig_loop, struct loop *new_loop)
86 tree new_ssa_name;
87 tree phi_new, phi_orig;
88 edge orig_loop_latch = loop_latch_edge (orig_loop);
89 edge orig_entry_e = loop_preheader_edge (orig_loop);
90 edge new_loop_entry_e = loop_preheader_edge (new_loop);
92 /* Scan the phis in the headers of the old and new loops
93 (they are organized in exactly the same order). */
95 for (phi_new = phi_nodes (new_loop->header),
96 phi_orig = phi_nodes (orig_loop->header);
97 phi_new && phi_orig;
98 phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig))
100 /* Add the first phi argument for the phi in NEW_LOOP (the one
101 associated with the entry of NEW_LOOP) */
102 tree def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_entry_e);
103 add_phi_arg (phi_new, def, new_loop_entry_e);
105 /* Add the second phi argument for the phi in NEW_LOOP (the one
106 associated with the latch of NEW_LOOP) */
107 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
109 if (TREE_CODE (def) == SSA_NAME)
111 new_ssa_name = get_current_def (def);
113 if (!new_ssa_name)
114 /* This only happens if there are no definitions inside the
115 loop. Use the phi_result in this case. */
116 new_ssa_name = PHI_RESULT (phi_new);
118 else
119 /* Could be an integer. */
120 new_ssa_name = def;
122 add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
126 /* Return a copy of LOOP placed before LOOP. */
128 static struct loop *
129 copy_loop_before (struct loop *loop)
131 struct loop *res;
132 edge preheader = loop_preheader_edge (loop);
134 if (!single_exit (loop))
135 return NULL;
137 initialize_original_copy_tables ();
138 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
139 free_original_copy_tables ();
141 if (!res)
142 return NULL;
144 update_phis_for_loop_copy (loop, res);
145 rename_variables_in_loop (res);
147 return res;
150 /* Creates an empty basic block after LOOP. */
152 static void
153 create_bb_after_loop (struct loop *loop)
155 edge exit = single_exit (loop);
157 if (!exit)
158 return;
160 split_edge (exit);
163 /* Generate code for PARTITION from the code in LOOP. The loop is
164 copied when COPY_P is true. All the statements not flagged in the
165 PARTITION bitmap are removed from the loop or from its copy. The
166 statements are indexed in sequence inside a basic block, and the
167 basic blocks of a loop are taken in dom order. Returns true when
168 the code gen succeeded. */
170 static bool
171 generate_loops_for_partition (struct loop *loop, bitmap partition, bool copy_p)
173 unsigned i, x;
174 block_stmt_iterator bsi;
175 basic_block *bbs;
177 if (copy_p)
179 loop = copy_loop_before (loop);
180 create_preheader (loop, CP_SIMPLE_PREHEADERS);
181 create_bb_after_loop (loop);
184 if (loop == NULL)
185 return false;
187 /* Remove stmts not in the PARTITION bitmap. The order in which we
188 visit the phi nodes and the statements is exactly as in
189 stmts_from_loop. */
190 bbs = get_loop_body_in_dom_order (loop);
192 for (x = 0, i = 0; i < loop->num_nodes; i++)
194 basic_block bb = bbs[i];
195 tree phi, prev = NULL_TREE, next;
197 for (phi = phi_nodes (bb); phi;)
198 if (!bitmap_bit_p (partition, x++))
200 next = PHI_CHAIN (phi);
201 remove_phi_node (phi, prev, true);
202 phi = next;
204 else
206 prev = phi;
207 phi = PHI_CHAIN (phi);
210 for (bsi = bsi_start (bb); !bsi_end_p (bsi);)
211 if (TREE_CODE (bsi_stmt (bsi)) != LABEL_EXPR
212 && !bitmap_bit_p (partition, x++))
213 bsi_remove (&bsi, false);
214 else
215 bsi_next (&bsi);
217 mark_virtual_ops_in_bb (bb);
220 free (bbs);
221 return true;
/* Generate a call to memset for setting to zero the memory written to
   by STMT, a store of zeros through reference OP0, executed NB_ITER
   times.  The call is inserted after BSI.  Return true when the
   operation succeeded.  */

static bool
generate_memset_zero (tree stmt, tree op0, tree nb_iter,
		      block_stmt_iterator bsi)
{
  tree s, t, stmts, nb_bytes, addr_base;
  bool res = false;
  tree stmt_list = NULL_TREE;
  tree args [3];
  tree fn_call, mem, fndecl, fntype, fn;
  tree_stmt_iterator i;
  ssa_op_iter iter;
  struct data_reference *dr = XCNEW (struct data_reference);

  /* Total number of bytes to clear: iteration count times the size of
     one stored element.  */
  nb_bytes = fold_build2 (MULT_EXPR, TREE_TYPE (nb_iter),
			  nb_iter, TYPE_SIZE_UNIT (TREE_TYPE (op0)));
  nb_bytes = force_gimple_operand (nb_bytes, &stmts, true, NULL);
  append_to_statement_list_force (stmts, &stmt_list);

  /* Analyze the memory reference to recover base, offset and step.  */
  DR_STMT (dr) = stmt;
  DR_REF (dr) = op0;
  dr_analyze_innermost (dr);

  /* Test for a positive stride, iterating over every element: the
     start of the cleared region is base + offset + init.  */
  if (integer_zerop (fold_build2 (MINUS_EXPR, integer_type_node, DR_STEP (dr),
				  TYPE_SIZE_UNIT (TREE_TYPE (op0)))))
    addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_BASE_ADDRESS (dr)),
			     DR_BASE_ADDRESS (dr),
			     size_binop (PLUS_EXPR,
					 DR_OFFSET (dr), DR_INIT (dr)));

  /* Test for a negative stride, iterating over every element: the
     loop walks downwards, so the lowest written address is the final
     one, NB_BYTES below the first.  */
  else if (integer_zerop (fold_build2 (PLUS_EXPR, integer_type_node,
				       TYPE_SIZE_UNIT (TREE_TYPE (op0)),
				       DR_STEP (dr))))
    {
      addr_base = size_binop (PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
      addr_base = fold_build2 (MINUS_EXPR, sizetype, addr_base, nb_bytes);
      addr_base = force_gimple_operand (addr_base, &stmts, true, NULL);
      append_to_statement_list_force (stmts, &stmt_list);

      addr_base = fold_build2 (POINTER_PLUS_EXPR,
			       TREE_TYPE (DR_BASE_ADDRESS (dr)),
			       DR_BASE_ADDRESS (dr), addr_base);
    }
  else
    /* Any other stride (not covering every element) cannot be turned
       into a single memset.  */
    goto end;

  mem = force_gimple_operand (addr_base, &stmts, true, NULL);
  append_to_statement_list_force (stmts, &stmt_list);

  /* Build the call: memset (mem, 0, nb_bytes).  */
  fndecl = implicit_built_in_decls [BUILT_IN_MEMSET];
  fntype = TREE_TYPE (fndecl);
  fn = build1 (ADDR_EXPR, build_pointer_type (fntype), fndecl);

  args[0] = mem;
  args[1] = integer_zero_node;
  args[2] = nb_bytes;

  fn_call = build_call_array (fntype, fn, 3, args);
  append_to_statement_list_force (fn_call, &stmt_list);

  /* Update the generated statements and queue their virtual operands
     for SSA renaming.  */
  for (i = tsi_start (stmt_list); !tsi_end_p (i); tsi_next (&i))
    {
      s = tsi_stmt (i);
      update_stmt_if_modified (s);

      FOR_EACH_SSA_TREE_OPERAND (t, s, iter, SSA_OP_VIRTUAL_DEFS)
	{
	  if (TREE_CODE (t) == SSA_NAME)
	    t = SSA_NAME_VAR (t);
	  mark_sym_for_renaming (t);
	}
    }

  /* Mark also the uses of the VDEFS of STMT to be renamed.  */
  FOR_EACH_SSA_TREE_OPERAND (t, stmt, iter, SSA_OP_VIRTUAL_DEFS)
    {
      if (TREE_CODE (t) == SSA_NAME)
	{
	  imm_use_iterator imm_iter;

	  FOR_EACH_IMM_USE_STMT (s, imm_iter, t)
	    update_stmt (s);

	  t = SSA_NAME_VAR (t);
	}
      mark_sym_for_renaming (t);
    }

  bsi_insert_after (&bsi, stmt_list, BSI_CONTINUE_LINKING);
  res = true;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "generated memset zero\n");

 end:
  free_data_ref (dr);
  return res;
}
/* Tries to generate a builtin function for the instructions of LOOP
   pointed to by the bits set in PARTITION.  Returns true when the
   operation succeeded.  */

static bool
generate_builtin (struct loop *loop, bitmap partition, bool copy_p)
{
  bool res = false;
  unsigned i, x = 0;
  basic_block *bbs;
  tree write = NULL_TREE;
  tree op0, op1;
  block_stmt_iterator bsi;
  tree nb_iter = number_of_exit_cond_executions (loop);

  /* The number of executions of the store must be known.  */
  if (!nb_iter || nb_iter == chrec_dont_know)
    return false;

  bbs = get_loop_body_in_dom_order (loop);

  /* Walk the loop body in the same order as the partition indexing
     and find the single memory write flagged in PARTITION.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      tree phi;

      /* PHIs occupy indices but are never the write we look for.  */
      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
	x++;

      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
	{
	  /* NOTE(review): X is advanced for every statement here,
	     while generate_loops_for_partition does not index
	     LABEL_EXPRs — presumably harmless with <= 2 basic blocks,
	     but verify the indexing matches stmts_from_loop.  */
	  tree stmt = bsi_stmt (bsi);

	  if (bitmap_bit_p (partition, x++)
	      && TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
	      && !is_gimple_reg (GIMPLE_STMT_OPERAND (stmt, 0)))
	    {
	      /* Don't generate the builtins when there are more than
		 one memory write.  */
	      if (write != NULL)
		goto end;

	      write = stmt;
	    }
	}
    }

  if (!write)
    goto end;

  op0 = GIMPLE_STMT_OPERAND (write, 0);
  op1 = GIMPLE_STMT_OPERAND (write, 1);

  /* Only array or pointer stores can become a memset.  */
  if (!(TREE_CODE (op0) == ARRAY_REF
	|| TREE_CODE (op0) == INDIRECT_REF))
    goto end;

  /* The new statements will be placed before LOOP.  */
  bsi = bsi_last (loop_preheader_edge (loop)->src);

  /* Only stores of zero are recognized for now.  */
  if (integer_zerop (op1) || real_zerop (op1))
    res = generate_memset_zero (write, op0, nb_iter, bsi);

  /* If this is the last partition for which we generate code, we have
     to destroy the loop.  */
  if (res && !copy_p)
    {
      unsigned nbbs = loop->num_nodes;
      basic_block src = loop_preheader_edge (loop)->src;
      basic_block dest = single_exit (loop)->dest;
      make_edge (src, dest, EDGE_FALLTHRU);
      set_immediate_dominator (CDI_DOMINATORS, dest, src);
      cancel_loop_tree (loop);

      for (i = 0; i < nbbs; i++)
	delete_basic_block (bbs[i]);
    }

 end:
  free (bbs);
  return res;
}
409 /* Generates code for PARTITION. For simple loops, this function can
410 generate a built-in. */
412 static bool
413 generate_code_for_partition (struct loop *loop, bitmap partition, bool copy_p)
415 if (generate_builtin (loop, partition, copy_p))
416 return true;
418 return generate_loops_for_partition (loop, partition, copy_p);
422 /* Returns true if the node V of RDG cannot be recomputed. */
424 static bool
425 rdg_cannot_recompute_vertex_p (struct graph *rdg, int v)
427 if (RDG_MEM_WRITE_STMT (rdg, v))
428 return true;
430 return false;
433 /* Returns true when the vertex V has already been generated in the
434 current partition (V is in PROCESSED), or when V belongs to another
435 partition and cannot be recomputed (V is not in REMAINING_STMTS). */
437 static inline bool
438 already_processed_vertex_p (bitmap processed, int v)
440 return (bitmap_bit_p (processed, v)
441 || !bitmap_bit_p (remaining_stmts, v));
444 /* Returns NULL when there is no anti-dependence among the successors
445 of vertex V, otherwise returns the edge with the anti-dep. */
447 static struct graph_edge *
448 has_anti_dependence (struct vertex *v)
450 struct graph_edge *e;
452 if (v->succ)
453 for (e = v->succ; e; e = e->succ_next)
454 if (RDGE_TYPE (e) == anti_dd)
455 return e;
457 return NULL;
/* Returns true when V has a predecessor in RDG that is an upstream
   memory write, ignoring flow channels.  (The original comment here
   described has_anti_dependence; corrected.)  */

static bool
predecessor_has_mem_write (struct graph *rdg, struct vertex *v)
{
  struct graph_edge *e;

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      if (bitmap_bit_p (upstream_mem_writes, e->src)
	  /* Don't consider flow channels: a write to memory followed
	     by a read from memory.  These channels allow the split of
	     the RDG in different partitions.  */
	  && !RDG_MEM_WRITE_STMT (rdg, e->src))
	return true;

  return false;
}
479 /* Initializes the upstream_mem_writes bitmap following the
480 information from RDG. */
482 static void
483 mark_nodes_having_upstream_mem_writes (struct graph *rdg)
485 int v, x;
486 bitmap seen = BITMAP_ALLOC (NULL);
488 for (v = rdg->n_vertices - 1; v >= 0; v--)
489 if (!bitmap_bit_p (seen, v))
491 unsigned i;
492 VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
493 bool has_upstream_mem_write_p = false;
495 graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
497 for (i = 0; VEC_iterate (int, nodes, i, x); i++)
499 if (bitmap_bit_p (seen, x))
500 continue;
502 bitmap_set_bit (seen, x);
504 if (RDG_MEM_WRITE_STMT (rdg, x)
505 || predecessor_has_mem_write (rdg, &(rdg->vertices[x]))
506 /* In anti dependences the read should occur before
507 the write, this is why both the read and the write
508 should be placed in the same partition. */
509 || has_anti_dependence (&(rdg->vertices[x])))
511 has_upstream_mem_write_p = true;
512 bitmap_set_bit (upstream_mem_writes, x);
516 VEC_free (int, heap, nodes);
520 /* Returns true when vertex u has a memory write node as a predecessor
521 in RDG. */
523 static bool
524 has_upstream_mem_writes (int u)
526 return bitmap_bit_p (upstream_mem_writes, u);
529 static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap,
530 bitmap, bool *);
532 /* Flag all the uses of U. */
534 static void
535 rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
536 bitmap processed, bool *part_has_writes)
538 struct graph_edge *e;
540 for (e = rdg->vertices[u].succ; e; e = e->succ_next)
541 if (!bitmap_bit_p (processed, e->dest))
543 rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops,
544 processed, part_has_writes);
545 rdg_flag_all_uses (rdg, e->dest, partition, loops, processed,
546 part_has_writes);
/* Flag the uses of U stopping following the information from
   upstream_mem_writes.  */

static void
rdg_flag_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
	       bitmap processed, bool *part_has_writes)
{
  ssa_op_iter iter;
  use_operand_p use_p;
  struct vertex *x = &(rdg->vertices[u]);
  tree stmt = RDGV_STMT (x);
  struct graph_edge *anti_dep = has_anti_dependence (x);

  /* Keep in the same partition the destination of an antidependence,
     because this is a store to the exact same location.  Putting this
     in another partition is bad for cache locality.  */
  if (anti_dep)
    {
      int v = anti_dep->dest;

      if (!already_processed_vertex_p (processed, v))
	rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
				       processed, part_has_writes);
    }

  /* Pull in the definitions of the virtual uses of STMT (PHIs carry
     no virtual operands here).  */
  if (TREE_CODE (stmt) != PHI_NODE)
    {
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VIRTUAL_USES)
	{
	  tree use = USE_FROM_PTR (use_p);

	  if (TREE_CODE (use) == SSA_NAME)
	    {
	      tree def_stmt = SSA_NAME_DEF_STMT (use);
	      int v = rdg_vertex_for_stmt (rdg, def_stmt);

	      /* V is negative when DEF_STMT is not in the RDG.  */
	      if (v >= 0
		  && !already_processed_vertex_p (processed, v))
		rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
					       processed, part_has_writes);
	    }
	}
    }

  /* When U defines a scalar below a memory write, keep its consumers
     in the same partition.  */
  if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
      && has_upstream_mem_writes (u))
    {
      tree op0 = GIMPLE_STMT_OPERAND (stmt, 0);

      /* Scalar channels don't have enough space for transmitting data
	 between tasks, unless we add more storage by privatizing.  */
      if (is_gimple_reg (op0))
	{
	  use_operand_p use_p;
	  imm_use_iterator iter;

	  FOR_EACH_IMM_USE_FAST (use_p, iter, op0)
	    {
	      int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p));

	      if (!already_processed_vertex_p (processed, v))
		rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
					       processed, part_has_writes);
	    }
	}
    }
}
618 /* Flag V from RDG as part of PARTITION, and also flag its loop number
619 in LOOPS. */
621 static void
622 rdg_flag_vertex (struct graph *rdg, int v, bitmap partition, bitmap loops,
623 bool *part_has_writes)
625 struct loop *loop;
627 if (bitmap_bit_p (partition, v))
628 return;
630 loop = loop_containing_stmt (RDG_STMT (rdg, v));
631 bitmap_set_bit (loops, loop->num);
632 bitmap_set_bit (partition, v);
634 if (rdg_cannot_recompute_vertex_p (rdg, v))
636 *part_has_writes = true;
637 bitmap_clear_bit (remaining_stmts, v);
/* Flag in the bitmap PARTITION the vertex V and all its predecessors.
   Also flag their loop number in LOOPS.  */

static void
rdg_flag_vertex_and_dependent (struct graph *rdg, int v, bitmap partition,
			       bitmap loops, bitmap processed,
			       bool *part_has_writes)
{
  unsigned i;
  VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
  int x;

  /* Mark V as processed first so the recursion below terminates.  */
  bitmap_set_bit (processed, v);
  rdg_flag_uses (rdg, v, partition, loops, processed, part_has_writes);
  /* The DFS is restricted to REMAINING_STMTS: statements already
     claimed by another partition are not pulled in again.  */
  graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts);
  rdg_flag_vertex (rdg, v, partition, loops, part_has_writes);

  for (i = 0; VEC_iterate (int, nodes, i, x); i++)
    if (!already_processed_vertex_p (processed, x))
      rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed,
				     part_has_writes);

  VEC_free (int, heap, nodes);
}
666 /* Initialize CONDS with all the condition statements from the basic
667 blocks of LOOP. */
669 static void
670 collect_condition_stmts (struct loop *loop, VEC (tree, heap) **conds)
672 unsigned i;
673 edge e;
674 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
676 for (i = 0; VEC_iterate (edge, exits, i, e); i++)
678 tree cond = last_stmt (e->src);
680 if (cond)
681 VEC_safe_push (tree, heap, *conds, cond);
684 VEC_free (edge, heap, exits);
687 /* Add to PARTITION all the exit condition statements for LOOPS
688 together with all their dependent statements determined from
689 RDG. */
691 static void
692 rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition,
693 bitmap processed, bool *part_has_writes)
695 unsigned i;
696 bitmap_iterator bi;
697 VEC (tree, heap) *conds = VEC_alloc (tree, heap, 3);
699 EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi)
700 collect_condition_stmts (get_loop (i), &conds);
702 while (!VEC_empty (tree, conds))
704 tree cond = VEC_pop (tree, conds);
705 int v = rdg_vertex_for_stmt (rdg, cond);
706 bitmap new_loops = BITMAP_ALLOC (NULL);
708 if (!already_processed_vertex_p (processed, v))
709 rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed,
710 part_has_writes);
712 EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi)
713 if (!bitmap_bit_p (loops, i))
715 bitmap_set_bit (loops, i);
716 collect_condition_stmts (get_loop (i), &conds);
719 BITMAP_FREE (new_loops);
/* Strongly connected components of the reduced data dependence graph.  */

typedef struct rdg_component
{
  int num;			/* Component number from graphds_scc.  */
  VEC (int, heap) *vertices;	/* RDG vertex indices in this component.  */
} *rdgc;

DEF_VEC_P (rdgc);
DEF_VEC_ALLOC_P (rdgc, heap);
/* Flag all the nodes of RDG containing memory accesses that could
   potentially belong to arrays already accessed in the current
   PARTITION.  */

static void
rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition,
				  bitmap loops, bitmap processed,
				  VEC (int, heap) **other_stores)
{
  /* FOO receives the part_has_writes out-flag of the callees; its
     value is deliberately ignored here.  */
  bool foo;
  unsigned i, n;
  int j, k, kk;
  bitmap_iterator ii;
  struct graph_edge *e;

  EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
    if (RDG_MEM_WRITE_STMT (rdg, i)
	|| RDG_MEM_READS_STMT (rdg, i))
      {
	for (j = 0; j < rdg->n_vertices; j++)
	  if (!bitmap_bit_p (processed, j)
	      && (RDG_MEM_WRITE_STMT (rdg, j)
		  || RDG_MEM_READS_STMT (rdg, j))
	      && rdg_has_similar_memory_accesses (rdg, i, j))
	    {
	      /* Flag first the node J itself, and all the nodes that
		 are needed to compute J.  */
	      rdg_flag_vertex_and_dependent (rdg, j, partition, loops,
					     processed, &foo);

	      /* When J is a read, we want to coalesce in the same
		 PARTITION all the nodes that are using J: this is
		 needed for better cache locality.  */
	      rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo);

	      /* Remove from OTHER_STORES the vertex that we flagged.  */
	      if (RDG_MEM_WRITE_STMT (rdg, j))
		for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++)
		  if (kk == j)
		    {
		      VEC_unordered_remove (int, *other_stores, k);
		      break;
		    }
	    }

	/* If the node I has two uses, then keep these together in the
	   same PARTITION.  */
	for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++);

	if (n > 1)
	  rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo);
      }
}
788 /* Returns a bitmap in which all the statements needed for computing
789 the strongly connected component C of the RDG are flagged, also
790 including the loop exit conditions. */
792 static bitmap
793 build_rdg_partition_for_component (struct graph *rdg, rdgc c,
794 bool *part_has_writes,
795 VEC (int, heap) **other_stores)
797 int i, v;
798 bitmap partition = BITMAP_ALLOC (NULL);
799 bitmap loops = BITMAP_ALLOC (NULL);
800 bitmap processed = BITMAP_ALLOC (NULL);
802 for (i = 0; VEC_iterate (int, c->vertices, i, v); i++)
803 if (!already_processed_vertex_p (processed, v))
804 rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed,
805 part_has_writes);
807 /* Also iterate on the array of stores not in the starting vertices,
808 and determine those vertices that have some memory affinity with
809 the current nodes in the component: these are stores to the same
810 arrays, i.e. we're taking care of cache locality. */
811 rdg_flag_similar_memory_accesses (rdg, partition, loops, processed,
812 other_stores);
814 rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes);
816 BITMAP_FREE (processed);
817 BITMAP_FREE (loops);
818 return partition;
821 /* Free memory for COMPONENTS. */
823 static void
824 free_rdg_components (VEC (rdgc, heap) *components)
826 int i;
827 rdgc x;
829 for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
831 VEC_free (int, heap, x->vertices);
832 free (x);
/* Build the COMPONENTS vector with the strongly connected components
   of RDG in which the STARTING_VERTICES occur.  Ownership of the
   vertex vectors of the selected components is transferred to
   COMPONENTS; the others are freed here.  */

static void
rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices,
		      VEC (rdgc, heap) **components)
{
  int i, v;
  bitmap saved_components = BITMAP_ALLOC (NULL);
  int n_components = graphds_scc (rdg, NULL);
  VEC (int, heap) **all_components = XNEWVEC (VEC (int, heap) *, n_components);

  for (i = 0; i < n_components; i++)
    all_components[i] = VEC_alloc (int, heap, 3);

  /* Bucket every vertex into its component.  */
  for (i = 0; i < rdg->n_vertices; i++)
    VEC_safe_push (int, heap, all_components[rdg->vertices[i].component], i);

  /* Keep only the components containing a starting vertex, each one
     at most once (SAVED_COMPONENTS tracks duplicates).  */
  for (i = 0; VEC_iterate (int, starting_vertices, i, v); i++)
    {
      int c = rdg->vertices[v].component;

      if (!bitmap_bit_p (saved_components, c))
	{
	  rdgc x = XCNEW (struct rdg_component);
	  x->num = c;
	  x->vertices = all_components[c];

	  VEC_safe_push (rdgc, heap, *components, x);
	  bitmap_set_bit (saved_components, c);
	}
    }

  /* Free the vertex vectors that were not handed off above.  */
  for (i = 0; i < n_components; i++)
    if (!bitmap_bit_p (saved_components, i))
      VEC_free (int, heap, all_components[i]);

  free (all_components);
  BITMAP_FREE (saved_components);
}
DEF_VEC_P (bitmap);
DEF_VEC_ALLOC_P (bitmap, heap);

/* Aggregate several components into a useful partition that is
   registered in the PARTITIONS vector.  Partitions will be
   distributed in different loops.  Each pushed bitmap is owned by
   PARTITIONS; PROCESSED accumulates the vertices already assigned.  */

static void
rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
		      VEC (int, heap) **other_stores,
		      VEC (bitmap, heap) **partitions, bitmap processed)
{
  int i;
  rdgc x;
  bitmap partition = BITMAP_ALLOC (NULL);

  for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
    {
      bitmap np;
      bool part_has_writes = false;
      /* The first vertex stands for the whole component: if it was
	 processed, the component was already absorbed.  */
      int v = VEC_index (int, x->vertices, 0);

      if (bitmap_bit_p (processed, v))
	continue;

      np = build_rdg_partition_for_component (rdg, x, &part_has_writes,
					      other_stores);
      bitmap_ior_into (partition, np);
      bitmap_ior_into (processed, np);
      BITMAP_FREE (np);

      /* A partition is only useful when it contains a store; keep
	 accumulating components into PARTITION otherwise.  */
      if (part_has_writes)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "ldist useful partition:\n");
	      dump_bitmap (dump_file, partition);
	    }

	  VEC_safe_push (bitmap, heap, *partitions, partition);
	  partition = BITMAP_ALLOC (NULL);
	}
    }

  /* Add the nodes from the RDG that were not marked as processed, and
     that are used outside the current loop.  These are scalar
     computations that are not yet part of previous partitions.  */
  for (i = 0; i < rdg->n_vertices; i++)
    if (!bitmap_bit_p (processed, i)
	&& rdg_defs_used_in_other_loops_p (rdg, i))
      VEC_safe_push (int, heap, *other_stores, i);

  /* If there are still statements left in the OTHER_STORES array,
     create other components and partitions with these stores and
     their dependences.  */
  if (VEC_length (int, *other_stores) > 0)
    {
      VEC (rdgc, heap) *comps = VEC_alloc (rdgc, heap, 3);
      VEC (int, heap) *foo = VEC_alloc (int, heap, 3);

      rdg_build_components (rdg, *other_stores, &comps);
      rdg_build_partitions (rdg, comps, &foo, partitions, processed);

      VEC_free (int, heap, foo);
      free_rdg_components (comps);
    }

  /* If there is something left in the last partition, save it.  */
  if (bitmap_count_bits (partition) > 0)
    VEC_safe_push (bitmap, heap, *partitions, partition);
  else
    BITMAP_FREE (partition);
}
951 /* Dump to FILE the PARTITIONS. */
953 static void
954 dump_rdg_partitions (FILE *file, VEC (bitmap, heap) *partitions)
956 int i;
957 bitmap partition;
959 for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
960 debug_bitmap_file (file, partition);
963 /* Debug PARTITIONS. */
964 extern void debug_rdg_partitions (VEC (bitmap, heap) *);
966 void
967 debug_rdg_partitions (VEC (bitmap, heap) *partitions)
969 dump_rdg_partitions (stderr, partitions);
/* Generate code from STARTING_VERTICES in RDG.  Returns the number of
   distributed loops.  */

static int
ldist_gen (struct loop *loop, struct graph *rdg,
	   VEC (int, heap) *starting_vertices)
{
  int i, nbp;
  VEC (rdgc, heap) *components = VEC_alloc (rdgc, heap, 3);
  VEC (bitmap, heap) *partitions = VEC_alloc (bitmap, heap, 3);
  VEC (int, heap) *other_stores = VEC_alloc (int, heap, 3);
  bitmap partition, processed = BITMAP_ALLOC (NULL);

  /* Set up the pass-global bitmaps used by the flagging helpers.  */
  remaining_stmts = BITMAP_ALLOC (NULL);
  upstream_mem_writes = BITMAP_ALLOC (NULL);

  for (i = 0; i < rdg->n_vertices; i++)
    {
      bitmap_set_bit (remaining_stmts, i);

      /* Save in OTHER_STORES all the memory writes that are not in
	 STARTING_VERTICES.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	{
	  int v;
	  unsigned j;
	  bool found = false;

	  for (j = 0; VEC_iterate (int, starting_vertices, j, v); j++)
	    if (i == v)
	      {
		found = true;
		break;
	      }

	  if (!found)
	    VEC_safe_push (int, heap, other_stores, i);
	}
    }

  mark_nodes_having_upstream_mem_writes (rdg);
  rdg_build_components (rdg, starting_vertices, &components);
  rdg_build_partitions (rdg, components, &other_stores, &partitions,
			processed);
  BITMAP_FREE (processed);
  nbp = VEC_length (bitmap, partitions);

  /* A single partition means no distribution is possible.  */
  if (nbp <= 1)
    goto ldist_done;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg_partitions (dump_file, partitions);

  /* Generate code for every partition; all but the last work on a
     copy of LOOP.  */
  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    if (!generate_code_for_partition (loop, partition, i < nbp - 1))
      goto ldist_done;

  rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
  update_ssa (TODO_update_ssa_only_virtuals | TODO_update_ssa);

 ldist_done:

  BITMAP_FREE (remaining_stmts);
  BITMAP_FREE (upstream_mem_writes);

  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    BITMAP_FREE (partition);

  VEC_free (int, heap, other_stores);
  VEC_free (bitmap, heap, partitions);
  free_rdg_components (components);
  return nbp;
}
1046 /* Distributes the code from LOOP in such a way that producer
1047 statements are placed before consumer statements. When STMTS is
1048 NULL, performs the maximal distribution, if STMTS is not NULL,
1049 tries to separate only these statements from the LOOP's body.
1050 Returns the number of distributed loops. */
1052 static int
1053 distribute_loop (struct loop *loop, VEC (tree, heap) *stmts)
1055 bool res = false;
1056 struct graph *rdg;
1057 tree s;
1058 unsigned i;
1059 VEC (int, heap) *vertices;
1061 if (loop->num_nodes > 2)
1063 if (dump_file && (dump_flags & TDF_DETAILS))
1064 fprintf (dump_file,
1065 "FIXME: Loop %d not distributed: it has more than two basic blocks.\n",
1066 loop->num);
1068 return res;
1071 rdg = build_rdg (loop);
1073 if (!rdg)
1075 if (dump_file && (dump_flags & TDF_DETAILS))
1076 fprintf (dump_file,
1077 "FIXME: Loop %d not distributed: failed to build the RDG.\n",
1078 loop->num);
1080 return res;
1083 vertices = VEC_alloc (int, heap, 3);
1085 if (dump_file && (dump_flags & TDF_DETAILS))
1086 dump_rdg (dump_file, rdg);
1088 for (i = 0; VEC_iterate (tree, stmts, i, s); i++)
1090 int v = rdg_vertex_for_stmt (rdg, s);
1092 if (v >= 0)
1094 VEC_safe_push (int, heap, vertices, v);
1096 if (dump_file && (dump_flags & TDF_DETAILS))
1097 fprintf (dump_file,
1098 "ldist asked to generate code for vertex %d\n", v);
1102 res = ldist_gen (loop, rdg, vertices);
1103 VEC_free (int, heap, vertices);
1104 free_rdg (rdg);
1106 return res;
1109 /* Distribute all loops in the current function. */
1111 static unsigned int
1112 tree_loop_distribution (void)
1114 struct loop *loop;
1115 loop_iterator li;
1116 int nb_generated_loops = 0;
1118 FOR_EACH_LOOP (li, loop, 0)
1120 VEC (tree, heap) *work_list = VEC_alloc (tree, heap, 3);
1122 /* With the following working list, we're asking distribute_loop
1123 to separate the stores of the loop: when dependences allow,
1124 it will end on having one store per loop. */
1125 stores_from_loop (loop, &work_list);
1127 /* A simple heuristic for cache locality is to not split stores
1128 to the same array. Without this call, an unrolled loop would
1129 be split into as many loops as unroll factor, each loop
1130 storing in the same array. */
1131 remove_similar_memory_refs (&work_list);
1133 nb_generated_loops = distribute_loop (loop, work_list);
1135 if (dump_file && (dump_flags & TDF_DETAILS))
1137 if (nb_generated_loops > 1)
1138 fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
1139 loop->num, nb_generated_loops);
1140 else
1141 fprintf (dump_file, "Loop %d is the same.\n", loop->num);
1144 verify_loop_structure ();
1146 VEC_free (tree, heap, work_list);
1149 return 0;
1152 static bool
1153 gate_tree_loop_distribution (void)
1155 return flag_tree_loop_distribution != 0;
1158 struct gimple_opt_pass pass_loop_distribution =
1161 GIMPLE_PASS,
1162 "ldist", /* name */
1163 gate_tree_loop_distribution, /* gate */
1164 tree_loop_distribution, /* execute */
1165 NULL, /* sub */
1166 NULL, /* next */
1167 0, /* static_pass_number */
1168 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
1169 PROP_cfg | PROP_ssa, /* properties_required */
1170 0, /* properties_provided */
1171 0, /* properties_destroyed */
1172 0, /* todo_flags_start */
1173 TODO_dump_func | TODO_verify_loops /* todo_flags_finish */