2008-01-26 Jerry DeLisle <jvdelisle@gcc.gnu.org>
[official-gcc.git] / gcc / tree-parloops.c
blobea75ed9c02978b53cd4f4d280bcdb9ed0e835aae
1 /* Loop autoparallelization.
2 Copyright (C) 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "rtl.h"
29 #include "tree-flow.h"
30 #include "cfgloop.h"
31 #include "ggc.h"
32 #include "tree-data-ref.h"
33 #include "diagnostic.h"
34 #include "tree-pass.h"
35 #include "tree-scalar-evolution.h"
36 #include "hashtab.h"
37 #include "langhooks.h"
38 #include "tree-vectorizer.h"
40 /* This pass tries to distribute iterations of loops into several threads.
41 The implementation is straightforward -- for each loop we test whether its
42 iterations are independent, and if it is the case (and some additional
43 conditions regarding profitability and correctness are satisfied), we
44 add OMP_PARALLEL and OMP_FOR codes and let omp expansion machinery do
45 its job.
47 The most of the complexity is in bringing the code into shape expected
48 by the omp expanders:
49 -- for OMP_FOR, ensuring that the loop has only one induction variable
50 and that the exit test is at the start of the loop body
51 -- for OMP_PARALLEL, replacing the references to local addressable
52 variables by accesses through pointers, and breaking up ssa chains
53 by storing the values incoming to the parallelized loop to a structure
54 passed to the new function as an argument (something similar is done
55 in omp gimplification, unfortunately only a small part of the code
56 can be shared).
58 TODO:
59 -- if there are several parallelizable loops in a function, it may be
60 possible to generate the threads just once (using synchronization to
61 ensure that cross-loop dependences are obeyed).
62 -- handling of common scalar dependence patterns (accumulation, ...)
63 -- handling of non-innermost loops */
65 /*
66 Reduction handling:
67 currently we use vect_is_simple_reduction() to detect reduction patterns.
68 The code transformation will be introduced by an example.
71 parloop
73 int sum=1;
75 for (i = 0; i < N; i++)
77 x[i] = i + 3;
78 sum+=x[i];
82 gimple-like code:
83 header_bb:
85 # sum_29 = PHI <sum_11(5), 1(3)>
86 # i_28 = PHI <i_12(5), 0(3)>
87 D.1795_8 = i_28 + 3;
88 x[i_28] = D.1795_8;
89 sum_11 = D.1795_8 + sum_29;
90 i_12 = i_28 + 1;
91 if (N_6(D) > i_12)
92 goto header_bb;
95 exit_bb:
97 # sum_21 = PHI <sum_11(4)>
98 printf (&"%d"[0], sum_21);
101 after reduction transformation (only relevant parts):
103 parloop
106 ....
109 # Storing the the initial value given by the user. #
111 .paral_data_store.32.sum.27 = 1;
113 #pragma omp parallel num_threads(4)
115 #pragma omp for schedule(static)
117 # The neutral element corresponding to the particular
118 reduction's operation, e.g. 0 for PLUS_EXPR,
119 1 for MULT_EXPR, etc. replaces the user's initial value. #
121 # sum.27_29 = PHI <sum.27_11, 0>
123 sum.27_11 = D.1827_8 + sum.27_29;
125 OMP_CONTINUE
127 # Adding this reduction phi is done at create_phi_for_local_result() #
128 # sum.27_56 = PHI <sum.27_11, 0>
129 OMP_RETURN
131 # Creating the atomic operation is done at
132 create_call_for_reduction_1() #
134 #pragma omp atomic_load
135 D.1839_59 = *&.paral_data_load.33_51->reduction.23;
136 D.1840_60 = sum.27_56 + D.1839_59;
137 #pragma omp atomic_store (D.1840_60);
139 OMP_RETURN
141 # collecting the result after the join of the threads is done at
142 create_loads_for_reductions().
143 The value computed by the threads is loaded from the
144 shared struct. #
147 .paral_data_load.33_52 = &.paral_data_store.32;
148 sum_37 = .paral_data_load.33_52->sum.27;
149 sum_43 = D.1795_41 + sum_37;
151 exit bb:
152 # sum_21 = PHI <sum_43, sum_26>
153 printf (&"%d"[0], sum_21);
161 /* Minimal number of iterations of a loop that should be executed in each
162 thread. */
163 #define MIN_PER_THREAD 100
165 /* Element of the hashtable, representing a
166 reduction in the current loop. */
167 struct reduction_info
169 tree reduc_stmt; /* reduction statement. */
170 tree reduc_phi; /* The phi node defining the reduction. */
171 enum tree_code reduction_code; /* code for the reduction operation. */
172 tree keep_res; /* The PHI_RESULT of this phi is the resulting value
173 of the reduction variable when existing the loop. */
174 tree initial_value; /* The initial value of the reduction var before entering the loop. */
175 tree field; /* the name of the field in the parloop data structure intended for reduction. */
176 tree init; /* reduction initialization value. */
177 tree new_phi; /* (helper field) Newly created phi node whose result
178 will be passed to the atomic operation. Represents
179 the local result each thread computed for the reduction
180 operation. */
183 /* Equality and hash functions for hashtab code. */
185 static int
186 reduction_info_eq (const void *aa, const void *bb)
188 const struct reduction_info *a = (const struct reduction_info *) aa;
189 const struct reduction_info *b = (const struct reduction_info *) bb;
191 return (a->reduc_phi == b->reduc_phi);
194 static hashval_t
195 reduction_info_hash (const void *aa)
197 const struct reduction_info *a = (const struct reduction_info *) aa;
199 return htab_hash_pointer (a->reduc_phi);
202 static struct reduction_info *
203 reduction_phi (htab_t reduction_list, tree phi)
205 struct reduction_info tmpred, *red;
207 if (htab_elements (reduction_list) == 0)
208 return NULL;
210 tmpred.reduc_phi = phi;
211 red = htab_find (reduction_list, &tmpred);
213 return red;
216 /* Element of hashtable of names to copy. */
218 struct name_to_copy_elt
220 unsigned version; /* The version of the name to copy. */
221 tree new_name; /* The new name used in the copy. */
222 tree field; /* The field of the structure used to pass the
223 value. */
226 /* Equality and hash functions for hashtab code. */
228 static int
229 name_to_copy_elt_eq (const void *aa, const void *bb)
231 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
232 const struct name_to_copy_elt *b = (const struct name_to_copy_elt *) bb;
234 return a->version == b->version;
237 static hashval_t
238 name_to_copy_elt_hash (const void *aa)
240 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
242 return (hashval_t) a->version;
245 /* Returns true if the iterations of LOOP are independent on each other (that
246 is, if we can execute them in parallel), and if LOOP satisfies other
247 conditions that we need to be able to parallelize it. Description of number
248 of iterations is stored to NITER. Reduction analysis is done, if
249 reductions are found, they are inserted to the REDUCTION_LIST. */
251 static bool
252 loop_parallel_p (struct loop *loop, htab_t reduction_list, struct tree_niter_desc *niter)
254 edge exit = single_dom_exit (loop);
255 VEC (ddr_p, heap) * dependence_relations;
256 VEC (data_reference_p, heap) * datarefs;
257 lambda_trans_matrix trans;
258 bool ret = false;
259 tree phi;
260 loop_vec_info simple_loop_info;
262 /* Only consider innermost loops with just one exit. The innermost-loop
263 restriction is not necessary, but it makes things simpler. */
264 if (loop->inner || !exit)
265 return false;
267 if (dump_file && (dump_flags & TDF_DETAILS))
268 fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
270 /* We need to know # of iterations, and there should be no uses of values
271 defined inside loop outside of it, unless the values are invariants of
272 the loop. */
273 if (!number_of_iterations_exit (loop, exit, niter, false))
275 if (dump_file && (dump_flags & TDF_DETAILS))
276 fprintf (dump_file, " FAILED: number of iterations not known\n");
277 return false;
280 simple_loop_info = vect_analyze_loop_form (loop);
282 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
284 tree reduc_stmt = NULL, operation;
286 /* ??? TODO: Change this into a generic function that
287 recognizes reductions. */
288 if (!is_gimple_reg (PHI_RESULT (phi)))
289 continue;
290 if (simple_loop_info)
291 reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi);
293 /* Create a reduction_info struct, initialize it and insert it to
294 the reduction list. */
296 if (reduc_stmt)
298 PTR *slot;
299 struct reduction_info *new_reduction;
301 if (dump_file && (dump_flags & TDF_DETAILS))
303 fprintf (dump_file,
304 "Detected reduction. reduction stmt is: \n");
305 print_generic_stmt (dump_file, reduc_stmt, 0);
306 fprintf (dump_file, "\n");
309 new_reduction = XCNEW (struct reduction_info);
311 new_reduction->reduc_stmt = reduc_stmt;
312 new_reduction->reduc_phi = phi;
313 operation = GIMPLE_STMT_OPERAND (reduc_stmt, 1);
314 new_reduction->reduction_code = TREE_CODE (operation);
315 slot = htab_find_slot (reduction_list, new_reduction, INSERT);
316 *slot = new_reduction;
320 for (phi = phi_nodes (exit->dest); phi; phi = PHI_CHAIN (phi))
322 struct reduction_info *red;
323 imm_use_iterator imm_iter;
324 use_operand_p use_p;
325 tree reduc_phi;
327 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
329 if (is_gimple_reg (val))
331 if (dump_file && (dump_flags & TDF_DETAILS))
333 fprintf (dump_file, "phi is ");
334 print_generic_expr (dump_file, phi, 0);
335 fprintf (dump_file, "arg of phi to exit: value ");
336 print_generic_expr (dump_file, val, 0);
337 fprintf (dump_file, " used outside loop\n");
338 fprintf (dump_file,
339 " checking if it a part of reduction pattern: \n");
341 if (htab_elements (reduction_list) == 0)
343 if (dump_file && (dump_flags & TDF_DETAILS))
344 fprintf (dump_file,
345 " FAILED: it is not a part of reduction.\n");
346 return false;
348 reduc_phi = NULL;
349 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
351 if (flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
353 reduc_phi = USE_STMT (use_p);
354 break;
357 red = reduction_phi (reduction_list, reduc_phi);
358 if (red == NULL)
360 if (dump_file && (dump_flags & TDF_DETAILS))
361 fprintf (dump_file,
362 " FAILED: it is not a part of reduction.\n");
363 return false;
365 if (dump_file && (dump_flags & TDF_DETAILS))
367 fprintf (dump_file, "reduction phi is ");
368 print_generic_expr (dump_file, red->reduc_phi, 0);
369 fprintf (dump_file, "reduction stmt is ");
370 print_generic_expr (dump_file, red->reduc_stmt, 0);
376 /* The iterations of the loop may communicate only through bivs whose
377 iteration space can be distributed efficiently. */
378 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
380 tree def = PHI_RESULT (phi);
381 affine_iv iv;
383 if (is_gimple_reg (def) && !simple_iv (loop, phi, def, &iv, true))
385 struct reduction_info *red;
387 red = reduction_phi (reduction_list, phi);
388 if (red == NULL)
390 if (dump_file && (dump_flags & TDF_DETAILS))
391 fprintf (dump_file,
392 " FAILED: scalar dependency between iterations\n");
393 return false;
398 /* We need to version the loop to verify assumptions in runtime. */
399 if (!can_duplicate_loop_p (loop))
401 if (dump_file && (dump_flags & TDF_DETAILS))
402 fprintf (dump_file, " FAILED: cannot be duplicated\n");
403 return false;
406 /* Check for problems with dependences. If the loop can be reversed,
407 the iterations are independent. */
408 datarefs = VEC_alloc (data_reference_p, heap, 10);
409 dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
410 compute_data_dependences_for_loop (loop, true, &datarefs,
411 &dependence_relations);
412 if (dump_file && (dump_flags & TDF_DETAILS))
413 dump_data_dependence_relations (dump_file, dependence_relations);
415 trans = lambda_trans_matrix_new (1, 1);
416 LTM_MATRIX (trans)[0][0] = -1;
418 if (lambda_transform_legal_p (trans, 1, dependence_relations))
420 ret = true;
421 if (dump_file && (dump_flags & TDF_DETAILS))
422 fprintf (dump_file, " SUCCESS: may be parallelized\n");
424 else if (dump_file && (dump_flags & TDF_DETAILS))
425 fprintf (dump_file,
426 " FAILED: data dependencies exist across iterations\n");
428 free_dependence_relations (dependence_relations);
429 free_data_refs (datarefs);
431 return ret;
434 /* Return true when LOOP contains basic blocks marked with the
435 BB_IRREDUCIBLE_LOOP flag. */
437 static inline bool
438 loop_has_blocks_with_irreducible_flag (struct loop *loop)
440 unsigned i;
441 basic_block *bbs = get_loop_body_in_dom_order (loop);
442 bool res = true;
444 for (i = 0; i < loop->num_nodes; i++)
445 if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
446 goto end;
448 res = false;
449 end:
450 free (bbs);
451 return res;
454 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
455 The assignment statement is placed before LOOP. DECL_ADDRESS maps decls
456 to their addresses that can be reused. The address of OBJ is known to
457 be invariant in the whole function. */
459 static tree
460 take_address_of (tree obj, tree type, struct loop *loop, htab_t decl_address)
462 int uid;
463 void **dslot;
464 struct int_tree_map ielt, *nielt;
465 tree *var_p, name, bvar, stmt, addr;
466 edge entry = loop_preheader_edge (loop);
468 /* Since the address of OBJ is invariant, the trees may be shared.
469 Avoid rewriting unrelated parts of the code. */
470 obj = unshare_expr (obj);
471 for (var_p = &obj;
472 handled_component_p (*var_p);
473 var_p = &TREE_OPERAND (*var_p, 0))
474 continue;
475 uid = DECL_UID (*var_p);
477 ielt.uid = uid;
478 dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
479 if (!*dslot)
481 addr = build_addr (*var_p, current_function_decl);
482 bvar = create_tmp_var (TREE_TYPE (addr), get_name (*var_p));
483 add_referenced_var (bvar);
484 stmt = build_gimple_modify_stmt (bvar, addr);
485 name = make_ssa_name (bvar, stmt);
486 GIMPLE_STMT_OPERAND (stmt, 0) = name;
487 bsi_insert_on_edge_immediate (entry, stmt);
489 nielt = XNEW (struct int_tree_map);
490 nielt->uid = uid;
491 nielt->to = name;
492 *dslot = nielt;
494 else
495 name = ((struct int_tree_map *) *dslot)->to;
497 if (var_p != &obj)
499 *var_p = build1 (INDIRECT_REF, TREE_TYPE (*var_p), name);
500 name = force_gimple_operand (build_addr (obj, current_function_decl),
501 &stmt, true, NULL_TREE);
502 if (stmt)
503 bsi_insert_on_edge_immediate (entry, stmt);
506 if (TREE_TYPE (name) != type)
508 name = force_gimple_operand (fold_convert (type, name), &stmt, true,
509 NULL_TREE);
510 if (stmt)
511 bsi_insert_on_edge_immediate (entry, stmt);
514 return name;
517 /* Callback for htab_traverse. Create the initialization statement
518 for reduction described in SLOT, and place it at the preheader of
519 the loop described in DATA. */
521 static int
522 initialize_reductions (void **slot, void *data)
524 tree init, c;
525 tree bvar, type, arg;
526 edge e;
528 struct reduction_info *reduc = *slot;
529 struct loop *loop = (struct loop *) data;
531 /* Create initialization in preheader:
532 reduction_variable = initialization value of reduction. */
534 /* In the phi node at the header, replace the argument coming
535 from the preheader with the reduction initialization value. */
537 /* Create a new variable to initialize the reduction. */
538 type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
539 bvar = create_tmp_var (type, "reduction");
540 add_referenced_var (bvar);
542 c = build_omp_clause (OMP_CLAUSE_REDUCTION);
543 OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
544 OMP_CLAUSE_DECL (c) =
545 SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0));
547 init = omp_reduction_init (c, TREE_TYPE (bvar));
548 reduc->init = init;
550 /* Replace the argument representing the initialization value
551 with the initialization value for the reduction (neutral
552 element for the particular operation, e.g. 0 for PLUS_EXPR,
553 1 for MULT_EXPR, etc).
554 Keep the old value in a new variable "reduction_initial",
555 that will be taken in consideration after the parallel
556 computing is done. */
558 e = loop_preheader_edge (loop);
559 arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
560 /* Create new variable to hold the initial value. */
562 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
563 (reduc->reduc_phi, loop_preheader_edge (loop)), init);
564 reduc->initial_value = arg;
565 return 1;
568 struct elv_data
570 struct loop *loop;
571 htab_t decl_address;
572 bool changed;
575 /* Eliminates references to local variables in *TP out of LOOP. DECL_ADDRESS
576 contains addresses of the references that had their address taken already.
577 If the expression is changed, CHANGED is set to true. Callback for
578 walk_tree. */
580 static tree
581 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
583 struct elv_data *dta = data;
584 tree t = *tp, var, addr, addr_type, type, obj;
586 if (DECL_P (t))
588 *walk_subtrees = 0;
590 if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
591 return NULL_TREE;
593 type = TREE_TYPE (t);
594 addr_type = build_pointer_type (type);
595 addr = take_address_of (t, addr_type, dta->loop, dta->decl_address);
596 *tp = build1 (INDIRECT_REF, TREE_TYPE (*tp), addr);
598 dta->changed = true;
599 return NULL_TREE;
602 if (TREE_CODE (t) == ADDR_EXPR)
604 /* ADDR_EXPR may appear in two contexts:
605 -- as a gimple operand, when the address taken is a function invariant
606 -- as gimple rhs, when the resulting address in not a function
607 invariant
608 We do not need to do anything special in the latter case (the base of
609 the memory reference whose address is taken may be replaced in the
610 DECL_P case). The former case is more complicated, as we need to
611 ensure that the new address is still a gimple operand. Thus, it
612 is not sufficient to replace just the base of the memory reference --
613 we need to move the whole computation of the address out of the
614 loop. */
615 if (!is_gimple_val (t))
616 return NULL_TREE;
618 *walk_subtrees = 0;
619 obj = TREE_OPERAND (t, 0);
620 var = get_base_address (obj);
621 if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
622 return NULL_TREE;
624 addr_type = TREE_TYPE (t);
625 addr = take_address_of (obj, addr_type, dta->loop, dta->decl_address);
626 *tp = addr;
628 dta->changed = true;
629 return NULL_TREE;
632 if (!EXPR_P (t) && !GIMPLE_STMT_P (t))
633 *walk_subtrees = 0;
635 return NULL_TREE;
638 /* Moves the references to local variables in STMT from LOOP. DECL_ADDRESS
639 contains addresses for the references for that we have already taken
640 them. */
642 static void
643 eliminate_local_variables_stmt (struct loop *loop, tree stmt,
644 htab_t decl_address)
646 struct elv_data dta;
648 dta.loop = loop;
649 dta.decl_address = decl_address;
650 dta.changed = false;
652 walk_tree (&stmt, eliminate_local_variables_1, &dta, NULL);
654 if (dta.changed)
655 update_stmt (stmt);
658 /* Eliminates the references to local variables from LOOP.
659 This includes:
660 1) Taking address of a local variable -- these are moved out of the
661 loop (and temporary variable is created to hold the address if
662 necessary).
663 2) Dereferencing a local variable -- these are replaced with indirect
664 references. */
666 static void
667 eliminate_local_variables (struct loop *loop)
669 basic_block bb, *body = get_loop_body (loop);
670 unsigned i;
671 block_stmt_iterator bsi;
672 htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
673 free);
675 /* Find and rename the ssa names defined outside of loop. */
676 for (i = 0; i < loop->num_nodes; i++)
678 bb = body[i];
680 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
681 eliminate_local_variables_stmt (loop, bsi_stmt (bsi), decl_address);
684 htab_delete (decl_address);
687 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
688 The copies are stored to NAME_COPIES, if NAME was already duplicated,
689 its duplicate stored in NAME_COPIES is returned.
691 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
692 duplicated, storing the copies in DECL_COPIES. */
694 static tree
695 separate_decls_in_loop_name (tree name,
696 htab_t name_copies, htab_t decl_copies,
697 bool copy_name_p)
699 tree copy, var, var_copy;
700 unsigned idx, uid, nuid;
701 struct int_tree_map ielt, *nielt;
702 struct name_to_copy_elt elt, *nelt;
703 void **slot, **dslot;
705 if (TREE_CODE (name) != SSA_NAME)
706 return name;
708 idx = SSA_NAME_VERSION (name);
709 elt.version = idx;
710 slot = htab_find_slot_with_hash (name_copies, &elt, idx,
711 copy_name_p ? INSERT : NO_INSERT);
712 if (slot && *slot)
713 return ((struct name_to_copy_elt *) *slot)->new_name;
715 var = SSA_NAME_VAR (name);
716 uid = DECL_UID (var);
717 ielt.uid = uid;
718 dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
719 if (!*dslot)
721 var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
722 DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
723 add_referenced_var (var_copy);
724 nielt = XNEW (struct int_tree_map);
725 nielt->uid = uid;
726 nielt->to = var_copy;
727 *dslot = nielt;
729 /* Ensure that when we meet this decl next time, we won't duplicate
730 it again. */
731 nuid = DECL_UID (var_copy);
732 ielt.uid = nuid;
733 dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
734 gcc_assert (!*dslot);
735 nielt = XNEW (struct int_tree_map);
736 nielt->uid = nuid;
737 nielt->to = var_copy;
738 *dslot = nielt;
740 else
741 var_copy = ((struct int_tree_map *) *dslot)->to;
743 if (copy_name_p)
745 copy = duplicate_ssa_name (name, NULL_TREE);
746 nelt = XNEW (struct name_to_copy_elt);
747 nelt->version = idx;
748 nelt->new_name = copy;
749 nelt->field = NULL_TREE;
750 *slot = nelt;
752 else
754 gcc_assert (!slot);
755 copy = name;
758 SSA_NAME_VAR (copy) = var_copy;
759 return copy;
762 /* Finds the ssa names used in STMT that are defined outside of LOOP and
763 replaces such ssa names with their duplicates. The duplicates are stored to
764 NAME_COPIES. Base decls of all ssa names used in STMT
765 (including those defined in LOOP) are replaced with the new temporary
766 variables; the replacement decls are stored in DECL_COPIES. */
768 static void
769 separate_decls_in_loop_stmt (struct loop *loop, tree stmt,
770 htab_t name_copies, htab_t decl_copies)
772 use_operand_p use;
773 def_operand_p def;
774 ssa_op_iter oi;
775 tree name, copy;
776 bool copy_name_p;
778 mark_virtual_ops_for_renaming (stmt);
780 FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
782 name = DEF_FROM_PTR (def);
783 gcc_assert (TREE_CODE (name) == SSA_NAME);
784 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
785 false);
786 gcc_assert (copy == name);
789 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
791 name = USE_FROM_PTR (use);
792 if (TREE_CODE (name) != SSA_NAME)
793 continue;
795 copy_name_p = expr_invariant_in_loop_p (loop, name);
796 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
797 copy_name_p);
798 SET_USE (use, copy);
802 /* Callback for htab_traverse. Adds a field corresponding to the reduction
803 specified in SLOT. The type is passed in DATA. */
805 static int
806 add_field_for_reduction (void **slot, void *data)
809 struct reduction_info *red = *slot;
810 tree type = data;
811 tree var = SSA_NAME_VAR (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
812 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
814 insert_field_into_struct (type, field);
816 red->field = field;
818 return 1;
821 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
822 described in SLOT. The type is passed in DATA. */
824 static int
825 add_field_for_name (void **slot, void *data)
827 struct name_to_copy_elt *elt = *slot;
828 tree type = data;
829 tree name = ssa_name (elt->version);
830 tree var = SSA_NAME_VAR (name);
831 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
833 insert_field_into_struct (type, field);
834 elt->field = field;
836 return 1;
839 /* Callback for htab_traverse. A local result is the intermediate result
840 computed by a single
841 thread, or the intial value in case no iteration was executed.
842 This function creates a phi node reflecting these values.
843 The phi's result will be stored in NEW_PHI field of the
844 reduction's data structure. */
846 static int
847 create_phi_for_local_result (void **slot, void *data)
849 struct reduction_info *reduc = *slot;
850 struct loop *loop = data;
851 edge e;
852 tree new_phi;
853 basic_block store_bb;
854 tree local_res;
856 /* STORE_BB is the block where the phi
857 should be stored. It is the destination of the loop exit.
858 (Find the fallthru edge from OMP_CONTINUE). */
859 store_bb = FALLTHRU_EDGE (loop->latch)->dest;
861 /* STORE_BB has two predecessors. One coming from the loop
862 (the reduction's result is computed at the loop),
863 and another coming from a block preceding the loop,
864 when no iterations
865 are executed (the initial value should be taken). */
866 if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch))
867 e = EDGE_PRED (store_bb, 1);
868 else
869 e = EDGE_PRED (store_bb, 0);
870 local_res = make_ssa_name (SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0)), NULL_TREE);
871 new_phi = create_phi_node (local_res, store_bb);
872 SSA_NAME_DEF_STMT (local_res) = new_phi;
873 add_phi_arg (new_phi, reduc->init, e);
874 add_phi_arg (new_phi, GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0),
875 FALLTHRU_EDGE (loop->latch));
876 reduc->new_phi = new_phi;
878 return 1;
881 struct clsn_data
883 tree store;
884 tree load;
886 basic_block store_bb;
887 basic_block load_bb;
890 /* Callback for htab_traverse. Create an atomic instruction for the
891 reduction described in SLOT.
892 DATA annotates the place in memory the atomic operation relates to,
893 and the basic block it needs to be generated in. */
895 static int
896 create_call_for_reduction_1 (void **slot, void *data)
898 struct reduction_info *reduc = *slot;
899 struct clsn_data *clsn_data = data;
900 block_stmt_iterator bsi;
901 tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
902 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
903 tree load_struct;
904 basic_block bb;
905 basic_block new_bb;
906 edge e;
907 tree t, addr, addr_type, ref, x;
908 tree tmp_load, load, name;
910 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
911 t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
912 addr_type = build_pointer_type (type);
914 addr = build_addr (t, current_function_decl);
916 /* Create phi node. */
917 bb = clsn_data->load_bb;
919 e = split_block (bb, t);
920 new_bb = e->dest;
922 tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL);
923 add_referenced_var (tmp_load);
924 tmp_load = make_ssa_name (tmp_load, NULL);
925 load = build2 (OMP_ATOMIC_LOAD, void_type_node, tmp_load, addr);
926 SSA_NAME_DEF_STMT (tmp_load) = load;
927 bsi = bsi_start (new_bb);
928 bsi_insert_after (&bsi, load, BSI_NEW_STMT);
930 e = split_block (new_bb, load);
931 new_bb = e->dest;
932 bsi = bsi_start (new_bb);
933 ref = tmp_load;
935 fold_build2 (reduc->reduction_code,
936 TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
937 PHI_RESULT (reduc->new_phi));
939 name =
940 force_gimple_operand_bsi (&bsi, x, true, NULL_TREE, true,
941 BSI_CONTINUE_LINKING);
943 x = build1 (OMP_ATOMIC_STORE, void_type_node, name);
945 bsi_insert_after (&bsi, x, BSI_NEW_STMT);
946 return 1;
949 /* Create the atomic operation at the join point of the threads.
950 REDUCTION_LIST describes the reductions in the LOOP.
951 LD_ST_DATA describes the shared data structure where
952 shared data is stored in and loaded from. */
953 static void
954 create_call_for_reduction (struct loop *loop, htab_t reduction_list,
955 struct clsn_data *ld_st_data)
957 htab_traverse (reduction_list, create_phi_for_local_result, loop);
958 /* Find the fallthru edge from OMP_CONTINUE. */
959 ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest;
960 htab_traverse (reduction_list, create_call_for_reduction_1, ld_st_data);
963 /* Callback for htab_traverse. Loads the final reduction value at the
964 join point of all threads, and inserts it in the right place. */
966 static int
967 create_loads_for_reductions (void **slot, void *data)
969 struct reduction_info *red = *slot;
970 struct clsn_data *clsn_data = data;
971 tree stmt;
972 block_stmt_iterator bsi;
973 tree type = TREE_TYPE (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
974 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
975 tree load_struct;
976 tree name;
977 tree x;
979 bsi = bsi_after_labels (clsn_data->load_bb);
980 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
981 load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
982 NULL_TREE);
984 x = load_struct;
985 name = PHI_RESULT (red->keep_res);
986 stmt = build_gimple_modify_stmt (name, x);
987 GIMPLE_STMT_OPERAND (stmt, 0) = name;
988 SSA_NAME_DEF_STMT (name) = stmt;
990 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
992 remove_phi_node (red->keep_res, NULL_TREE, false);
994 return 1;
997 /* Load the reduction result that was stored in LD_ST_DATA.
998 REDUCTION_LIST describes the list of reductions that the
999 loades should be generated for. */
1000 static void
1001 create_final_loads_for_reduction (htab_t reduction_list,
1002 struct clsn_data *ld_st_data)
1004 block_stmt_iterator bsi;
1005 tree t;
1007 bsi = bsi_after_labels (ld_st_data->load_bb);
1008 t = build_fold_addr_expr (ld_st_data->store);
1010 build_gimple_modify_stmt (ld_st_data->load,
1011 build_fold_addr_expr (ld_st_data->store));
1013 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
1014 SSA_NAME_DEF_STMT (ld_st_data->load) = t;
1015 GIMPLE_STMT_OPERAND (t, 0) = ld_st_data->load;
1017 htab_traverse (reduction_list, create_loads_for_reductions, ld_st_data);
1021 /* Callback for htab_traverse. Store the neutral value for the
1022 particular reduction's operation, e.g. 0 for PLUS_EXPR,
1023 1 for MULT_EXPR, etc. into the reduction field.
1024 The reduction is specified in SLOT. The store information is
1025 passed in DATA. */
1027 static int
1028 create_stores_for_reduction (void **slot, void *data)
1030 struct reduction_info *red = *slot;
1031 struct clsn_data *clsn_data = data;
1032 tree stmt;
1033 block_stmt_iterator bsi;
1034 tree type = TREE_TYPE (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
1036 bsi = bsi_last (clsn_data->store_bb);
1037 stmt =
1038 build_gimple_modify_stmt (build3
1039 (COMPONENT_REF, type, clsn_data->store,
1040 red->field, NULL_TREE),
1041 red->initial_value);
1042 mark_virtual_ops_for_renaming (stmt);
1043 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1045 return 1;
1048 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
1049 store to a field of STORE in STORE_BB for the ssa name and its duplicate
1050 specified in SLOT. */
1052 static int
1053 create_loads_and_stores_for_name (void **slot, void *data)
1055 struct name_to_copy_elt *elt = *slot;
1056 struct clsn_data *clsn_data = data;
1057 tree stmt;
1058 block_stmt_iterator bsi;
1059 tree type = TREE_TYPE (elt->new_name);
1060 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
1061 tree load_struct;
1063 bsi = bsi_last (clsn_data->store_bb);
1064 stmt =
1065 build_gimple_modify_stmt (build3
1066 (COMPONENT_REF, type, clsn_data->store,
1067 elt->field, NULL_TREE),
1068 ssa_name (elt->version));
1069 mark_virtual_ops_for_renaming (stmt);
1070 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1072 bsi = bsi_last (clsn_data->load_bb);
1073 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
1074 stmt = build_gimple_modify_stmt (elt->new_name,
1075 build3 (COMPONENT_REF, type, load_struct,
1076 elt->field, NULL_TREE));
1077 SSA_NAME_DEF_STMT (elt->new_name) = stmt;
1078 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1080 return 1;
1083 /* Moves all the variables used in LOOP and defined outside of it (including
1084 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1085 name) to a structure created for this purpose. The code
1087 while (1)
1089 use (a);
1090 use (b);
1093 is transformed this way:
1095 bb0:
1096 old.a = a;
1097 old.b = b;
1099 bb1:
1100 a' = new->a;
1101 b' = new->b;
1102 while (1)
1104 use (a');
1105 use (b');
1108 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
1109 pointer `new' is intentionally not initialized (the loop will be split to a
1110 separate function later, and `new' will be initialized from its arguments).
1111 LD_ST_DATA holds information about the shared data structure used to pass
1112 information among the threads. It is initialized here, and
1113 gen_parallel_loop will pass it to create_call_for_reduction that
1114 needs this information. REDUCTION_LIST describes the reductions
1115 in LOOP. */
1117 static void
1118 separate_decls_in_loop (struct loop *loop, htab_t reduction_list,
1119 tree * arg_struct, tree * new_arg_struct,
1120 struct clsn_data *ld_st_data)
1123 basic_block bb1 = split_edge (loop_preheader_edge (loop));
1124 basic_block bb0 = single_pred (bb1);
1125 htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
1126 name_to_copy_elt_eq, free);
1127 htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
1128 free);
1129 basic_block bb, *body = get_loop_body (loop);
1130 unsigned i;
1131 tree phi, type, type_name, nvar;
1132 block_stmt_iterator bsi;
1133 struct clsn_data clsn_data;
1135 /* Find and rename the ssa names defined outside of loop. */
1136 for (i = 0; i < loop->num_nodes; i++)
1138 bb = body[i];
1140 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1141 separate_decls_in_loop_stmt (loop, phi, name_copies, decl_copies);
1143 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
1144 separate_decls_in_loop_stmt (loop, bsi_stmt (bsi), name_copies,
1145 decl_copies);
1147 free (body);
1149 if (htab_elements (name_copies) == 0)
1151 /* It may happen that there is nothing to copy (if there are only
1152 loop carried and external variables in the loop). */
1153 *arg_struct = NULL;
1154 *new_arg_struct = NULL;
1156 else
1158 /* Create the type for the structure to store the ssa names to. */
1159 type = lang_hooks.types.make_type (RECORD_TYPE);
1160 type_name = build_decl (TYPE_DECL, create_tmp_var_name (".paral_data"),
1161 type);
1162 TYPE_NAME (type) = type_name;
1164 htab_traverse (name_copies, add_field_for_name, type);
1165 if (htab_elements (reduction_list) > 0)
1167 /* Create the fields for reductions. */
1168 htab_traverse (reduction_list, add_field_for_reduction,
1169 type);
1171 layout_type (type);
1173 /* Create the loads and stores. */
1174 *arg_struct = create_tmp_var (type, ".paral_data_store");
1175 add_referenced_var (*arg_struct);
1176 nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1177 add_referenced_var (nvar);
1178 *new_arg_struct = make_ssa_name (nvar, NULL_TREE);
1180 ld_st_data->store = *arg_struct;
1181 ld_st_data->load = *new_arg_struct;
1182 ld_st_data->store_bb = bb0;
1183 ld_st_data->load_bb = bb1;
1185 htab_traverse (name_copies, create_loads_and_stores_for_name,
1186 ld_st_data);
1188 /* Load the calculation from memory (after the join of the threads). */
1190 if (htab_elements (reduction_list) > 0)
1192 htab_traverse (reduction_list, create_stores_for_reduction,
1193 ld_st_data);
1194 clsn_data.load = make_ssa_name (nvar, NULL_TREE);
1195 clsn_data.load_bb = single_dom_exit (loop)->dest;
1196 clsn_data.store = ld_st_data->store;
1197 create_final_loads_for_reduction (reduction_list, &clsn_data);
1201 htab_delete (decl_copies);
1202 htab_delete (name_copies);
1205 /* Bitmap containing uids of functions created by parallelization. We cannot
1206 allocate it from the default obstack, as it must live across compilation
1207 of several functions; we make it gc allocated instead. */
1209 static GTY(()) bitmap parallelized_functions;
1211 /* Returns true if FN was created by create_loop_fn. */
1213 static bool
1214 parallelized_function_p (tree fn)
1216 if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
1217 return false;
1219 return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
1222 /* Creates and returns an empty function that will receive the body of
1223 a parallelized loop. */
1225 static tree
1226 create_loop_fn (void)
1228 char buf[100];
1229 char *tname;
1230 tree decl, type, name, t;
1231 struct function *act_cfun = cfun;
1232 static unsigned loopfn_num;
1234 snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1235 ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1236 clean_symbol_name (tname);
1237 name = get_identifier (tname);
1238 type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1240 decl = build_decl (FUNCTION_DECL, name, type);
1241 if (!parallelized_functions)
1242 parallelized_functions = BITMAP_GGC_ALLOC ();
1243 bitmap_set_bit (parallelized_functions, DECL_UID (decl));
1245 TREE_STATIC (decl) = 1;
1246 TREE_USED (decl) = 1;
1247 DECL_ARTIFICIAL (decl) = 1;
1248 DECL_IGNORED_P (decl) = 0;
1249 TREE_PUBLIC (decl) = 0;
1250 DECL_UNINLINABLE (decl) = 1;
1251 DECL_EXTERNAL (decl) = 0;
1252 DECL_CONTEXT (decl) = NULL_TREE;
1253 DECL_INITIAL (decl) = make_node (BLOCK);
1255 t = build_decl (RESULT_DECL, NULL_TREE, void_type_node);
1256 DECL_ARTIFICIAL (t) = 1;
1257 DECL_IGNORED_P (t) = 1;
1258 DECL_RESULT (decl) = t;
1260 t = build_decl (PARM_DECL, get_identifier (".paral_data_param"),
1261 ptr_type_node);
1262 DECL_ARTIFICIAL (t) = 1;
1263 DECL_ARG_TYPE (t) = ptr_type_node;
1264 DECL_CONTEXT (t) = decl;
1265 TREE_USED (t) = 1;
1266 DECL_ARGUMENTS (decl) = t;
1268 allocate_struct_function (decl, false);
1270 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1271 it. */
1272 set_cfun (act_cfun);
1274 return decl;
1277 /* Bases all the induction variables in LOOP on a single induction variable
1278 (unsigned with base 0 and step 1), whose final value is compared with
1279 NIT. The induction variable is incremented in the loop latch.
1280 REDUCTION_LIST describes the reductions in LOOP. */
1282 static void
1283 canonicalize_loop_ivs (struct loop *loop, htab_t reduction_list, tree nit)
1285 unsigned precision = TYPE_PRECISION (TREE_TYPE (nit));
1286 tree phi, prev, res, type, var_before, val, atype, mtype, t, next;
1287 block_stmt_iterator bsi;
1288 bool ok;
1289 affine_iv iv;
1290 edge exit = single_dom_exit (loop);
1291 struct reduction_info *red;
1293 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
1295 res = PHI_RESULT (phi);
1297 if (is_gimple_reg (res) && TYPE_PRECISION (TREE_TYPE (res)) > precision)
1298 precision = TYPE_PRECISION (TREE_TYPE (res));
1301 type = lang_hooks.types.type_for_size (precision, 1);
1303 bsi = bsi_last (loop->latch);
1304 create_iv (build_int_cst_type (type, 0), build_int_cst (type, 1), NULL_TREE,
1305 loop, &bsi, true, &var_before, NULL);
1307 bsi = bsi_after_labels (loop->header);
1308 prev = NULL;
1309 for (phi = phi_nodes (loop->header); phi; phi = next)
1311 next = PHI_CHAIN (phi);
1312 res = PHI_RESULT (phi);
1314 if (!is_gimple_reg (res) || res == var_before)
1316 prev = phi;
1317 continue;
1320 ok = simple_iv (loop, phi, res, &iv, true);
1321 red = reduction_phi (reduction_list, phi);
1322 /* We preserve the reduction phi nodes. */
1323 if (!ok && red)
1325 prev = phi;
1326 continue;
1328 else
1329 gcc_assert (ok);
1330 remove_phi_node (phi, prev, false);
1332 atype = TREE_TYPE (res);
1333 mtype = POINTER_TYPE_P (atype) ? sizetype : atype;
1334 val = fold_build2 (MULT_EXPR, mtype, unshare_expr (iv.step),
1335 fold_convert (mtype, var_before));
1336 val = fold_build2 (POINTER_TYPE_P (atype)
1337 ? POINTER_PLUS_EXPR : PLUS_EXPR,
1338 atype, unshare_expr (iv.base), val);
1339 val = force_gimple_operand_bsi (&bsi, val, false, NULL_TREE, true,
1340 BSI_SAME_STMT);
1341 t = build_gimple_modify_stmt (res, val);
1342 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1343 SSA_NAME_DEF_STMT (res) = t;
1346 t = last_stmt (exit->src);
1347 /* Make the loop exit if the control condition is not satisfied. */
1348 if (exit->flags & EDGE_TRUE_VALUE)
1350 edge te, fe;
1352 extract_true_false_edges_from_block (exit->src, &te, &fe);
1353 te->flags = EDGE_FALSE_VALUE;
1354 fe->flags = EDGE_TRUE_VALUE;
1356 COND_EXPR_COND (t) = build2 (LT_EXPR, boolean_type_node, var_before, nit);
1359 /* Moves the exit condition of LOOP to the beginning of its header, and
1360 duplicates the part of the last iteration that gets disabled to the
1361 exit of the loop. NIT is the number of iterations of the loop
1362 (used to initialize the variables in the duplicated part).
1364 TODO: the common case is that latch of the loop is empty and immediatelly
1365 follows the loop exit. In this case, it would be better not to copy the
1366 body of the loop, but only move the entry of the loop directly before the
1367 exit check and increase the number of iterations of the loop by one.
1368 This may need some additional preconditioning in case NIT = ~0.
1369 REDUCTION_LIST describes the reductions in LOOP. */
1371 static void
1372 transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit)
1374 basic_block *bbs, *nbbs, ex_bb, orig_header;
1375 unsigned n;
1376 bool ok;
1377 edge exit = single_dom_exit (loop), hpred;
1378 tree phi, nphi, cond, control, control_name, res, t, cond_stmt;
1379 block_stmt_iterator bsi;
1381 split_block_after_labels (loop->header);
1382 orig_header = single_succ (loop->header);
1383 hpred = single_succ_edge (loop->header);
1385 cond_stmt = last_stmt (exit->src);
1386 cond = COND_EXPR_COND (cond_stmt);
1387 control = TREE_OPERAND (cond, 0);
1388 gcc_assert (TREE_OPERAND (cond, 1) == nit);
1390 /* Make sure that we have phi nodes on exit for all loop header phis
1391 (create_parallel_loop requires that). */
1392 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
1394 res = PHI_RESULT (phi);
1395 t = make_ssa_name (SSA_NAME_VAR (res), phi);
1396 SET_PHI_RESULT (phi, t);
1398 nphi = create_phi_node (res, orig_header);
1399 SSA_NAME_DEF_STMT (res) = nphi;
1400 add_phi_arg (nphi, t, hpred);
1402 if (res == control)
1404 TREE_OPERAND (cond, 0) = t;
1405 update_stmt (cond_stmt);
1406 control = t;
1410 bbs = get_loop_body_in_dom_order (loop);
1411 for (n = 0; bbs[n] != exit->src; n++)
1412 continue;
1413 nbbs = XNEWVEC (basic_block, n);
1414 ok = tree_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1415 bbs + 1, n, nbbs);
1416 gcc_assert (ok);
1417 free (bbs);
1418 ex_bb = nbbs[0];
1419 free (nbbs);
1421 /* Other than reductions, the only gimple reg that should be copied
1422 out of the loop is the control variable. */
1424 control_name = NULL_TREE;
1425 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
1427 res = PHI_RESULT (phi);
1428 if (!is_gimple_reg (res))
1429 continue;
1431 /* Check if it is a part of reduction. If it is,
1432 keep the phi at the reduction's keep_res field. The
1433 PHI_RESULT of this phi is the resulting value of the reduction
1434 variable when exiting the loop. */
1436 exit = single_dom_exit (loop);
1438 if (htab_elements (reduction_list) > 0)
1440 struct reduction_info *red;
1442 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1444 red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1445 if (red)
1446 red->keep_res = phi;
1448 else
1449 gcc_assert (control_name == NULL_TREE
1450 && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
1451 control_name = res;
1453 gcc_assert (control_name != NULL_TREE);
1454 phi = SSA_NAME_DEF_STMT (control_name);
1455 remove_phi_node (phi, NULL_TREE, false);
1457 /* Initialize the control variable to NIT. */
1458 bsi = bsi_after_labels (ex_bb);
1459 nit = force_gimple_operand_bsi (&bsi,
1460 fold_convert (TREE_TYPE (control_name), nit),
1461 false, NULL_TREE, false, BSI_SAME_STMT);
1462 t = build_gimple_modify_stmt (control_name, nit);
1463 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
1464 SSA_NAME_DEF_STMT (control_name) = t;
1467 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
1468 LOOP_FN and DATA are the arguments of OMP_PARALLEL.
1469 NEW_DATA is the variable that should be initialized from the argument
1470 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
1471 basic block containing OMP_PARALLEL tree. */
1473 static basic_block
1474 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
1475 tree new_data, unsigned n_threads)
1477 block_stmt_iterator bsi;
1478 basic_block bb, paral_bb, for_bb, ex_bb;
1479 tree t, param, res, for_stmt;
1480 tree cvar, cvar_init, initvar, cvar_next, cvar_base, cond, phi, type;
1481 edge exit, nexit, guard, end, e;
1483 /* Prepare the OMP_PARALLEL statement. */
1484 bb = loop_preheader_edge (loop)->src;
1485 paral_bb = single_pred (bb);
1486 bsi = bsi_last (paral_bb);
1488 t = build_omp_clause (OMP_CLAUSE_NUM_THREADS);
1489 OMP_CLAUSE_NUM_THREADS_EXPR (t)
1490 = build_int_cst (integer_type_node, n_threads);
1491 t = build4 (OMP_PARALLEL, void_type_node, NULL_TREE, t, loop_fn, data);
1493 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
1495 /* Initialize NEW_DATA. */
1496 if (data)
1498 bsi = bsi_after_labels (bb);
1500 param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL_TREE);
1501 t = build_gimple_modify_stmt (param, build_fold_addr_expr (data));
1502 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1503 SSA_NAME_DEF_STMT (param) = t;
1505 t = build_gimple_modify_stmt (new_data,
1506 fold_convert (TREE_TYPE (new_data),
1507 param));
1508 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1509 SSA_NAME_DEF_STMT (new_data) = t;
1512 /* Emit OMP_RETURN for OMP_PARALLEL. */
1513 bb = split_loop_exit_edge (single_dom_exit (loop));
1514 bsi = bsi_last (bb);
1515 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
1517 /* Extract data for OMP_FOR. */
1518 gcc_assert (loop->header == single_dom_exit (loop)->src);
1519 cond = COND_EXPR_COND (last_stmt (loop->header));
1521 cvar = TREE_OPERAND (cond, 0);
1522 cvar_base = SSA_NAME_VAR (cvar);
1523 phi = SSA_NAME_DEF_STMT (cvar);
1524 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1525 initvar = make_ssa_name (cvar_base, NULL_TREE);
1526 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
1527 initvar);
1528 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
1530 bsi = bsi_last (loop->latch);
1531 gcc_assert (bsi_stmt (bsi) == SSA_NAME_DEF_STMT (cvar_next));
1532 bsi_remove (&bsi, true);
1534 /* Prepare cfg. */
1535 for_bb = split_edge (loop_preheader_edge (loop));
1536 ex_bb = split_loop_exit_edge (single_dom_exit (loop));
1537 extract_true_false_edges_from_block (loop->header, &nexit, &exit);
1538 gcc_assert (exit == single_dom_exit (loop));
1540 guard = make_edge (for_bb, ex_bb, 0);
1541 single_succ_edge (loop->latch)->flags = 0;
1542 end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
1543 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
1545 res = PHI_RESULT (phi);
1546 gcc_assert (!is_gimple_reg (phi));
1547 t = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));
1548 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_preheader_edge (loop)),
1549 guard);
1550 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_latch_edge (loop)),
1551 end);
1553 e = redirect_edge_and_branch (exit, nexit->dest);
1554 PENDING_STMT (e) = NULL;
1556 /* Emit OMP_FOR. */
1557 TREE_OPERAND (cond, 0) = cvar_base;
1558 type = TREE_TYPE (cvar);
1559 t = build_omp_clause (OMP_CLAUSE_SCHEDULE);
1560 OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
1562 for_stmt = make_node (OMP_FOR);
1563 TREE_TYPE (for_stmt) = void_type_node;
1564 OMP_FOR_CLAUSES (for_stmt) = t;
1565 OMP_FOR_INIT (for_stmt) = build_gimple_modify_stmt (initvar, cvar_init);
1566 OMP_FOR_COND (for_stmt) = cond;
1567 OMP_FOR_INCR (for_stmt) = build_gimple_modify_stmt (cvar_base,
1568 build2 (PLUS_EXPR, type,
1569 cvar_base,
1570 build_int_cst
1571 (type, 1)));
1572 OMP_FOR_BODY (for_stmt) = NULL_TREE;
1573 OMP_FOR_PRE_BODY (for_stmt) = NULL_TREE;
1575 bsi = bsi_last (for_bb);
1576 bsi_insert_after (&bsi, for_stmt, BSI_NEW_STMT);
1577 SSA_NAME_DEF_STMT (initvar) = for_stmt;
1579 /* Emit OMP_CONTINUE. */
1580 bsi = bsi_last (loop->latch);
1581 t = build2 (OMP_CONTINUE, void_type_node, cvar_next, cvar);
1582 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
1583 SSA_NAME_DEF_STMT (cvar_next) = t;
1585 /* Emit OMP_RETURN for OMP_FOR. */
1586 bsi = bsi_last (ex_bb);
1587 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
1589 return paral_bb;
1592 /* Generates code to execute the iterations of LOOP in N_THREADS threads in
1593 parallel. NITER describes number of iterations of LOOP.
1594 REDUCTION_LIST describes the reductions existant in the LOOP. */
1596 static void
1597 gen_parallel_loop (struct loop *loop, htab_t reduction_list,
1598 unsigned n_threads, struct tree_niter_desc *niter)
1600 struct loop *nloop;
1601 loop_iterator li;
1602 tree many_iterations_cond, type, nit;
1603 tree stmts, arg_struct, new_arg_struct;
1604 basic_block parallel_head;
1605 struct clsn_data clsn_data;
1606 unsigned prob;
1608 /* From
1610 ---------------------------------------------------------------------
1611 loop
1613 IV = phi (INIT, IV + STEP)
1614 BODY1;
1615 if (COND)
1616 break;
1617 BODY2;
1619 ---------------------------------------------------------------------
1621 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1622 we generate the following code:
1624 ---------------------------------------------------------------------
1626 if (MAY_BE_ZERO
1627 || NITER < MIN_PER_THREAD * N_THREADS)
1628 goto original;
1630 BODY1;
1631 store all local loop-invariant variables used in body of the loop to DATA.
1632 OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1633 load the variables from DATA.
1634 OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1635 BODY2;
1636 BODY1;
1637 OMP_CONTINUE;
1638 OMP_RETURN -- OMP_FOR
1639 OMP_RETURN -- OMP_PARALLEL
1640 goto end;
1642 original:
1643 loop
1645 IV = phi (INIT, IV + STEP)
1646 BODY1;
1647 if (COND)
1648 break;
1649 BODY2;
1652 end:
1656 /* Create two versions of the loop -- in the old one, we know that the
1657 number of iterations is large enough, and we will transform it into the
1658 loop that will be split to loop_fn, the new one will be used for the
1659 remaining iterations. */
1661 type = TREE_TYPE (niter->niter);
1662 nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
1663 NULL_TREE);
1664 if (stmts)
1665 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1667 many_iterations_cond =
1668 fold_build2 (GE_EXPR, boolean_type_node,
1669 nit, build_int_cst (type, MIN_PER_THREAD * n_threads));
1670 many_iterations_cond
1671 = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1672 invert_truthvalue (unshare_expr (niter->may_be_zero)),
1673 many_iterations_cond);
1674 many_iterations_cond
1675 = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
1676 if (stmts)
1677 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1678 if (!is_gimple_condexpr (many_iterations_cond))
1680 many_iterations_cond
1681 = force_gimple_operand (many_iterations_cond, &stmts,
1682 true, NULL_TREE);
1683 if (stmts)
1684 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1687 initialize_original_copy_tables ();
1689 /* We assume that the loop usually iterates a lot. */
1690 prob = 4 * REG_BR_PROB_BASE / 5;
1691 nloop = loop_version (loop, many_iterations_cond, NULL,
1692 prob, prob, REG_BR_PROB_BASE - prob, true);
1693 update_ssa (TODO_update_ssa);
1694 free_original_copy_tables ();
1696 /* Base all the induction variables in LOOP on a single control one. */
1697 canonicalize_loop_ivs (loop, reduction_list, nit);
1699 /* Ensure that the exit condition is the first statement in the loop. */
1700 transform_to_exit_first_loop (loop, reduction_list, nit);
1703 /* Generate intializations for reductions. */
1705 if (htab_elements (reduction_list) > 0)
1706 htab_traverse (reduction_list, initialize_reductions, loop);
1708 /* Eliminate the references to local variables from the loop. */
1709 eliminate_local_variables (loop);
1711 /* In the old loop, move all variables non-local to the loop to a structure
1712 and back, and create separate decls for the variables used in loop. */
1713 separate_decls_in_loop (loop, reduction_list, &arg_struct, &new_arg_struct, &clsn_data);
1715 /* Create the parallel constructs. */
1716 parallel_head = create_parallel_loop (loop, create_loop_fn (), arg_struct,
1717 new_arg_struct, n_threads);
1718 if (htab_elements (reduction_list) > 0)
1719 create_call_for_reduction (loop, reduction_list, &clsn_data);
1721 scev_reset ();
1723 /* Cancel the loop (it is simpler to do it here rather than to teach the
1724 expander to do it). */
1725 cancel_loop_tree (loop);
1727 /* Free loop bound estimations that could contain references to
1728 removed statements. */
1729 FOR_EACH_LOOP (li, loop, 0)
1730 free_numbers_of_iterations_estimates_loop (loop);
1732 /* Expand the parallel constructs. We do it directly here instead of running
1733 a separate expand_omp pass, since it is more efficient, and less likely to
1734 cause troubles with further analyses not being able to deal with the
1735 OMP trees. */
1737 omp_expand_local (parallel_head);
1740 /* Detect parallel loops and generate parallel code using libgomp
1741 primitives. Returns true if some loop was parallelized, false
1742 otherwise. */
1744 bool
1745 parallelize_loops (void)
1747 unsigned n_threads = flag_tree_parallelize_loops;
1748 bool changed = false;
1749 struct loop *loop;
1750 struct tree_niter_desc niter_desc;
1751 loop_iterator li;
1752 htab_t reduction_list;
1754 /* Do not parallelize loops in the functions created by parallelization. */
1755 if (parallelized_function_p (cfun->decl))
1756 return false;
1758 reduction_list = htab_create (10, reduction_info_hash,
1759 reduction_info_eq, free);
1761 FOR_EACH_LOOP (li, loop, 0)
1763 htab_empty (reduction_list);
1764 if (/* Do not bother with loops in cold areas. */
1765 !maybe_hot_bb_p (loop->header)
1766 /* Or loops that roll too little. */
1767 || expected_loop_iterations (loop) <= n_threads
1768 /* And of course, the loop must be parallelizable. */
1769 || !can_duplicate_loop_p (loop)
1770 || loop_has_blocks_with_irreducible_flag (loop)
1771 || !loop_parallel_p (loop, reduction_list, &niter_desc))
1772 continue;
1774 changed = true;
1775 gen_parallel_loop (loop, reduction_list, n_threads, &niter_desc);
1776 verify_flow_info ();
1777 verify_dominators (CDI_DOMINATORS);
1778 verify_loop_structure ();
1779 verify_loop_closed_ssa ();
1782 htab_delete (reduction_list);
1783 return changed;
1786 #include "gt-tree-parloops.h"