1 /* Loop autoparallelization.
2 Copyright (C) 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "rtl.h"
29 #include "tree-flow.h"
30 #include "cfgloop.h"
31 #include "ggc.h"
32 #include "tree-data-ref.h"
33 #include "diagnostic.h"
34 #include "tree-pass.h"
35 #include "tree-scalar-evolution.h"
36 #include "hashtab.h"
37 #include "langhooks.h"
38 #include "tree-vectorizer.h"
40 /* This pass tries to distribute iterations of loops into several threads.
41 The implementation is straightforward -- for each loop we test whether its
42 iterations are independent, and if it is the case (and some additional
43 conditions regarding profitability and correctness are satisfied), we
44 add OMP_PARALLEL and OMP_FOR codes and let the omp expansion machinery do
45 its job.
47 Most of the complexity is in bringing the code into the shape expected
48 by the omp expanders:
49 -- for OMP_FOR, ensuring that the loop has only one induction variable
50 and that the exit test is at the start of the loop body
51 -- for OMP_PARALLEL, replacing the references to local addressable
52 variables by accesses through pointers, and breaking up ssa chains
53 by storing the values incoming to the parallelized loop to a structure
54 passed to the new function as an argument (something similar is done
55 in omp gimplification; unfortunately only a small part of the code
56 can be shared).
58 TODO:
59 -- if there are several parallelizable loops in a function, it may be
60 possible to generate the threads just once (using synchronization to
61 ensure that cross-loop dependences are obeyed).
62 -- handling of common scalar dependence patterns (accumulation, ...)
63 -- handling of non-innermost loops */
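/* As a rough illustration (a conceptual sketch, not the exact trees this pass
   builds), a simple loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   is rewritten into the equivalent of

     #pragma omp parallel num_threads (N)
     #pragma omp for schedule (static)
     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   with the loop body later outlined into a separate function (see
   create_loop_fn) and the values live on entry to the loop passed to that
   function through the .paral_data structure described below.  */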
65 /*
66 Reduction handling:
67 currently we use vect_is_simple_reduction() to detect reduction patterns.
68 The code transformation is illustrated by the example below.
71 parloop
73 int sum=1;
75 for (i = 0; i < N; i++)
77 x[i] = i + 3;
78 sum+=x[i];
82 gimple-like code:
83 header_bb:
85 # sum_29 = PHI <sum_11(5), 1(3)>
86 # i_28 = PHI <i_12(5), 0(3)>
87 D.1795_8 = i_28 + 3;
88 x[i_28] = D.1795_8;
89 sum_11 = D.1795_8 + sum_29;
90 i_12 = i_28 + 1;
91 if (N_6(D) > i_12)
92 goto header_bb;
95 exit_bb:
97 # sum_21 = PHI <sum_11(4)>
98 printf (&"%d"[0], sum_21);
101 after reduction transformation (only relevant parts):
103 parloop
106 ....
109 # A new variable is created for each reduction:
110 "reduction_initial" is the initial value given by the user.
111 It is kept and will be used after the parallel computing is done. #
113 reduction_initial.24_46 = 1;
115 # Storing the neutral value of the
116 particular reduction's operation, e.g. 0 for PLUS_EXPR,
117 1 for MULT_EXPR, etc. into the reduction field.
118 This is done in create_stores_for_reduction. #
120 .paral_data_store.32.sum.27 = 0;
122 #pragma omp parallel num_threads(4)
124 #pragma omp for schedule(static)
125 # sum.27_29 = PHI <sum.27_11, 0> # The neutral element replaces
126 the user's initial value. #
127 sum.27_11 = D.1827_8 + sum.27_29;
128 OMP_CONTINUE
130 # Adding this reduction phi is done at create_phi_for_local_result() #
131 # sum.27_56 = PHI <sum.27_11, 0>
132 OMP_RETURN
134 # Creating the atomic operation is done at
135 create_call_for_reduction_1() #
137 #pragma omp atomic_load
138 D.1839_59 = *&.paral_data_load.33_51->reduction.23;
139 D.1840_60 = sum.27_56 + D.1839_59;
140 #pragma omp atomic_store (D.1840_60);
142 OMP_RETURN
144 # Collecting the result after the join of the threads is done at
145 create_loads_for_reductions().
146 A new variable "reduction_final" is created. The final value is
147 computed from the initial value and the value computed by the threads. #
149 .paral_data_load.33_52 = &.paral_data_store.32;
150 reduction_final.34_53 = .paral_data_load.33_52->sum.27;
151 sum_37 = reduction_initial.24_46 + reduction_final.34_53;
152 sum_43 = D.1795_41 + sum_37;
154 exit bb:
155 # sum_21 = PHI <sum_43, sum_26>
156 printf (&"%d"[0], sum_21);
*/
164 /* Minimal number of iterations of a loop that should be executed in each
165 thread. */
166 #define MIN_PER_THREAD 100
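/* As a rough example of the heuristic (see gen_parallel_loop): with
   MIN_PER_THREAD == 100 and four threads requested, only loops whose runtime
   iteration count is known to be at least 100 * 4 = 400 take the parallel
   version; shorter loops fall through to the serial copy created by loop
   versioning.  */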
168 /* Element of the hashtable, representing a
169 reduction in the current loop. */
170 struct reduction_info
172 tree reduc_stmt; /* reduction statement. */
173 tree reduc_phi; /* The phi node defining the reduction. */
174 enum tree_code reduction_code; /* code for the reduction operation. */
175 tree keep_res; /* The PHI_RESULT of this phi is the resulting value
176 of the reduction variable when exiting the loop. */
177 tree initial_value; /* An ssa name representing a new variable holding
178 the initial value of the reduction var before entering the loop. */
179 tree field; /* the name of the field in the parloop data structure intended for reduction. */
180 tree init; /* reduction initialization value. */
181 tree new_phi; /* (helper field) Newly created phi node whose result
182 will be passed to the atomic operation. Represents
183 the local result each thread computed for the reduction
184 operation. */
187 /* Equality and hash functions for hashtab code. */
189 static int
190 reduction_info_eq (const void *aa, const void *bb)
192 const struct reduction_info *a = (const struct reduction_info *) aa;
193 const struct reduction_info *b = (const struct reduction_info *) bb;
195 return (a->reduc_phi == b->reduc_phi);
198 static hashval_t
199 reduction_info_hash (const void *aa)
201 const struct reduction_info *a = (const struct reduction_info *) aa;
203 return htab_hash_pointer (a->reduc_phi);
206 static struct reduction_info *
207 reduction_phi (htab_t reduction_list, tree phi)
209 struct reduction_info tmpred, *red;
211 if (htab_elements (reduction_list) == 0)
212 return NULL;
214 tmpred.reduc_phi = phi;
215 red = htab_find (reduction_list, &tmpred);
217 return red;
220 /* Element of hashtable of names to copy. */
222 struct name_to_copy_elt
224 unsigned version; /* The version of the name to copy. */
225 tree new_name; /* The new name used in the copy. */
226 tree field; /* The field of the structure used to pass the
227 value. */
230 /* Equality and hash functions for hashtab code. */
232 static int
233 name_to_copy_elt_eq (const void *aa, const void *bb)
235 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
236 const struct name_to_copy_elt *b = (const struct name_to_copy_elt *) bb;
238 return a->version == b->version;
241 static hashval_t
242 name_to_copy_elt_hash (const void *aa)
244 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
246 return (hashval_t) a->version;
249 Returns true if the iterations of LOOP are independent of each other (that
250 is, if we can execute them in parallel), and if LOOP satisfies other
251 conditions that we need to be able to parallelize it. The description of the
252 number of iterations is stored to NITER. Reduction analysis is done; if
253 reductions are found, they are inserted into REDUCTION_LIST. */
255 static bool
256 loop_parallel_p (struct loop *loop, htab_t reduction_list, struct tree_niter_desc *niter)
258 edge exit = single_dom_exit (loop);
259 VEC (ddr_p, heap) * dependence_relations;
260 VEC (data_reference_p, heap) * datarefs;
261 lambda_trans_matrix trans;
262 bool ret = false;
263 tree phi;
264 loop_vec_info simple_loop_info;
266 /* Only consider innermost loops with just one exit. The innermost-loop
267 restriction is not necessary, but it makes things simpler. */
268 if (loop->inner || !exit)
269 return false;
271 if (dump_file && (dump_flags & TDF_DETAILS))
272 fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
274 /* We need to know # of iterations, and there should be no uses of values
275 defined inside loop outside of it, unless the values are invariants of
276 the loop. */
277 if (!number_of_iterations_exit (loop, exit, niter, false))
279 if (dump_file && (dump_flags & TDF_DETAILS))
280 fprintf (dump_file, " FAILED: number of iterations not known\n");
281 return false;
284 simple_loop_info = vect_analyze_loop_form (loop);
286 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
288 tree reduc_stmt = NULL, operation;
290 /* ??? TODO: Change this into a generic function that
291 recognizes reductions. */
292 if (!is_gimple_reg (PHI_RESULT (phi)))
293 continue;
294 if (simple_loop_info)
295 reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi);
297 /* Create a reduction_info struct, initialize it and insert it to
298 the reduction list. */
300 if (reduc_stmt)
302 PTR *slot;
303 struct reduction_info *new_reduction;
305 if (dump_file && (dump_flags & TDF_DETAILS))
307 fprintf (dump_file,
308 "Detected reduction. reduction stmt is: \n");
309 print_generic_stmt (dump_file, reduc_stmt, 0);
310 fprintf (dump_file, "\n");
313 new_reduction = XCNEW (struct reduction_info);
315 new_reduction->reduc_stmt = reduc_stmt;
316 new_reduction->reduc_phi = phi;
317 operation = GIMPLE_STMT_OPERAND (reduc_stmt, 1);
318 new_reduction->reduction_code = TREE_CODE (operation);
319 slot = htab_find_slot (reduction_list, new_reduction, INSERT);
320 *slot = new_reduction;
324 for (phi = phi_nodes (exit->dest); phi; phi = PHI_CHAIN (phi))
326 struct reduction_info *red;
327 imm_use_iterator imm_iter;
328 use_operand_p use_p;
329 tree reduc_phi;
331 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
333 if (is_gimple_reg (val))
335 if (dump_file && (dump_flags & TDF_DETAILS))
337 fprintf (dump_file, "phi is ");
338 print_generic_expr (dump_file, phi, 0);
339 fprintf (dump_file, "arg of phi to exit: value ");
340 print_generic_expr (dump_file, val, 0);
341 fprintf (dump_file, " used outside loop\n");
342 fprintf (dump_file,
343 " checking if it a part of reduction pattern: \n");
345 if (htab_elements (reduction_list) == 0)
347 if (dump_file && (dump_flags & TDF_DETAILS))
348 fprintf (dump_file,
349 " FAILED: it is not a part of reduction.\n");
350 return false;
352 reduc_phi = NULL;
353 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
355 if (flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
357 reduc_phi = USE_STMT (use_p);
358 break;
361 red = reduction_phi (reduction_list, reduc_phi);
362 if (red == NULL)
364 if (dump_file && (dump_flags & TDF_DETAILS))
365 fprintf (dump_file,
366 " FAILED: it is not a part of reduction.\n");
367 return false;
369 if (dump_file && (dump_flags & TDF_DETAILS))
371 fprintf (dump_file, "reduction phi is ");
372 print_generic_expr (dump_file, red->reduc_phi, 0);
373 fprintf (dump_file, "reduction stmt is ");
374 print_generic_expr (dump_file, red->reduc_stmt, 0);
380 /* The iterations of the loop may communicate only through bivs whose
381 iteration space can be distributed efficiently. */
382 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
384 tree def = PHI_RESULT (phi);
385 affine_iv iv;
387 if (is_gimple_reg (def) && !simple_iv (loop, phi, def, &iv, true))
389 struct reduction_info *red;
391 red = reduction_phi (reduction_list, phi);
392 if (red == NULL)
394 if (dump_file && (dump_flags & TDF_DETAILS))
395 fprintf (dump_file,
396 " FAILED: scalar dependency between iterations\n");
397 return false;
402 /* We need to version the loop to verify assumptions at runtime. */
403 if (!can_duplicate_loop_p (loop))
405 if (dump_file && (dump_flags & TDF_DETAILS))
406 fprintf (dump_file, " FAILED: cannot be duplicated\n");
407 return false;
410 /* Check for problems with dependences. If the loop can be reversed,
411 the iterations are independent. */
412 datarefs = VEC_alloc (data_reference_p, heap, 10);
413 dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
414 compute_data_dependences_for_loop (loop, true, &datarefs,
415 &dependence_relations);
416 if (dump_file && (dump_flags & TDF_DETAILS))
417 dump_data_dependence_relations (dump_file, dependence_relations);
419 trans = lambda_trans_matrix_new (1, 1);
420 LTM_MATRIX (trans)[0][0] = -1;
422 if (lambda_transform_legal_p (trans, 1, dependence_relations))
424 ret = true;
425 if (dump_file && (dump_flags & TDF_DETAILS))
426 fprintf (dump_file, " SUCCESS: may be parallelized\n");
428 else if (dump_file && (dump_flags & TDF_DETAILS))
429 fprintf (dump_file,
430 " FAILED: data dependencies exist across iterations\n");
432 free_dependence_relations (dependence_relations);
433 free_data_refs (datarefs);
435 return ret;
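/* Two illustrative examples (not taken from the code above): a loop like

     for (i = 0; i < n; i++)
       a[i] = b[i] * 2;

   has no dependences between its iterations, so reversing it is legal and it
   is accepted, whereas

     for (i = 1; i < n; i++)
       a[i] = a[i - 1] + b[i];

   carries a dependence from iteration i - 1 to iteration i; reversing it is
   not legal, so the loop is rejected by the check above.  */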
438 /* Assigns the address of VAR in TYPE to an ssa name, and returns this name.
439 The assignment statement is placed before LOOP. DECL_ADDRESS maps decls
440 to their addresses that can be reused. */
442 static tree
443 take_address_of (tree var, tree type, struct loop *loop, htab_t decl_address)
445 int uid = DECL_UID (var);
446 void **dslot;
447 struct int_tree_map ielt, *nielt;
448 tree name, bvar, stmt;
449 edge entry = loop_preheader_edge (loop);
451 ielt.uid = uid;
452 dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
453 if (!*dslot)
455 bvar = create_tmp_var (type, get_name (var));
456 add_referenced_var (bvar);
457 stmt = build_gimple_modify_stmt (bvar,
458 fold_convert (type,
459 build_addr (var,
460 current_function_decl)));
461 name = make_ssa_name (bvar, stmt);
462 GIMPLE_STMT_OPERAND (stmt, 0) = name;
463 bsi_insert_on_edge_immediate (entry, stmt);
465 nielt = XNEW (struct int_tree_map);
466 nielt->uid = uid;
467 nielt->to = name;
468 *dslot = nielt;
470 return name;
473 name = ((struct int_tree_map *) *dslot)->to;
474 if (TREE_TYPE (name) == type)
475 return name;
477 bvar = SSA_NAME_VAR (name);
478 stmt = build_gimple_modify_stmt (bvar, fold_convert (type, name));
479 name = make_ssa_name (bvar, stmt);
480 GIMPLE_STMT_OPERAND (stmt, 0) = name;
481 bsi_insert_on_edge_immediate (entry, stmt);
483 return name;
486 /* Callback for htab_traverse. Create the initialization statement
487 for the reduction described in SLOT, and place it in the preheader of
488 the loop described in DATA. */
490 static int
491 initialize_reductions (void **slot, void *data)
493 tree stmt;
494 tree init, c;
495 tree name1;
496 tree bvar, type, arg;
497 edge e;
499 struct reduction_info *reduc = *slot;
500 struct loop *loop = (struct loop *) data;
502 /* Create initialization in preheader:
503 reduction_variable = initialization value of reduction. */
505 /* In the phi node at the header, replace the argument coming
506 from the preheader with the reduction initialization value. */
508 /* Create a new variable to initialize the reduction. */
509 type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
510 bvar = create_tmp_var (type, "reduction");
511 add_referenced_var (bvar);
513 c = build_omp_clause (OMP_CLAUSE_REDUCTION);
514 OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
515 OMP_CLAUSE_DECL (c) =
516 SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0));
518 init = omp_reduction_init (c, TREE_TYPE (bvar));
519 reduc->init = init;
521 /* Replace the argument representing the initialization value
522 with the initialization value for the reduction (neutral
523 element for the particular operation, e.g. 0 for PLUS_EXPR,
524 1 for MULT_EXPR, etc).
525 Keep the old value in a new variable "reduction_initial",
526 which will be taken into consideration after the parallel
527 computation is done. */
529 e = loop_preheader_edge (loop);
530 arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
531 /* Create new variable to hold the initial value. */
532 type = TREE_TYPE (bvar);
533 bvar = create_tmp_var (type, "reduction_initial");
534 add_referenced_var (bvar);
536 stmt = build_gimple_modify_stmt (bvar, arg);
537 name1 = make_ssa_name (bvar, stmt);
538 GIMPLE_STMT_OPERAND (stmt, 0) = name1;
539 SSA_NAME_DEF_STMT (name1) = stmt;
541 bsi_insert_on_edge_immediate (e, stmt);
542 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
543 (reduc->reduc_phi, loop_preheader_edge (loop)), init);
544 reduc->initial_value = name1;
545 return 1;
548 struct elv_data
550 struct loop *loop;
551 htab_t decl_address;
552 bool changed;
555 /* Eliminates references to local variables in *TP out of LOOP. DECL_ADDRESS
556 contains addresses of the references that had their address taken already.
557 If the expression is changed, CHANGED is set to true. Callback for
558 walk_tree. */
560 static tree
561 eliminate_local_variables_1 (tree * tp, int *walk_subtrees, void *data)
563 struct elv_data *dta = data;
564 tree t = *tp, var, addr, addr_type, type;
566 if (DECL_P (t))
568 *walk_subtrees = 0;
570 if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
571 return NULL_TREE;
573 type = TREE_TYPE (t);
574 addr_type = build_pointer_type (type);
575 addr = take_address_of (t, addr_type, dta->loop, dta->decl_address);
576 *tp = build1 (INDIRECT_REF, TREE_TYPE (*tp), addr);
578 dta->changed = true;
579 return NULL_TREE;
582 if (TREE_CODE (t) == ADDR_EXPR)
584 var = TREE_OPERAND (t, 0);
585 if (!DECL_P (var))
586 return NULL_TREE;
588 *walk_subtrees = 0;
589 if (!SSA_VAR_P (var) || DECL_EXTERNAL (var))
590 return NULL_TREE;
592 addr_type = TREE_TYPE (t);
593 addr = take_address_of (var, addr_type, dta->loop, dta->decl_address);
594 *tp = addr;
596 dta->changed = true;
597 return NULL_TREE;
600 if (!EXPR_P (t) && !GIMPLE_STMT_P (t))
601 *walk_subtrees = 0;
603 return NULL_TREE;
606 /* Moves the references to local variables in STMT from LOOP. DECL_ADDRESS
607 contains the addresses of the references whose address we have already
608 taken. */
610 static void
611 eliminate_local_variables_stmt (struct loop *loop, tree stmt,
612 htab_t decl_address)
614 struct elv_data dta;
616 dta.loop = loop;
617 dta.decl_address = decl_address;
618 dta.changed = false;
620 walk_tree (&stmt, eliminate_local_variables_1, &dta, NULL);
622 if (dta.changed)
623 update_stmt (stmt);
626 /* Eliminates the references to local variables from LOOP.
627 This includes:
628 1) Taking the address of a local variable -- these are moved out of the
629 loop (and a temporary variable is created to hold the address if
630 necessary).
631 2) Dereferencing a local variable -- these are replaced with indirect
632 references. */
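/* A sketch of the effect (the variable names are made up for illustration):
   if the loop body contains

     p_1 = &x;
     x = 5;

   where x is a local variable, its address is taken once before the loop,

     x_addr_2 = &x;

   and inside the loop the references become

     p_1 = x_addr_2;
     *x_addr_2 = 5;

   so the body no longer mentions the local decl directly and can later be
   outlined into a separate function.  */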
634 static void
635 eliminate_local_variables (struct loop *loop)
637 basic_block bb, *body = get_loop_body (loop);
638 unsigned i;
639 block_stmt_iterator bsi;
640 htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
641 free);
643 /* Find and rename the ssa names defined outside of loop. */
644 for (i = 0; i < loop->num_nodes; i++)
646 bb = body[i];
648 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
649 eliminate_local_variables_stmt (loop, bsi_stmt (bsi), decl_address);
652 htab_delete (decl_address);
655 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
656 The copies are stored to NAME_COPIES; if NAME was already duplicated,
657 its duplicate stored in NAME_COPIES is returned.
659 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
660 duplicated, storing the copies in DECL_COPIES. */
662 static tree
663 separate_decls_in_loop_name (tree name,
664 htab_t name_copies, htab_t decl_copies,
665 bool copy_name_p)
667 tree copy, var, var_copy;
668 unsigned idx, uid, nuid;
669 struct int_tree_map ielt, *nielt;
670 struct name_to_copy_elt elt, *nelt;
671 void **slot, **dslot;
673 if (TREE_CODE (name) != SSA_NAME)
674 return name;
676 idx = SSA_NAME_VERSION (name);
677 elt.version = idx;
678 slot = htab_find_slot_with_hash (name_copies, &elt, idx,
679 copy_name_p ? INSERT : NO_INSERT);
680 if (slot && *slot)
681 return ((struct name_to_copy_elt *) *slot)->new_name;
683 var = SSA_NAME_VAR (name);
684 uid = DECL_UID (var);
685 ielt.uid = uid;
686 dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
687 if (!*dslot)
689 var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
690 DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
691 add_referenced_var (var_copy);
692 nielt = XNEW (struct int_tree_map);
693 nielt->uid = uid;
694 nielt->to = var_copy;
695 *dslot = nielt;
697 /* Ensure that when we meet this decl next time, we won't duplicate
698 it again. */
699 nuid = DECL_UID (var_copy);
700 ielt.uid = nuid;
701 dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
702 gcc_assert (!*dslot);
703 nielt = XNEW (struct int_tree_map);
704 nielt->uid = nuid;
705 nielt->to = var_copy;
706 *dslot = nielt;
708 else
709 var_copy = ((struct int_tree_map *) *dslot)->to;
711 if (copy_name_p)
713 copy = duplicate_ssa_name (name, NULL_TREE);
714 nelt = XNEW (struct name_to_copy_elt);
715 nelt->version = idx;
716 nelt->new_name = copy;
717 nelt->field = NULL_TREE;
718 *slot = nelt;
720 else
722 gcc_assert (!slot);
723 copy = name;
726 SSA_NAME_VAR (copy) = var_copy;
727 return copy;
730 /* Finds the ssa names used in STMT that are defined outside of LOOP and
731 replaces such ssa names with their duplicates. The duplicates are stored to
732 NAME_COPIES. Base decls of all ssa names used in STMT
733 (including those defined in LOOP) are replaced with the new temporary
734 variables; the replacement decls are stored in DECL_COPIES. */
736 static void
737 separate_decls_in_loop_stmt (struct loop *loop, tree stmt,
738 htab_t name_copies, htab_t decl_copies)
740 use_operand_p use;
741 def_operand_p def;
742 ssa_op_iter oi;
743 tree name, copy;
744 bool copy_name_p;
746 mark_virtual_ops_for_renaming (stmt);
748 FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
750 name = DEF_FROM_PTR (def);
751 gcc_assert (TREE_CODE (name) == SSA_NAME);
752 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
753 false);
754 gcc_assert (copy == name);
757 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
759 name = USE_FROM_PTR (use);
760 if (TREE_CODE (name) != SSA_NAME)
761 continue;
763 copy_name_p = expr_invariant_in_loop_p (loop, name);
764 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
765 copy_name_p);
766 SET_USE (use, copy);
770 /* Callback for htab_traverse. Adds a field corresponding to the reduction
771 specified in SLOT. The type is passed in DATA. */
773 static int
774 add_field_for_reduction (void **slot, void *data)
777 struct reduction_info *red = *slot;
778 tree type = data;
779 tree var = SSA_NAME_VAR (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
780 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
782 insert_field_into_struct (type, field);
784 red->field = field;
786 return 1;
789 /* Callback for htab_traverse. Adds a field corresponding to an ssa name
790 described in SLOT. The type is passed in DATA. */
792 static int
793 add_field_for_name (void **slot, void *data)
795 struct name_to_copy_elt *elt = *slot;
796 tree type = data;
797 tree name = ssa_name (elt->version);
798 tree var = SSA_NAME_VAR (name);
799 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
801 insert_field_into_struct (type, field);
802 elt->field = field;
804 return 1;
807 /* Callback for htab_traverse. A local result is the intermediate result
808 computed by a single
809 thread, or the initial value in case no iteration was executed.
810 This function creates a phi node reflecting these values.
811 The phi's result will be stored in NEW_PHI field of the
812 reduction's data structure. */
814 static int
815 create_phi_for_local_result (void **slot, void *data)
817 struct reduction_info *reduc = *slot;
818 struct loop *loop = data;
819 edge e;
820 tree new_phi;
821 basic_block store_bb;
822 tree local_res;
824 /* STORE_BB is the block where the phi
825 should be stored. It is the destination of the loop exit.
826 (Find the fallthru edge from OMP_CONTINUE). */
827 store_bb = FALLTHRU_EDGE (loop->latch)->dest;
829 /* STORE_BB has two predecessors. One comes from the loop
830 (where the reduction's result is computed),
831 and the other comes from a block preceding the loop,
832 which is taken when no iterations
833 are executed (then the initial value should be used). */
834 if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch))
835 e = EDGE_PRED (store_bb, 1);
836 else
837 e = EDGE_PRED (store_bb, 0);
838 local_res = make_ssa_name (SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0)), NULL_TREE);
839 new_phi = create_phi_node (local_res, store_bb);
840 SSA_NAME_DEF_STMT (local_res) = new_phi;
841 add_phi_arg (new_phi, reduc->init, e);
842 add_phi_arg (new_phi, GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0),
843 FALLTHRU_EDGE (loop->latch));
844 reduc->new_phi = new_phi;
846 return 1;
849 struct clsn_data
851 tree store;
852 tree load;
854 basic_block store_bb;
855 basic_block load_bb;
858 /* Callback for htab_traverse. Create an atomic instruction for the
859 reduction described in SLOT.
860 DATA annotates the place in memory the atomic operation relates to,
861 and the basic block it needs to be generated in. */
863 static int
864 create_call_for_reduction_1 (void **slot, void *data)
866 struct reduction_info *reduc = *slot;
867 struct clsn_data *clsn_data = data;
868 block_stmt_iterator bsi;
869 tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
870 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
871 tree load_struct;
872 basic_block bb;
873 basic_block new_bb;
874 edge e;
875 tree t, addr, addr_type, ref, x;
876 tree tmp_load, load, name;
878 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
879 t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
880 addr_type = build_pointer_type (type);
882 addr = build_addr (t, current_function_decl);
884 /* Create phi node. */
885 bb = clsn_data->load_bb;
887 e = split_block (bb, t);
888 new_bb = e->dest;
890 tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL);
891 add_referenced_var (tmp_load);
892 tmp_load = make_ssa_name (tmp_load, NULL);
893 load = build2 (OMP_ATOMIC_LOAD, void_type_node, tmp_load, addr);
894 SSA_NAME_DEF_STMT (tmp_load) = load;
895 bsi = bsi_start (new_bb);
896 bsi_insert_after (&bsi, load, BSI_NEW_STMT);
898 e = split_block (new_bb, load);
899 new_bb = e->dest;
900 bsi = bsi_start (new_bb);
901 ref = tmp_load;
903 fold_build2 (reduc->reduction_code,
904 TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
905 PHI_RESULT (reduc->new_phi));
907 name =
908 force_gimple_operand_bsi (&bsi, x, true, NULL_TREE, true,
909 BSI_CONTINUE_LINKING);
911 x = build1 (OMP_ATOMIC_STORE, void_type_node, name);
913 bsi_insert_after (&bsi, x, BSI_NEW_STMT);
914 return 1;
917 /* Create the atomic operation at the join point of the threads.
918 REDUCTION_LIST describes the reductions in LOOP.
919 LD_ST_DATA describes the shared data structure that the
920 shared data is stored in and loaded from. */
921 static void
922 create_call_for_reduction (struct loop *loop, htab_t reduction_list,
923 struct clsn_data *ld_st_data)
925 htab_traverse (reduction_list, create_phi_for_local_result, loop);
926 /* Find the fallthru edge from OMP_CONTINUE. */
927 ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest;
928 htab_traverse (reduction_list, create_call_for_reduction_1, ld_st_data);
931 /* Callback for htab_traverse. Create a new variable that holds the
932 final reduction value loaded at the
933 join point of all threads, add to it the initial value the reduction
934 variable had before the parallel computation started, and
935 insert the result in the right place. */
937 static int
938 create_loads_for_reductions (void **slot, void *data)
940 struct reduction_info *red = *slot;
941 struct clsn_data *clsn_data = data;
942 tree stmt;
943 block_stmt_iterator bsi;
944 tree type = TREE_TYPE (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
945 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
946 tree load_struct;
947 tree bvar, name;
948 tree x;
950 bsi = bsi_after_labels (clsn_data->load_bb);
951 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
952 load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
953 NULL_TREE);
954 bvar = create_tmp_var (type, "reduction_final");
955 add_referenced_var (bvar);
957 /* Apply the reduction operation between the new variable, which is the
958 result of the computation of all threads, and the initial value, which
959 is kept at reduction->initial_value. */
961 stmt = build_gimple_modify_stmt (bvar, load_struct);
962 name = make_ssa_name (bvar, stmt);
963 GIMPLE_STMT_OPERAND (stmt, 0) = name;
964 SSA_NAME_DEF_STMT (name) = stmt;
965 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
967 fold_build2 (red->reduction_code, TREE_TYPE (load_struct),
968 name, red->initial_value);
969 name = PHI_RESULT (red->keep_res);
970 stmt = build_gimple_modify_stmt (name, x);
971 GIMPLE_STMT_OPERAND (stmt, 0) = name;
972 SSA_NAME_DEF_STMT (name) = stmt;
974 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
976 remove_phi_node (red->keep_res, NULL_TREE, false);
978 return 1;
981 /* Load the reduction result that was stored in LD_ST_DATA.
982 REDUCTION_LIST describes the list of reductions that the
983 loads should be generated for. */
984 static void
985 create_final_loads_for_reduction (htab_t reduction_list,
986 struct clsn_data *ld_st_data)
988 block_stmt_iterator bsi;
989 tree t;
991 bsi = bsi_after_labels (ld_st_data->load_bb);
992 t = build_fold_addr_expr (ld_st_data->store);
994 build_gimple_modify_stmt (ld_st_data->load,
995 build_fold_addr_expr (ld_st_data->store));
997 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
998 SSA_NAME_DEF_STMT (ld_st_data->load) = t;
999 GIMPLE_STMT_OPERAND (t, 0) = ld_st_data->load;
1001 htab_traverse (reduction_list, create_loads_for_reductions, ld_st_data);
1005 /* Callback for htab_traverse. Store the neutral value for the
1006 particular reduction's operation, e.g. 0 for PLUS_EXPR,
1007 1 for MULT_EXPR, etc. into the reduction field.
1008 The reduction is specified in SLOT. The store information is
1009 passed in DATA. */
1011 static int
1012 create_stores_for_reduction (void **slot, void *data)
1014 struct reduction_info *red = *slot;
1015 struct clsn_data *clsn_data = data;
1016 tree stmt;
1017 block_stmt_iterator bsi;
1018 tree type = TREE_TYPE (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
1020 bsi = bsi_last (clsn_data->store_bb);
1021 stmt =
1022 build_gimple_modify_stmt (build3
1023 (COMPONENT_REF, type, clsn_data->store,
1024 red->field, NULL_TREE),
1025 red->init );
1026 mark_virtual_ops_for_renaming (stmt);
1027 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1029 return 1;
1032 /* Callback for htab_traverse. Creates a load from a field of LOAD in LOAD_BB
1033 and a store to a field of STORE in STORE_BB for the ssa name and its duplicate
1034 specified in SLOT. */
1036 static int
1037 create_loads_and_stores_for_name (void **slot, void *data)
1039 struct name_to_copy_elt *elt = *slot;
1040 struct clsn_data *clsn_data = data;
1041 tree stmt;
1042 block_stmt_iterator bsi;
1043 tree type = TREE_TYPE (elt->new_name);
1044 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
1045 tree load_struct;
1047 bsi = bsi_last (clsn_data->store_bb);
1048 stmt =
1049 build_gimple_modify_stmt (build3
1050 (COMPONENT_REF, type, clsn_data->store,
1051 elt->field, NULL_TREE),
1052 ssa_name (elt->version));
1053 mark_virtual_ops_for_renaming (stmt);
1054 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1056 bsi = bsi_last (clsn_data->load_bb);
1057 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
1058 stmt = build_gimple_modify_stmt (elt->new_name,
1059 build3 (COMPONENT_REF, type, load_struct,
1060 elt->field, NULL_TREE));
1061 SSA_NAME_DEF_STMT (elt->new_name) = stmt;
1062 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1064 return 1;
1067 /* Moves all the variables used in LOOP and defined outside of it (including
1068 the initial values of loop phi nodes, and *PER_THREAD if it is an ssa
1069 name) to a structure created for this purpose. The code
1071 while (1)
1073 use (a);
1074 use (b);
1077 is transformed this way:
1079 bb0:
1080 old.a = a;
1081 old.b = b;
1083 bb1:
1084 a' = new->a;
1085 b' = new->b;
1086 while (1)
1088 use (a');
1089 use (b');
1092 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
1093 pointer `new' is intentionally not initialized (the loop will be split to a
1094 separate function later, and `new' will be initialized from its arguments).
1095 LD_ST_DATA holds information about the shared data structure used to pass
1096 information among the threads. It is initialized here, and
1097 gen_parallel_loop will pass it to create_call_for_reduction that
1098 needs this information. REDUCTION_LIST describes the reductions
1099 in LOOP. */
1101 static void
1102 separate_decls_in_loop (struct loop *loop, htab_t reduction_list,
1103 tree * arg_struct, tree * new_arg_struct,
1104 struct clsn_data *ld_st_data)
1107 basic_block bb1 = split_edge (loop_preheader_edge (loop));
1108 basic_block bb0 = single_pred (bb1);
1109 htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
1110 name_to_copy_elt_eq, free);
1111 htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
1112 free);
1113 basic_block bb, *body = get_loop_body (loop);
1114 unsigned i;
1115 tree phi, type, type_name, nvar;
1116 block_stmt_iterator bsi;
1117 struct clsn_data clsn_data;
1119 /* Find and rename the ssa names defined outside of loop. */
1120 for (i = 0; i < loop->num_nodes; i++)
1122 bb = body[i];
1124 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1125 separate_decls_in_loop_stmt (loop, phi, name_copies, decl_copies);
1127 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
1128 separate_decls_in_loop_stmt (loop, bsi_stmt (bsi), name_copies,
1129 decl_copies);
1131 free (body);
1133 if (htab_elements (name_copies) == 0)
1135 /* It may happen that there is nothing to copy (if there are only
1136 loop-carried and external variables in the loop). */
1137 *arg_struct = NULL;
1138 *new_arg_struct = NULL;
1140 else
1142 /* Create the type for the structure to store the ssa names to. */
1143 type = lang_hooks.types.make_type (RECORD_TYPE);
1144 type_name = build_decl (TYPE_DECL, create_tmp_var_name (".paral_data"),
1145 type);
1146 TYPE_NAME (type) = type_name;
1148 htab_traverse (name_copies, add_field_for_name, type);
1149 if (htab_elements (reduction_list) > 0)
1151 /* Create the fields for reductions. */
1152 htab_traverse (reduction_list, add_field_for_reduction,
1153 type);
1155 layout_type (type);
1157 /* Create the loads and stores. */
1158 *arg_struct = create_tmp_var (type, ".paral_data_store");
1159 add_referenced_var (*arg_struct);
1160 nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1161 add_referenced_var (nvar);
1162 *new_arg_struct = make_ssa_name (nvar, NULL_TREE);
1164 ld_st_data->store = *arg_struct;
1165 ld_st_data->load = *new_arg_struct;
1166 ld_st_data->store_bb = bb0;
1167 ld_st_data->load_bb = bb1;
1169 htab_traverse (name_copies, create_loads_and_stores_for_name,
1170 ld_st_data);
1172 /* Load the calculation from memory into a new
1173 reduction variable (after the join of the threads). */
1174 if (htab_elements (reduction_list) > 0)
1176 htab_traverse (reduction_list, create_stores_for_reduction,
1177 ld_st_data);
1178 clsn_data.load = make_ssa_name (nvar, NULL_TREE);
1179 clsn_data.load_bb = single_dom_exit (loop)->dest;
1180 clsn_data.store = ld_st_data->store;
1181 create_final_loads_for_reduction (reduction_list, &clsn_data);
1185 htab_delete (decl_copies);
1186 htab_delete (name_copies);
1189 /* Bitmap containing uids of functions created by parallelization. We cannot
1190 allocate it from the default obstack, as it must live across compilation
1191 of several functions; we make it gc allocated instead. */
1193 static GTY(()) bitmap parallelized_functions;
1195 /* Returns true if FN was created by create_loop_fn. */
1197 static bool
1198 parallelized_function_p (tree fn)
1200 if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
1201 return false;
1203 return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
1206 /* Creates and returns an empty function that will receive the body of
1207 a parallelized loop. */
1209 static tree
1210 create_loop_fn (void)
1212 char buf[100];
1213 char *tname;
1214 tree decl, type, name, t;
1215 struct function *act_cfun = cfun;
1216 static unsigned loopfn_num;
1218 snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1219 ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1220 clean_symbol_name (tname);
1221 name = get_identifier (tname);
1222 type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1224 decl = build_decl (FUNCTION_DECL, name, type);
1225 if (!parallelized_functions)
1226 parallelized_functions = BITMAP_GGC_ALLOC ();
1227 bitmap_set_bit (parallelized_functions, DECL_UID (decl));
1229 TREE_STATIC (decl) = 1;
1230 TREE_USED (decl) = 1;
1231 DECL_ARTIFICIAL (decl) = 1;
1232 DECL_IGNORED_P (decl) = 0;
1233 TREE_PUBLIC (decl) = 0;
1234 DECL_UNINLINABLE (decl) = 1;
1235 DECL_EXTERNAL (decl) = 0;
1236 DECL_CONTEXT (decl) = NULL_TREE;
1237 DECL_INITIAL (decl) = make_node (BLOCK);
1239 t = build_decl (RESULT_DECL, NULL_TREE, void_type_node);
1240 DECL_ARTIFICIAL (t) = 1;
1241 DECL_IGNORED_P (t) = 1;
1242 DECL_RESULT (decl) = t;
1244 t = build_decl (PARM_DECL, get_identifier (".paral_data_param"),
1245 ptr_type_node);
1246 DECL_ARTIFICIAL (t) = 1;
1247 DECL_ARG_TYPE (t) = ptr_type_node;
1248 DECL_CONTEXT (t) = decl;
1249 TREE_USED (t) = 1;
1250 DECL_ARGUMENTS (decl) = t;
1252 allocate_struct_function (decl, false);
1254 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1255 it. */
1256 set_cfun (act_cfun);
1258 return decl;
1261 /* Bases all the induction variables in LOOP on a single induction variable
1262 (unsigned with base 0 and step 1), whose final value is compared with
1263 NIT. The induction variable is incremented in the loop latch.
1264 REDUCTION_LIST describes the reductions in LOOP. */
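/* For illustration (a simplified sketch): if the loop header has an integer
   iv i = PHI <0, i + 1> and a pointer iv p = PHI <buf, p + 4>, a single
   canonical unsigned iv I with base 0 and step 1, incremented in the loop
   latch, is created, the old phi nodes are removed, and the header instead
   computes roughly

     i = (int) (0 + I * 1);
     p = buf + I * 4;

   while phi nodes that belong to reductions are left untouched.  */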
1266 static void
1267 canonicalize_loop_ivs (struct loop *loop, htab_t reduction_list, tree nit)
1269 unsigned precision = TYPE_PRECISION (TREE_TYPE (nit));
1270 tree phi, prev, res, type, var_before, val, atype, mtype, t, next;
1271 block_stmt_iterator bsi;
1272 bool ok;
1273 affine_iv iv;
1274 edge exit = single_dom_exit (loop);
1275 struct reduction_info *red;
1277 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
1279 res = PHI_RESULT (phi);
1281 if (is_gimple_reg (res) && TYPE_PRECISION (TREE_TYPE (res)) > precision)
1282 precision = TYPE_PRECISION (TREE_TYPE (res));
1285 type = lang_hooks.types.type_for_size (precision, 1);
1287 bsi = bsi_last (loop->latch);
1288 create_iv (build_int_cst_type (type, 0), build_int_cst (type, 1), NULL_TREE,
1289 loop, &bsi, true, &var_before, NULL);
1291 bsi = bsi_after_labels (loop->header);
1292 prev = NULL;
1293 for (phi = phi_nodes (loop->header); phi; phi = next)
1295 next = PHI_CHAIN (phi);
1296 res = PHI_RESULT (phi);
1298 if (!is_gimple_reg (res) || res == var_before)
1300 prev = phi;
1301 continue;
1304 ok = simple_iv (loop, phi, res, &iv, true);
1305 red = reduction_phi (reduction_list, phi);
1306 /* We preserve the reduction phi nodes. */
1307 if (!ok && red)
1309 prev = phi;
1310 continue;
1312 else
1313 gcc_assert (ok);
1314 remove_phi_node (phi, prev, false);
1316 atype = TREE_TYPE (res);
1317 mtype = POINTER_TYPE_P (atype) ? sizetype : atype;
1318 val = fold_build2 (MULT_EXPR, mtype, unshare_expr (iv.step),
1319 fold_convert (mtype, var_before));
1320 val = fold_build2 (POINTER_TYPE_P (atype)
1321 ? POINTER_PLUS_EXPR : PLUS_EXPR,
1322 atype, unshare_expr (iv.base), val);
1323 val = force_gimple_operand_bsi (&bsi, val, false, NULL_TREE, true,
1324 BSI_SAME_STMT);
1325 t = build_gimple_modify_stmt (res, val);
1326 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1327 SSA_NAME_DEF_STMT (res) = t;
1330 t = last_stmt (exit->src);
1331 /* Make the loop exit if the control condition is not satisfied. */
1332 if (exit->flags & EDGE_TRUE_VALUE)
1334 edge te, fe;
1336 extract_true_false_edges_from_block (exit->src, &te, &fe);
1337 te->flags = EDGE_FALSE_VALUE;
1338 fe->flags = EDGE_TRUE_VALUE;
1340 COND_EXPR_COND (t) = build2 (LT_EXPR, boolean_type_node, var_before, nit);
1343 /* Moves the exit condition of LOOP to the beginning of its header, and
1344 duplicates the part of the last iteration that gets disabled to the
1345 exit of the loop. NIT is the number of iterations of the loop
1346 (used to initialize the variables in the duplicated part).
1348 TODO: the common case is that the latch of the loop is empty and immediately
1349 follows the loop exit. In this case, it would be better not to copy the
1350 body of the loop, but only move the entry of the loop directly before the
1351 exit check and increase the number of iterations of the loop by one.
1352 This may need some additional preconditioning in case NIT = ~0.
1353 REDUCTION_LIST describes the reductions in LOOP. */
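/* A simplified sketch of the shape change (phi node and CFG details omitted):

     loop:  BODY1; if (COND) goto exit; BODY2; goto loop;
     exit:

   becomes roughly

     loop:  if (COND) goto exit; BODY1; BODY2; goto loop;
     exit:  copy of BODY1;   <- the part the last iteration no longer runs

   which gives the loop the exit-first shape that create_parallel_loop and the
   OMP_FOR expansion expect.  */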
1355 static void
1356 transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit)
1358 basic_block *bbs, *nbbs, ex_bb, orig_header;
1359 unsigned n;
1360 bool ok;
1361 edge exit = single_dom_exit (loop), hpred;
1362 tree phi, nphi, cond, control, control_name, res, t, cond_stmt;
1363 block_stmt_iterator bsi;
1365 split_block_after_labels (loop->header);
1366 orig_header = single_succ (loop->header);
1367 hpred = single_succ_edge (loop->header);
1369 cond_stmt = last_stmt (exit->src);
1370 cond = COND_EXPR_COND (cond_stmt);
1371 control = TREE_OPERAND (cond, 0);
1372 gcc_assert (TREE_OPERAND (cond, 1) == nit);
1374 /* Make sure that we have phi nodes on exit for all loop header phis
1375 (create_parallel_loop requires that). */
1376 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
1378 res = PHI_RESULT (phi);
1379 t = make_ssa_name (SSA_NAME_VAR (res), phi);
1380 SET_PHI_RESULT (phi, t);
1382 nphi = create_phi_node (res, orig_header);
1383 SSA_NAME_DEF_STMT (res) = nphi;
1384 add_phi_arg (nphi, t, hpred);
1386 if (res == control)
1388 TREE_OPERAND (cond, 0) = t;
1389 update_stmt (cond_stmt);
1390 control = t;
1394 bbs = get_loop_body_in_dom_order (loop);
1395 for (n = 0; bbs[n] != exit->src; n++)
1396 continue;
1397 nbbs = XNEWVEC (basic_block, n);
1398 ok = tree_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1399 bbs + 1, n, nbbs);
1400 gcc_assert (ok);
1401 free (bbs);
1402 ex_bb = nbbs[0];
1403 free (nbbs);
1405 /* Other than reductions, the only gimple reg that should be copied
1406 out of the loop is the control variable. */
1408 control_name = NULL_TREE;
1409 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
1411 res = PHI_RESULT (phi);
1412 if (!is_gimple_reg (res))
1413 continue;
1415 /* Check if it is a part of a reduction. If it is,
1416 keep the phi in the reduction's keep_res field. The
1417 PHI_RESULT of this phi is the resulting value of the reduction
1418 variable when exiting the loop. */
1420 exit = single_dom_exit (loop);
1422 if (htab_elements (reduction_list) > 0)
1424 struct reduction_info *red;
1426 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1428 red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1429 if (red)
1430 red->keep_res = phi;
1432 else
1433 gcc_assert (control_name == NULL_TREE
1434 && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
1435 control_name = res;
1437 gcc_assert (control_name != NULL_TREE);
1438 phi = SSA_NAME_DEF_STMT (control_name);
1439 remove_phi_node (phi, NULL_TREE, false);
1441 /* Initialize the control variable to NIT. */
1442 bsi = bsi_after_labels (ex_bb);
1443 t = build_gimple_modify_stmt (control_name, nit);
1444 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
1445 SSA_NAME_DEF_STMT (control_name) = t;
1448 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
1449 LOOP_FN and DATA are the arguments of OMP_PARALLEL.
1450 NEW_DATA is the variable that should be initialized from the argument
1451 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
1452 basic block containing the OMP_PARALLEL tree. */
1454 static basic_block
1455 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
1456 tree new_data, unsigned n_threads)
1458 block_stmt_iterator bsi;
1459 basic_block bb, paral_bb, for_bb, ex_bb;
1460 tree t, param, res, for_stmt;
1461 tree cvar, cvar_init, initvar, cvar_next, cvar_base, cond, phi, type;
1462 edge exit, nexit, guard, end, e;
1464 /* Prepare the OMP_PARALLEL statement. */
1465 bb = loop_preheader_edge (loop)->src;
1466 paral_bb = single_pred (bb);
1467 bsi = bsi_last (paral_bb);
1469 t = build_omp_clause (OMP_CLAUSE_NUM_THREADS);
1470 OMP_CLAUSE_NUM_THREADS_EXPR (t)
1471 = build_int_cst (integer_type_node, n_threads);
1472 t = build4 (OMP_PARALLEL, void_type_node, NULL_TREE, t, loop_fn, data);
1474 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
1476 /* Initialize NEW_DATA. */
1477 if (data)
1479 bsi = bsi_after_labels (bb);
1481 param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL_TREE);
1482 t = build_gimple_modify_stmt (param, build_fold_addr_expr (data));
1483 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1484 SSA_NAME_DEF_STMT (param) = t;
1486 t = build_gimple_modify_stmt (new_data,
1487 fold_convert (TREE_TYPE (new_data),
1488 param));
1489 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1490 SSA_NAME_DEF_STMT (new_data) = t;
1493 /* Emit OMP_RETURN for OMP_PARALLEL. */
1494 bb = split_loop_exit_edge (single_dom_exit (loop));
1495 bsi = bsi_last (bb);
1496 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
1498 /* Extract data for OMP_FOR. */
1499 gcc_assert (loop->header == single_dom_exit (loop)->src);
1500 cond = COND_EXPR_COND (last_stmt (loop->header));
1502 cvar = TREE_OPERAND (cond, 0);
1503 cvar_base = SSA_NAME_VAR (cvar);
1504 phi = SSA_NAME_DEF_STMT (cvar);
1505 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1506 initvar = make_ssa_name (cvar_base, NULL_TREE);
1507 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
1508 initvar);
1509 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
1511 bsi = bsi_last (loop->latch);
1512 gcc_assert (bsi_stmt (bsi) == SSA_NAME_DEF_STMT (cvar_next));
1513 bsi_remove (&bsi, true);
1515 /* Prepare cfg. */
1516 for_bb = split_edge (loop_preheader_edge (loop));
1517 ex_bb = split_loop_exit_edge (single_dom_exit (loop));
1518 extract_true_false_edges_from_block (loop->header, &nexit, &exit);
1519 gcc_assert (exit == single_dom_exit (loop));
1521 guard = make_edge (for_bb, ex_bb, 0);
1522 single_succ_edge (loop->latch)->flags = 0;
1523 end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
1524 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
1526 res = PHI_RESULT (phi);
1527 gcc_assert (!is_gimple_reg (phi));
1528 t = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));
1529 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_preheader_edge (loop)),
1530 guard);
1531 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_latch_edge (loop)),
1532 end);
1534 e = redirect_edge_and_branch (exit, nexit->dest);
1535 PENDING_STMT (e) = NULL;
1537 /* Emit OMP_FOR. */
1538 TREE_OPERAND (cond, 0) = cvar_base;
1539 type = TREE_TYPE (cvar);
1540 t = build_omp_clause (OMP_CLAUSE_SCHEDULE);
1541 OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
1543 for_stmt = make_node (OMP_FOR);
1544 TREE_TYPE (for_stmt) = void_type_node;
1545 OMP_FOR_CLAUSES (for_stmt) = t;
1546 OMP_FOR_INIT (for_stmt) = build_gimple_modify_stmt (initvar, cvar_init);
1547 OMP_FOR_COND (for_stmt) = cond;
1548 OMP_FOR_INCR (for_stmt) = build_gimple_modify_stmt (cvar_base,
1549 build2 (PLUS_EXPR, type,
1550 cvar_base,
1551 build_int_cst
1552 (type, 1)));
1553 OMP_FOR_BODY (for_stmt) = NULL_TREE;
1554 OMP_FOR_PRE_BODY (for_stmt) = NULL_TREE;
1556 bsi = bsi_last (for_bb);
1557 bsi_insert_after (&bsi, for_stmt, BSI_NEW_STMT);
1558 SSA_NAME_DEF_STMT (initvar) = for_stmt;
1560 /* Emit OMP_CONTINUE. */
1561 bsi = bsi_last (loop->latch);
1562 t = build2 (OMP_CONTINUE, void_type_node, cvar_next, cvar);
1563 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
1564 SSA_NAME_DEF_STMT (cvar_next) = t;
1566 /* Emit OMP_RETURN for OMP_FOR. */
1567 bsi = bsi_last (ex_bb);
1568 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
1570 return paral_bb;
1573 /* Generates code to execute the iterations of LOOP in N_THREADS threads in
1574 parallel. NITER describes the number of iterations of LOOP.
1575 REDUCTION_LIST describes the reductions in LOOP. */
1577 static void
1578 gen_parallel_loop (struct loop *loop, htab_t reduction_list,
1579 unsigned n_threads, struct tree_niter_desc *niter)
1581 struct loop *nloop;
1582 tree many_iterations_cond, type, nit;
1583 tree stmts, arg_struct, new_arg_struct;
1584 basic_block parallel_head;
1585 struct clsn_data clsn_data;
1586 unsigned prob;
1588 /* From
1590 ---------------------------------------------------------------------
1591 loop
1593 IV = phi (INIT, IV + STEP)
1594 BODY1;
1595 if (COND)
1596 break;
1597 BODY2;
1599 ---------------------------------------------------------------------
1601 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1602 we generate the following code:
1604 ---------------------------------------------------------------------
1606 if (MAY_BE_ZERO
1607 || NITER < MIN_PER_THREAD * N_THREADS)
1608 goto original;
1610 BODY1;
1611 store all local loop-invariant variables used in the body of the loop to DATA.
1612 OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1613 load the variables from DATA.
1614 OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1615 BODY2;
1616 BODY1;
1617 OMP_CONTINUE;
1618 OMP_RETURN -- OMP_FOR
1619 OMP_RETURN -- OMP_PARALLEL
1620 goto end;
1622 original:
1623 loop
1625 IV = phi (INIT, IV + STEP)
1626 BODY1;
1627 if (COND)
1628 break;
1629 BODY2;
1632 end:
*/
1636 /* Create two versions of the loop -- in the old one, we know that the
1637 number of iterations is large enough, and we will transform it into the
1638 loop that will be split to loop_fn; the new one will be used for the
1639 remaining iterations. */
1641 type = TREE_TYPE (niter->niter);
1642 nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
1643 NULL_TREE);
1644 if (stmts)
1645 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1647 many_iterations_cond =
1648 fold_build2 (GE_EXPR, boolean_type_node,
1649 nit, build_int_cst (type, MIN_PER_THREAD * n_threads));
1650 many_iterations_cond
1651 = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1652 invert_truthvalue (unshare_expr (niter->may_be_zero)),
1653 many_iterations_cond);
1654 many_iterations_cond
1655 = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
1656 if (stmts)
1657 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1658 if (!is_gimple_condexpr (many_iterations_cond))
1660 many_iterations_cond
1661 = force_gimple_operand (many_iterations_cond, &stmts,
1662 true, NULL_TREE);
1663 if (stmts)
1664 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1667 initialize_original_copy_tables ();
1669 /* We assume that the loop usually iterates a lot. */
1670 prob = 4 * REG_BR_PROB_BASE / 5;
1671 nloop = loop_version (loop, many_iterations_cond, NULL,
1672 prob, prob, REG_BR_PROB_BASE - prob, true);
1673 update_ssa (TODO_update_ssa);
1674 free_original_copy_tables ();
1676 /* Base all the induction variables in LOOP on a single control one. */
1677 canonicalize_loop_ivs (loop, reduction_list, nit);
1679 /* Ensure that the exit condition is the first statement in the loop. */
1680 transform_to_exit_first_loop (loop, reduction_list, nit);
1683 /* Generate initializations for reductions. */
1685 if (htab_elements (reduction_list) > 0)
1686 htab_traverse (reduction_list, initialize_reductions, loop);
1688 /* Eliminate the references to local variables from the loop. */
1689 eliminate_local_variables (loop);
1691 /* In the old loop, move all variables non-local to the loop to a structure
1692 and back, and create separate decls for the variables used in the loop. */
1693 separate_decls_in_loop (loop, reduction_list, &arg_struct, &new_arg_struct, &clsn_data);
1695 /* Create the parallel constructs. */
1696 parallel_head = create_parallel_loop (loop, create_loop_fn (), arg_struct,
1697 new_arg_struct, n_threads);
1698 if (htab_elements (reduction_list) > 0)
1699 create_call_for_reduction (loop, reduction_list, &clsn_data);
1701 scev_reset ();
1703 /* Cancel the loop (it is simpler to do it here rather than to teach the
1704 expander to do it). */
1705 cancel_loop_tree (loop);
1707 /* Expand the parallel constructs. We do it directly here instead of running
1708 a separate expand_omp pass, since it is more efficient, and less likely to
1709 cause trouble with further analyses not being able to deal with the
1710 OMP trees. */
1712 omp_expand_local (parallel_head);
1715 /* Detect parallel loops and generate parallel code using libgomp
1716 primitives. Returns true if some loop was parallelized, false
1717 otherwise. */
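/* Usage note (not part of the implementation): this pass is driven by
   flag_tree_parallelize_loops, i.e. it is activated by compiling with
   something like

     gcc -O2 -ftree-parallelize-loops=4 file.c

   which asks for eligible loops to be distributed over four threads using
   the libgomp runtime.  */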
1719 bool
1720 parallelize_loops (void)
1722 unsigned n_threads = flag_tree_parallelize_loops;
1723 bool changed = false;
1724 struct loop *loop;
1725 struct tree_niter_desc niter_desc;
1726 loop_iterator li;
1727 htab_t reduction_list;
1729 /* Do not parallelize loops in the functions created by parallelization. */
1730 if (parallelized_function_p (cfun->decl))
1731 return false;
1733 reduction_list = htab_create (10, reduction_info_hash,
1734 reduction_info_eq, free);
1736 FOR_EACH_LOOP (li, loop, 0)
1738 htab_empty (reduction_list);
1739 if (/* Do not bother with loops in cold areas. */
1740 !maybe_hot_bb_p (loop->header)
1741 /* Or loops that roll too little. */
1742 || expected_loop_iterations (loop) <= n_threads
1743 /* And of course, the loop must be parallelizable. */
1744 || !can_duplicate_loop_p (loop)
1745 || !loop_parallel_p (loop, reduction_list, &niter_desc))
1746 continue;
1748 changed = true;
1749 gen_parallel_loop (loop, reduction_list, n_threads, &niter_desc);
1750 verify_flow_info ();
1751 verify_dominators (CDI_DOMINATORS);
1752 verify_loop_structure ();
1753 verify_loop_closed_ssa ();
1756 htab_delete (reduction_list);
1757 return changed;
1760 #include "gt-tree-parloops.h"