1 /* Loop autoparallelization.
2 Copyright (C) 2006 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
25 #include "coretypes.h"
29 #include "tree-flow.h"
32 #include "tree-data-ref.h"
33 #include "diagnostic.h"
34 #include "tree-pass.h"
35 #include "tree-scalar-evolution.h"
37 #include "langhooks.h"
38 #include "tree-vectorizer.h"
40 /* This pass tries to distribute iterations of loops into several threads.
41 The implementation is straightforward -- for each loop we test whether its
42 iterations are independent, and if it is the case (and some additional
43 conditions regarding profitability and correctness are satisfied), we
44 add OMP_PARALLEL and OMP_FOR codes and let omp expansion machinery do
47 The most of the complexity is in bringing the code into shape expected
49 -- for OMP_FOR, ensuring that the loop has only one induction variable
50 and that the exit test is at the start of the loop body
51 -- for OMP_PARALLEL, replacing the references to local addressable
52 variables by accesses through pointers, and breaking up ssa chains
53 by storing the values incoming to the parallelized loop to a structure
54 passed to the new function as an argument (something similar is done
55 in omp gimplification, unfortunately only a small part of the code
59 -- if there are several parallelizable loops in a function, it may be
60 possible to generate the threads just once (using synchronization to
61 ensure that cross-loop dependences are obeyed).
62 -- handling of common scalar dependence patterns (accumulation, ...)
63 -- handling of non-innermost loops */
67 currently we use vect_is_simple_reduction() to detect reduction patterns.
68 The code transformation will be introduced by an example.
76 for (i = 0; i < N/1000; i++)
87 # sum_24 = PHI <sum_14(3), 1(2)>;
88 # i_21 = PHI <i_15(3), 0(2)>;
92 sum_14 = D.2191_10 + sum_24;
94 if (N_8 > i_15) goto <L0>; else goto <L2>;
98 # sum_25 = PHI <sum_14(3)>;
102 after reduction transformation (only relevant parts):
110 D.2241_2 = (unsigned int) N_8;
111 D.2242_26 = D.2241_2 - 1;
112 if (D.2242_26 > 399) goto <L26>; else goto <L27>;
114 #two new variables are created for each reduction:
115 "reduction" is the variable holding the neutral element
116 for the particular operation, e.g. 0 for PLUS_EXPR,
117 1 for MULT_EXPR, etc.
118 "reduction_initial" is the initial value given by the user.
119 It is kept and will be used after the parallel computing
124 reduction_initial.39_43 = 1;
126 .paral_data_store.47.D.2261 = D.2242_26;
127 .paral_data_store.47.reduction.38 = reduction.38_42;
128 .paral_data_store.47.x.40 = x.40_44;
129 __builtin_GOMP_parallel_start (parloop._loopfn.0, &.paral_data_store.47, 4);
130 parloop._loopfn.0 (&.paral_data_store.47);
131 __builtin_GOMP_parallel_end ();
133 # collecting the result after the join of the threads is done at
134 create_loads_for_reductions().
135 a new variable "reduction_final" is created. It calculates the
136 final value from the initial value and the value computed by
139 .paral_data_load.48_49 = &.paral_data_store.47;
140 reduction_final.49_50 = .paral_data_load.48_49->reduction.38;
141 reduction_final.49_51 = reduction_initial.39_43 + reduction_final.49_50;
142 ivtmp.37_36 = D.2242_26;
143 i_37 = (int) ivtmp.37_36;
144 D.2191_38 = i_37 + 3;
146 sum_40 = D.2191_38 + reduction_final.49_51;
150 # sum_25 = PHI <sum_40(4), sum_9(6)>;
152 printf (&"sum is %d\n"[0], sum_25);
158 parloop._loopfn.0 (.paral_data_param)
163 .paral_data_param_52 = .paral_data_param_75;
164 .paral_data_load.48_48 = (struct .paral_data.46 *) .paral_data_param_52;
165 D.2289_46 = .paral_data_load.48_48->D.2261;
166 reduction.43_45 = .paral_data_load.48_48->reduction.38;
167 x.45_47 = .paral_data_load.48_48->x.40;
168 # SUCC: 23 [100.0%] (fallthru)
171 # PRED: 21 [100.0%] (fallthru)
173 D.2292_60 = __builtin_omp_get_num_threads ();
174 D.2293_61 = (unsigned int) D.2292_60;
175 D.2294_62 = __builtin_omp_get_thread_num ();
176 D.2295_63 = (unsigned int) D.2294_62;
177 D.2296_64 = D.2289_46 / D.2293_61;
178 D.2297_65 = D.2293_61 * D.2296_64;
179 D.2298_66 = D.2297_65 != D.2289_46;
180 D.2299_67 = D.2296_64 + D.2298_66;
181 D.2300_68 = D.2299_67 * D.2295_63;
182 D.2301_69 = D.2299_67 + D.2300_68;
183 D.2302_70 = MIN_EXPR <D.2301_69, D.2289_46>;
184 ivtmp.41_54 = D.2300_68;
185 if (D.2300_68 >= D.2302_70) goto <L31>; else goto <L32>;
186 # SUCC: 26 [100.0%] (false) 24 (true)
189 # PRED: 23 [100.0%] (false)
191 # SUCC: 4 [100.0%] (fallthru)
194 # PRED: 5 [100.0%] (true) 26 [100.0%] (fallthru)
195 # ivtmp.41_31 = PHI <ivtmp.41_30(5), ivtmp.41_54(26)>;
196 # sum.42_32 = PHI <sum.42_14(5), reduction.43_45(26)>;
198 # SUCC: 19 [100.0%] (fallthru)
201 # PRED: 4 [100.0%] (fallthru)
202 # sum.42_24 = PHI <sum.42_32(4)>;
203 # ivtmp.41_17 = PHI <ivtmp.41_31(4)>;
204 i.44_21 = (int) ivtmp.41_17;
205 D.2310_10 = i.44_21 + 3;
206 (*x.45_47)[i.44_21] = D.2310_10;
207 sum.42_14 = D.2310_10 + sum.42_24;
208 i.44_15 = i.44_21 + 1;
209 # SUCC: 5 [100.0%] (fallthru)
212 # PRED: 19 [100.0%] (fallthru)
214 ivtmp.41_30 = ivtmp.41_31 + 1;
215 if (ivtmp.41_30 < D.2302_70) goto <L0>; else goto <L31>;
216 # SUCC: 4 [100.0%] (true) 24 (false)
218 # Adding this reduction phi is done at
219 create_phi_for_local_result() #
222 # PRED: 5 (false) 23 (true)
223 # reduction.38_56 = PHI <sum.42_14(5), 0(23)>;
225 __builtin_GOMP_barrier ();
226 # SUCC: 25 [100.0%] (fallthru)
228 # Creating the atomic operation is
229 done at create_call_for_reduction_1() #
232 # PRED: 24 [100.0%] (fallthru)
233 D.2306_57 = &.paral_data_load.48_48->reduction.38;
234 D.2307_58 = (unsigned int) reduction.38_56;
235 D.2308_59 = __sync_fetch_and_add_4 (D.2306_57, D.2307_58);
236 # SUCC: 22 [100.0%] (fallthru)
239 # PRED: 25 [100.0%] (fallthru)
248 /* Minimal number of iterations of a loop that should be executed in each
250 #define MIN_PER_THREAD 100
252 /* Element of the hashtable, representing a
253 reduction in the current loop. */
254 struct reduction_info
256 tree reduc_stmt
; /* reduction statement. */
257 tree reduc_phi
; /* The phi node defining the reduction. */
258 enum tree_code reduction_code
; /* code for the reduction operation. */
259 tree keep_res
; /* The PHI_RESULT of this phi is the resulting value
260 of the reduction variable when existing the loop. */
261 tree initial_value
; /* An ssa name representing a new variable holding
262 the initial value of the reduction var before entering the loop. */
263 tree field
; /* the name of the field in the parloop data structure intended for reduction. */
264 tree reduction_init
; /* An ssa name representing a new variable which will be
265 assigned the proper reduction initialization value (init). */
266 tree init
; /* reduction initialization value. */
267 tree new_phi
; /* (helper field) Newly created phi node whose result
268 will be passed to the atomic operation. Represents
269 the local result each thread computed for the reduction
273 /* Equality and hash functions for hashtab code. */
276 reduction_info_eq (const void *aa
, const void *bb
)
278 const struct reduction_info
*a
= (const struct reduction_info
*) aa
;
279 const struct reduction_info
*b
= (const struct reduction_info
*) bb
;
281 return (a
->reduc_phi
== b
->reduc_phi
);
285 reduction_info_hash (const void *aa
)
287 const struct reduction_info
*a
= (const struct reduction_info
*) aa
;
289 return htab_hash_pointer (a
->reduc_phi
);
292 static struct reduction_info
*
293 reduction_phi (htab_t reduction_list
, tree phi
)
295 struct reduction_info tmpred
, *red
;
297 if (htab_elements (reduction_list
) == 0)
300 tmpred
.reduc_phi
= phi
;
301 red
= htab_find (reduction_list
, &tmpred
);
306 /* Element of hashtable of names to copy. */
308 struct name_to_copy_elt
310 unsigned version
; /* The version of the name to copy. */
311 tree new_name
; /* The new name used in the copy. */
312 tree field
; /* The field of the structure used to pass the
316 /* Equality and hash functions for hashtab code. */
319 name_to_copy_elt_eq (const void *aa
, const void *bb
)
321 const struct name_to_copy_elt
*a
= (const struct name_to_copy_elt
*) aa
;
322 const struct name_to_copy_elt
*b
= (const struct name_to_copy_elt
*) bb
;
324 return a
->version
== b
->version
;
328 name_to_copy_elt_hash (const void *aa
)
330 const struct name_to_copy_elt
*a
= (const struct name_to_copy_elt
*) aa
;
332 return (hashval_t
) a
->version
;
335 /* Returns true if the iterations of LOOP are independent on each other (that
336 is, if we can execute them in parallel), and if LOOP satisfies other
337 conditions that we need to be able to parallelize it. Description of number
338 of iterations is stored to NITER. Reduction analysis is done, if
339 reductions are found, they are inserted to the REDUCTION_LIST. */
342 loop_parallel_p (struct loop
*loop
, htab_t reduction_list
, struct tree_niter_desc
*niter
)
344 edge exit
= single_dom_exit (loop
);
345 VEC (ddr_p
, heap
) * dependence_relations
;
346 VEC (data_reference_p
, heap
) * datarefs
;
347 lambda_trans_matrix trans
;
350 loop_vec_info simple_loop_info
;
352 /* Only consider innermost loops with just one exit. The innermost-loop
353 restriction is not necessary, but it makes things simpler. */
354 if (loop
->inner
|| !exit
)
357 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
358 fprintf (dump_file
, "\nConsidering loop %d\n", loop
->num
);
360 /* We need to know # of iterations, and there should be no uses of values
361 defined inside loop outside of it, unless the values are invariants of
363 if (!number_of_iterations_exit (loop
, exit
, niter
, false))
365 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
366 fprintf (dump_file
, " FAILED: number of iterations not known\n");
370 simple_loop_info
= vect_analyze_loop_form (loop
);
372 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
374 tree reduc_stmt
= NULL
, operation
;
376 /* ??? TODO: Change this into a generic function that
377 recognizes reductions. */
378 if (!is_gimple_reg (PHI_RESULT (phi
)))
380 if (simple_loop_info
)
381 reduc_stmt
= vect_is_simple_reduction (simple_loop_info
, phi
);
383 /* Create a reduction_info struct, initialize it and insert it to
384 the reduction list. */
389 struct reduction_info
*new_reduction
;
391 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
394 "Detected reduction. reduction stmt is: \n");
395 print_generic_stmt (dump_file
, reduc_stmt
, 0);
396 fprintf (dump_file
, "\n");
399 new_reduction
= XCNEW (struct reduction_info
);
401 new_reduction
->reduc_stmt
= reduc_stmt
;
402 new_reduction
->reduc_phi
= phi
;
403 operation
= GIMPLE_STMT_OPERAND (reduc_stmt
, 1);
404 new_reduction
->reduction_code
= TREE_CODE (operation
);
405 slot
= htab_find_slot (reduction_list
, new_reduction
, INSERT
);
406 *slot
= new_reduction
;
410 for (phi
= phi_nodes (exit
->dest
); phi
; phi
= PHI_CHAIN (phi
))
412 struct reduction_info
*red
;
413 imm_use_iterator imm_iter
;
417 tree val
= PHI_ARG_DEF_FROM_EDGE (phi
, exit
);
419 if (is_gimple_reg (val
))
421 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
423 fprintf (dump_file
, "phi is ");
424 print_generic_expr (dump_file
, phi
, 0);
425 fprintf (dump_file
, "arg of phi to exit: value ");
426 print_generic_expr (dump_file
, val
, 0);
427 fprintf (dump_file
, " used outside loop\n");
429 " checking if it a part of reduction pattern: \n");
431 if (htab_elements (reduction_list
) == 0)
433 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
435 " FAILED: it is not a part of reduction.\n");
439 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, val
)
441 if (flow_bb_inside_loop_p (loop
, bb_for_stmt (USE_STMT (use_p
))))
443 reduc_phi
= USE_STMT (use_p
);
447 red
= reduction_phi (reduction_list
, reduc_phi
);
450 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
452 " FAILED: it is not a part of reduction.\n");
455 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
457 fprintf (dump_file
, "reduction phi is ");
458 print_generic_expr (dump_file
, red
->reduc_phi
, 0);
459 fprintf (dump_file
, "reduction stmt is ");
460 print_generic_expr (dump_file
, red
->reduc_stmt
, 0);
466 /* The iterations of the loop may communicate only through bivs whose
467 iteration space can be distributed efficiently. */
468 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
470 tree def
= PHI_RESULT (phi
);
473 if (is_gimple_reg (def
) && !simple_iv (loop
, phi
, def
, &iv
, true))
475 struct reduction_info
*red
;
477 red
= reduction_phi (reduction_list
, phi
);
480 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
482 " FAILED: scalar dependency between iterations\n");
488 /* We need to version the loop to verify assumptions in runtime. */
489 if (!can_duplicate_loop_p (loop
))
491 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
492 fprintf (dump_file
, " FAILED: cannot be duplicated\n");
496 /* Check for problems with dependences. If the loop can be reversed,
497 the iterations are independent. */
498 datarefs
= VEC_alloc (data_reference_p
, heap
, 10);
499 dependence_relations
= VEC_alloc (ddr_p
, heap
, 10 * 10);
500 compute_data_dependences_for_loop (loop
, true, &datarefs
,
501 &dependence_relations
);
502 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
503 dump_data_dependence_relations (dump_file
, dependence_relations
);
505 trans
= lambda_trans_matrix_new (1, 1);
506 LTM_MATRIX (trans
)[0][0] = -1;
508 if (lambda_transform_legal_p (trans
, 1, dependence_relations
))
511 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
512 fprintf (dump_file
, " SUCCESS: may be parallelized\n");
514 else if (dump_file
&& (dump_flags
& TDF_DETAILS
))
516 " FAILED: data dependencies exist across iterations\n");
518 free_dependence_relations (dependence_relations
);
519 free_data_refs (datarefs
);
524 /* Assigns the address of VAR in TYPE to an ssa name, and returns this name.
525 The assignment statement is placed before LOOP. DECL_ADDRESS maps decls
526 to their addresses that can be reused. */
529 take_address_of (tree var
, tree type
, struct loop
*loop
, htab_t decl_address
)
531 int uid
= DECL_UID (var
);
533 struct int_tree_map ielt
, *nielt
;
534 tree name
, bvar
, stmt
;
535 edge entry
= loop_preheader_edge (loop
);
538 dslot
= htab_find_slot_with_hash (decl_address
, &ielt
, uid
, INSERT
);
541 bvar
= create_tmp_var (type
, get_name (var
));
542 add_referenced_var (bvar
);
543 stmt
= build_gimple_modify_stmt (bvar
,
546 current_function_decl
)));
547 name
= make_ssa_name (bvar
, stmt
);
548 GIMPLE_STMT_OPERAND (stmt
, 0) = name
;
549 bsi_insert_on_edge_immediate (entry
, stmt
);
551 nielt
= XNEW (struct int_tree_map
);
559 name
= ((struct int_tree_map
*) *dslot
)->to
;
560 if (TREE_TYPE (name
) == type
)
563 bvar
= SSA_NAME_VAR (name
);
564 stmt
= build_gimple_modify_stmt (bvar
, fold_convert (type
, name
));
565 name
= make_ssa_name (bvar
, stmt
);
566 GIMPLE_STMT_OPERAND (stmt
, 0) = name
;
567 bsi_insert_on_edge_immediate (entry
, stmt
);
572 /* Callback for htab_traverse. Create the initialization statement
573 for reduction described in SLOT, and place it at the preheader of
574 the loop described in DATA. */
577 initialize_reductions (void **slot
, void *data
)
582 tree bvar
, type
, arg
;
585 struct reduction_info
*reduc
= *slot
;
586 struct loop
*loop
= (struct loop
*) data
;
588 /* Create initialization in preheader:
589 reduction_variable = initialization value of reduction. */
591 /* In the phi node at the header, replace the argument coming
592 from the preheader with the reduction initialization value. */
594 /* Create a new variable to initialize the reduction. */
595 type
= TREE_TYPE (PHI_RESULT (reduc
->reduc_phi
));
596 bvar
= create_tmp_var (type
, "reduction");
597 add_referenced_var (bvar
);
599 c
= build_omp_clause (OMP_CLAUSE_REDUCTION
);
600 OMP_CLAUSE_REDUCTION_CODE (c
) = reduc
->reduction_code
;
601 OMP_CLAUSE_DECL (c
) =
602 SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc
->reduc_stmt
, 0));
604 init
= omp_reduction_init (c
, TREE_TYPE (bvar
));
607 t
= build_gimple_modify_stmt (bvar
, init
);
608 name
= make_ssa_name (bvar
, t
);
610 GIMPLE_STMT_OPERAND (t
, 0) = name
;
611 SSA_NAME_DEF_STMT (name
) = t
;
613 /* Replace the argument
614 representing the initialization value. Keeping the old value
615 in a new variable "reduction_initial", that will be taken in
616 consideration after the parallel computing is done. */
618 e
= loop_preheader_edge (loop
);
619 arg
= PHI_ARG_DEF_FROM_EDGE (reduc
->reduc_phi
, e
);
620 /* Create new variable to hold the initial value. */
621 type
= TREE_TYPE (bvar
);
622 bvar
= create_tmp_var (type
, "reduction_initial");
623 add_referenced_var (bvar
);
625 stmt
= build_gimple_modify_stmt (bvar
, arg
);
626 name1
= make_ssa_name (bvar
, stmt
);
627 GIMPLE_STMT_OPERAND (stmt
, 0) = name1
;
628 SSA_NAME_DEF_STMT (name1
) = stmt
;
630 bsi_insert_on_edge_immediate (e
, stmt
);
631 bsi_insert_on_edge_immediate (e
, t
);
632 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
633 (reduc
->reduc_phi
, loop_preheader_edge (loop
)), name
);
634 reduc
->initial_value
= name1
;
635 reduc
->reduction_init
= name
;
646 /* Eliminates references to local variables in *TP out of LOOP. DECL_ADDRESS
647 contains addresses of the references that had their address taken already.
648 If the expression is changed, CHANGED is set to true. Callback for
652 eliminate_local_variables_1 (tree
* tp
, int *walk_subtrees
, void *data
)
654 struct elv_data
*dta
= data
;
655 tree t
= *tp
, var
, addr
, addr_type
, type
;
661 if (!SSA_VAR_P (t
) || DECL_EXTERNAL (t
))
664 type
= TREE_TYPE (t
);
665 addr_type
= build_pointer_type (type
);
666 addr
= take_address_of (t
, addr_type
, dta
->loop
, dta
->decl_address
);
667 *tp
= build1 (INDIRECT_REF
, TREE_TYPE (*tp
), addr
);
673 if (TREE_CODE (t
) == ADDR_EXPR
)
675 var
= TREE_OPERAND (t
, 0);
680 if (!SSA_VAR_P (var
) || DECL_EXTERNAL (var
))
683 addr_type
= TREE_TYPE (t
);
684 addr
= take_address_of (var
, addr_type
, dta
->loop
, dta
->decl_address
);
691 if (!EXPR_P (t
) && !GIMPLE_STMT_P (t
))
697 /* Moves the references to local variables in STMT from LOOP. DECL_ADDRESS
698 contains addresses for the references for that we have already taken
702 eliminate_local_variables_stmt (struct loop
*loop
, tree stmt
,
708 dta
.decl_address
= decl_address
;
711 walk_tree (&stmt
, eliminate_local_variables_1
, &dta
, NULL
);
717 /* Eliminates the references to local variables from LOOP.
719 1) Taking address of a local variable -- these are moved out of the
720 loop (and temporary variable is created to hold the address if
722 2) Dereferencing a local variable -- these are replaced with indirect
726 eliminate_local_variables (struct loop
*loop
)
728 basic_block bb
, *body
= get_loop_body (loop
);
730 block_stmt_iterator bsi
;
731 htab_t decl_address
= htab_create (10, int_tree_map_hash
, int_tree_map_eq
,
734 /* Find and rename the ssa names defined outside of loop. */
735 for (i
= 0; i
< loop
->num_nodes
; i
++)
739 for (bsi
= bsi_start (bb
); !bsi_end_p (bsi
); bsi_next (&bsi
))
740 eliminate_local_variables_stmt (loop
, bsi_stmt (bsi
), decl_address
);
743 htab_delete (decl_address
);
746 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
747 The copies are stored to NAME_COPIES, if NAME was already duplicated,
748 its duplicate stored in NAME_COPIES is returned.
750 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
751 duplicated, storing the copies in DECL_COPIES. */
754 separate_decls_in_loop_name (tree name
,
755 htab_t name_copies
, htab_t decl_copies
,
758 tree copy
, var
, var_copy
;
759 unsigned idx
, uid
, nuid
;
760 struct int_tree_map ielt
, *nielt
;
761 struct name_to_copy_elt elt
, *nelt
;
762 void **slot
, **dslot
;
764 if (TREE_CODE (name
) != SSA_NAME
)
767 idx
= SSA_NAME_VERSION (name
);
769 slot
= htab_find_slot_with_hash (name_copies
, &elt
, idx
,
770 copy_name_p
? INSERT
: NO_INSERT
);
772 return ((struct name_to_copy_elt
*) *slot
)->new_name
;
774 var
= SSA_NAME_VAR (name
);
775 uid
= DECL_UID (var
);
777 dslot
= htab_find_slot_with_hash (decl_copies
, &ielt
, uid
, INSERT
);
780 var_copy
= create_tmp_var (TREE_TYPE (var
), get_name (var
));
781 add_referenced_var (var_copy
);
782 nielt
= XNEW (struct int_tree_map
);
784 nielt
->to
= var_copy
;
787 /* Ensure that when we meet this decl next time, we won't duplicate
789 nuid
= DECL_UID (var_copy
);
791 dslot
= htab_find_slot_with_hash (decl_copies
, &ielt
, nuid
, INSERT
);
792 gcc_assert (!*dslot
);
793 nielt
= XNEW (struct int_tree_map
);
795 nielt
->to
= var_copy
;
799 var_copy
= ((struct int_tree_map
*) *dslot
)->to
;
803 copy
= duplicate_ssa_name (name
, NULL_TREE
);
804 nelt
= XNEW (struct name_to_copy_elt
);
806 nelt
->new_name
= copy
;
807 nelt
->field
= NULL_TREE
;
816 SSA_NAME_VAR (copy
) = var_copy
;
820 /* Finds the ssa names used in STMT that are defined outside of LOOP and
821 replaces such ssa names with their duplicates. The duplicates are stored to
822 NAME_COPIES. Base decls of all ssa names used in STMT
823 (including those defined in LOOP) are replaced with the new temporary
824 variables; the replacement decls are stored in DECL_COPIES. */
827 separate_decls_in_loop_stmt (struct loop
*loop
, tree stmt
,
828 htab_t name_copies
, htab_t decl_copies
)
836 mark_virtual_ops_for_renaming (stmt
);
838 FOR_EACH_PHI_OR_STMT_DEF (def
, stmt
, oi
, SSA_OP_DEF
)
840 name
= DEF_FROM_PTR (def
);
841 gcc_assert (TREE_CODE (name
) == SSA_NAME
);
842 copy
= separate_decls_in_loop_name (name
, name_copies
, decl_copies
,
844 gcc_assert (copy
== name
);
847 FOR_EACH_PHI_OR_STMT_USE (use
, stmt
, oi
, SSA_OP_USE
)
849 name
= USE_FROM_PTR (use
);
850 if (TREE_CODE (name
) != SSA_NAME
)
853 copy_name_p
= expr_invariant_in_loop_p (loop
, name
);
854 copy
= separate_decls_in_loop_name (name
, name_copies
, decl_copies
,
860 /* A helper structure for passing the TYPE and REDUCTION_LIST
861 to the DATA parameter of add_field_for_name. */
865 htab_t reduction_list
;
868 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
869 described in SLOT. The type is passed in DATA. The Reduction list
870 is also passes in DATA. */
873 add_field_for_name (void **slot
, void *data
)
876 use_operand_p use_p
= NULL
;
878 struct name_to_copy_elt
*elt
= *slot
;
879 struct data_arg
*data_arg
= (struct data_arg
*) data
;
880 tree type
= data_arg
->type
;
881 tree name
= ssa_name (elt
->version
);
882 tree var
= SSA_NAME_VAR (name
);
883 tree field
= build_decl (FIELD_DECL
, DECL_NAME (var
), TREE_TYPE (var
));
885 insert_field_into_struct (type
, field
);
888 /* Find uses of name to determine if this name is related to
889 a reduction phi, and if so, record the field in the reduction struct. */
891 if ((htab_elements (data_arg
->reduction_list
) > 0)
892 && single_imm_use (elt
->new_name
, &use_p
, &stmt
)
893 && TREE_CODE (stmt
) == PHI_NODE
)
895 /* check if STMT is a REDUC_PHI of some reduction. */
896 struct reduction_info
*red
;
898 red
= reduction_phi (data_arg
->reduction_list
,stmt
);
906 /* Callback for htab_traverse. A local result is the intermediate result
908 thread, or the intial value in case no iteration was executed.
909 This function creates a phi node reflecting these values.
910 The phi's result will be stored in NEW_PHI field of the
911 reduction's data structure. */
914 create_phi_for_local_result (void **slot
, void *data
)
916 struct reduction_info
*reduc
= *slot
;
917 struct loop
*loop
= data
;
920 basic_block store_bb
;
923 /* STORE_BB is the block where the phi
924 should be stored. It is the destination of the loop exit.
925 (Find the fallthru edge from OMP_CONTINUE). */
926 store_bb
= FALLTHRU_EDGE (loop
->latch
)->dest
;
928 /* STORE_BB has two predecessors. One coming from the loop
929 (the reduction's result is computed at the loop),
930 and another coming from a block preceding the loop,
932 are executed (the initial value should be taken). */
933 if (EDGE_PRED (store_bb
, 0) == FALLTHRU_EDGE (loop
->latch
))
934 e
= EDGE_PRED (store_bb
, 1);
936 e
= EDGE_PRED (store_bb
, 0);
937 local_res
= make_ssa_name (SSA_NAME_VAR (reduc
->reduction_init
), NULL_TREE
);
938 new_phi
= create_phi_node (local_res
, store_bb
);
939 SSA_NAME_DEF_STMT (local_res
) = new_phi
;
940 add_phi_arg (new_phi
, reduc
->init
, e
);
941 add_phi_arg (new_phi
, GIMPLE_STMT_OPERAND (reduc
->reduc_stmt
, 0),
942 FALLTHRU_EDGE (loop
->latch
));
943 reduc
->new_phi
= new_phi
;
953 basic_block store_bb
;
957 /* Callback for htab_traverse. Create an atomic instruction for the
958 reduction described in SLOT.
959 DATA annotates the place in memory the atomic operation relates to,
960 and the basic block it needs to be generated in. */
963 create_call_for_reduction_1 (void **slot
, void *data
)
965 struct reduction_info
*reduc
= *slot
;
966 struct clsn_data
*clsn_data
= data
;
967 block_stmt_iterator bsi
;
968 tree type
= TREE_TYPE (PHI_RESULT (reduc
->reduc_phi
));
969 tree struct_type
= TREE_TYPE (TREE_TYPE (clsn_data
->load
));
974 tree t
, addr
, addr_type
, ref
, x
;
975 tree tmp_load
, load
, name
;
977 load_struct
= fold_build1 (INDIRECT_REF
, struct_type
, clsn_data
->load
);
978 t
= build3 (COMPONENT_REF
, type
, load_struct
, reduc
->field
, NULL_TREE
);
979 addr_type
= build_pointer_type (type
);
981 addr
= build_addr (t
, current_function_decl
);
983 /* Create phi node. */
984 bb
= clsn_data
->load_bb
;
986 e
= split_block (bb
, t
);
989 tmp_load
= create_tmp_var (TREE_TYPE (TREE_TYPE (addr
)), NULL
);
990 add_referenced_var (tmp_load
);
991 tmp_load
= make_ssa_name (tmp_load
, NULL
);
992 load
= build2 (OMP_ATOMIC_LOAD
, void_type_node
, tmp_load
, addr
);
993 SSA_NAME_DEF_STMT (tmp_load
) = load
;
994 bsi
= bsi_start (new_bb
);
995 bsi_insert_after (&bsi
, load
, BSI_NEW_STMT
);
997 e
= split_block (new_bb
, load
);
999 bsi
= bsi_start (new_bb
);
1002 fold_build2 (reduc
->reduction_code
,
1003 TREE_TYPE (PHI_RESULT (reduc
->new_phi
)), ref
,
1004 PHI_RESULT (reduc
->new_phi
));
1007 force_gimple_operand_bsi (&bsi
, x
, true, NULL_TREE
, true,
1008 BSI_CONTINUE_LINKING
);
1010 x
= build1 (OMP_ATOMIC_STORE
, void_type_node
, name
);
1012 bsi_insert_after (&bsi
, x
, BSI_NEW_STMT
);
1016 /* Create the atomic operation at the join point of the threads.
1017 REDUCTION_LIST describes the reductions in the LOOP.
1018 LD_ST_DATA describes the shared data structure where
1019 shared data is stored in and loaded from. */
1021 create_call_for_reduction (struct loop
*loop
, htab_t reduction_list
,
1022 struct clsn_data
*ld_st_data
)
1024 htab_traverse (reduction_list
, create_phi_for_local_result
, loop
);
1025 /* Find the fallthru edge from OMP_CONTINUE. */
1026 ld_st_data
->load_bb
= FALLTHRU_EDGE (loop
->latch
)->dest
;
1027 htab_traverse (reduction_list
, create_call_for_reduction_1
, ld_st_data
);
1030 /* Callback for htab_traverse. Create a new variable that loads the
1031 final reduction value at the
1032 join point of all threads, adds the initial value the reduction
1033 variable had before the parallel computation started, and
1034 inserts it in the right place. */
1037 create_loads_for_reductions (void **slot
, void *data
)
1039 struct reduction_info
*red
= *slot
;
1040 struct clsn_data
*clsn_data
= data
;
1042 block_stmt_iterator bsi
;
1043 tree type
= TREE_TYPE (red
->reduction_init
);
1044 tree struct_type
= TREE_TYPE (TREE_TYPE (clsn_data
->load
));
1049 bsi
= bsi_after_labels (clsn_data
->load_bb
);
1050 load_struct
= fold_build1 (INDIRECT_REF
, struct_type
, clsn_data
->load
);
1051 load_struct
= build3 (COMPONENT_REF
, type
, load_struct
, red
->field
,
1053 bvar
= create_tmp_var (type
, "reduction_final");
1054 add_referenced_var (bvar
);
1056 /* Apply operation between the new variable which is the result
1057 of computation all threads, and the initial value which is kept
1058 at reduction->inital_value. */
1060 stmt
= build_gimple_modify_stmt (bvar
, load_struct
);
1061 name
= make_ssa_name (bvar
, stmt
);
1062 GIMPLE_STMT_OPERAND (stmt
, 0) = name
;
1063 SSA_NAME_DEF_STMT (name
) = stmt
;
1065 bsi_insert_after (&bsi
, stmt
, BSI_NEW_STMT
);
1068 fold_build2 (red
->reduction_code
, TREE_TYPE (load_struct
),
1069 name
, red
->initial_value
);
1070 name
= PHI_RESULT (red
->keep_res
);
1071 stmt
= build_gimple_modify_stmt (name
, x
);
1072 GIMPLE_STMT_OPERAND (stmt
, 0) = name
;
1073 SSA_NAME_DEF_STMT (name
) = stmt
;
1075 bsi_insert_after (&bsi
, stmt
, BSI_NEW_STMT
);
1077 remove_phi_node (red
->keep_res
, NULL_TREE
, false);
1082 /* Load the reduction result that was stored in LD_ST_DATA.
1083 REDUCTION_LIST describes the list of reductions that the
1084 loades should be generated for. */
1086 create_final_loads_for_reduction (htab_t reduction_list
,
1087 struct clsn_data
*ld_st_data
)
1089 block_stmt_iterator bsi
;
1092 bsi
= bsi_after_labels (ld_st_data
->load_bb
);
1093 t
= build_fold_addr_expr (ld_st_data
->store
);
1095 build_gimple_modify_stmt (ld_st_data
->load
,
1096 build_fold_addr_expr (ld_st_data
->store
));
1098 bsi_insert_before (&bsi
, t
, BSI_NEW_STMT
);
1099 SSA_NAME_DEF_STMT (ld_st_data
->load
) = t
;
1100 GIMPLE_STMT_OPERAND (t
, 0) = ld_st_data
->load
;
1102 htab_traverse (reduction_list
, create_loads_for_reductions
, ld_st_data
);
1106 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
1107 store to a field of STORE in STORE_BB for the ssa name and its duplicate
1108 specified in SLOT. */
1111 create_loads_and_stores_for_name (void **slot
, void *data
)
1113 struct name_to_copy_elt
*elt
= *slot
;
1114 struct clsn_data
*clsn_data
= data
;
1116 block_stmt_iterator bsi
;
1117 tree type
= TREE_TYPE (elt
->new_name
);
1118 tree struct_type
= TREE_TYPE (TREE_TYPE (clsn_data
->load
));
1121 bsi
= bsi_last (clsn_data
->store_bb
);
1123 build_gimple_modify_stmt (build3
1124 (COMPONENT_REF
, type
, clsn_data
->store
,
1125 elt
->field
, NULL_TREE
),
1126 ssa_name (elt
->version
));
1127 mark_virtual_ops_for_renaming (stmt
);
1128 bsi_insert_after (&bsi
, stmt
, BSI_NEW_STMT
);
1130 bsi
= bsi_last (clsn_data
->load_bb
);
1131 load_struct
= fold_build1 (INDIRECT_REF
, struct_type
, clsn_data
->load
);
1132 stmt
= build_gimple_modify_stmt (elt
->new_name
,
1133 build3 (COMPONENT_REF
, type
, load_struct
,
1134 elt
->field
, NULL_TREE
));
1135 SSA_NAME_DEF_STMT (elt
->new_name
) = stmt
;
1136 bsi_insert_after (&bsi
, stmt
, BSI_NEW_STMT
);
1141 /* Moves all the variables used in LOOP and defined outside of it (including
1142 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1143 name) to a structure created for this purpose. The code
1151 is transformed this way:
1166 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
1167 pointer `new' is intentionally not initialized (the loop will be split to a
1168 separate function later, and `new' will be initialized from its arguments).
1169 LD_ST_DATA holds information about the shared data structure used to pass
1170 information among the threads. It is initialized here, and
1171 gen_parallel_loop will pass it to create_call_for_reduction that
1172 needs this information. REDUCTION_LIST describes the reductions
1176 separate_decls_in_loop (struct loop
*loop
, htab_t reduction_list
,
1177 tree
* arg_struct
, tree
* new_arg_struct
,
1178 struct clsn_data
*ld_st_data
)
1181 basic_block bb1
= split_edge (loop_preheader_edge (loop
));
1182 basic_block bb0
= single_pred (bb1
);
1183 htab_t name_copies
= htab_create (10, name_to_copy_elt_hash
,
1184 name_to_copy_elt_eq
, free
);
1185 htab_t decl_copies
= htab_create (10, int_tree_map_hash
, int_tree_map_eq
,
1187 basic_block bb
, *body
= get_loop_body (loop
);
1189 tree phi
, type
, type_name
, nvar
;
1190 block_stmt_iterator bsi
;
1191 struct clsn_data clsn_data
;
1193 /* Find and rename the ssa names defined outside of loop. */
1194 for (i
= 0; i
< loop
->num_nodes
; i
++)
1198 for (phi
= phi_nodes (bb
); phi
; phi
= PHI_CHAIN (phi
))
1199 separate_decls_in_loop_stmt (loop
, phi
, name_copies
, decl_copies
);
1201 for (bsi
= bsi_start (bb
); !bsi_end_p (bsi
); bsi_next (&bsi
))
1202 separate_decls_in_loop_stmt (loop
, bsi_stmt (bsi
), name_copies
,
1207 if (htab_elements (name_copies
) == 0)
1209 /* It may happen that there is nothing to copy (if there are only
1210 loop carried and external variables in the loop). */
1212 *new_arg_struct
= NULL
;
1216 struct data_arg data_arg
;
1218 /* Create the type for the structure to store the ssa names to. */
1219 type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
1220 type_name
= build_decl (TYPE_DECL
, create_tmp_var_name (".paral_data"),
1222 TYPE_NAME (type
) = type_name
;
1224 data_arg
.type
= type
;
1225 data_arg
.reduction_list
= reduction_list
;
1226 htab_traverse (name_copies
, add_field_for_name
, &data_arg
);
1229 /* Create the loads and stores. */
1230 *arg_struct
= create_tmp_var (type
, ".paral_data_store");
1231 add_referenced_var (*arg_struct
);
1232 nvar
= create_tmp_var (build_pointer_type (type
), ".paral_data_load");
1233 add_referenced_var (nvar
);
1234 *new_arg_struct
= make_ssa_name (nvar
, NULL_TREE
);
1236 ld_st_data
->store
= *arg_struct
;
1237 ld_st_data
->load
= *new_arg_struct
;
1238 ld_st_data
->store_bb
= bb0
;
1239 ld_st_data
->load_bb
= bb1
;
1240 htab_traverse (name_copies
, create_loads_and_stores_for_name
,
1243 /* Load the calculation from memory into a new
1244 reduction variable (after the join of the threads). */
1245 if (htab_elements (reduction_list
) > 0)
1247 clsn_data
.load
= make_ssa_name (nvar
, NULL_TREE
);
1248 clsn_data
.load_bb
= single_dom_exit (loop
)->dest
;
1249 clsn_data
.store
= ld_st_data
->store
;
1250 create_final_loads_for_reduction (reduction_list
, &clsn_data
);
1254 htab_delete (decl_copies
);
1255 htab_delete (name_copies
);
1258 /* Bitmap containing uids of functions created by parallelization. We cannot
1259 allocate it from the default obstack, as it must live across compilation
1260 of several functions; we make it gc allocated instead. */
1262 static GTY(()) bitmap parallelized_functions
;
1264 /* Returns true if FN was created by create_loop_fn. */
1267 parallelized_function_p (tree fn
)
1269 if (!parallelized_functions
|| !DECL_ARTIFICIAL (fn
))
1272 return bitmap_bit_p (parallelized_functions
, DECL_UID (fn
));
1275 /* Creates and returns an empty function that will receive the body of
1276 a parallelized loop. */
1279 create_loop_fn (void)
1283 tree decl
, type
, name
, t
;
1284 struct function
*act_cfun
= cfun
;
1285 static unsigned loopfn_num
;
1287 snprintf (buf
, 100, "%s.$loopfn", current_function_name ());
1288 ASM_FORMAT_PRIVATE_NAME (tname
, buf
, loopfn_num
++);
1289 clean_symbol_name (tname
);
1290 name
= get_identifier (tname
);
1291 type
= build_function_type_list (void_type_node
, ptr_type_node
, NULL_TREE
);
1293 decl
= build_decl (FUNCTION_DECL
, name
, type
);
1294 if (!parallelized_functions
)
1295 parallelized_functions
= BITMAP_GGC_ALLOC ();
1296 bitmap_set_bit (parallelized_functions
, DECL_UID (decl
));
1298 TREE_STATIC (decl
) = 1;
1299 TREE_USED (decl
) = 1;
1300 DECL_ARTIFICIAL (decl
) = 1;
1301 DECL_IGNORED_P (decl
) = 0;
1302 TREE_PUBLIC (decl
) = 0;
1303 DECL_UNINLINABLE (decl
) = 1;
1304 DECL_EXTERNAL (decl
) = 0;
1305 DECL_CONTEXT (decl
) = NULL_TREE
;
1306 DECL_INITIAL (decl
) = make_node (BLOCK
);
1308 t
= build_decl (RESULT_DECL
, NULL_TREE
, void_type_node
);
1309 DECL_ARTIFICIAL (t
) = 1;
1310 DECL_IGNORED_P (t
) = 1;
1311 DECL_RESULT (decl
) = t
;
1313 t
= build_decl (PARM_DECL
, get_identifier (".paral_data_param"),
1315 DECL_ARTIFICIAL (t
) = 1;
1316 DECL_ARG_TYPE (t
) = ptr_type_node
;
1317 DECL_CONTEXT (t
) = decl
;
1319 DECL_ARGUMENTS (decl
) = t
;
1321 allocate_struct_function (decl
);
1323 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1330 /* Bases all the induction variables in LOOP on a single induction variable
1331 (unsigned with base 0 and step 1), whose final value is compared with
1332 NIT. The induction variable is incremented in the loop latch.
1333 REDUCTION_LIST describes the reductions in LOOP. */
1336 canonicalize_loop_ivs (struct loop
*loop
, htab_t reduction_list
, tree nit
)
1338 unsigned precision
= TYPE_PRECISION (TREE_TYPE (nit
));
1339 tree phi
, prev
, res
, type
, var_before
, val
, atype
, t
, next
;
1340 block_stmt_iterator bsi
;
1343 edge exit
= single_dom_exit (loop
);
1344 struct reduction_info
*red
;
1346 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
1348 res
= PHI_RESULT (phi
);
1350 if (is_gimple_reg (res
) && TYPE_PRECISION (TREE_TYPE (res
)) > precision
)
1351 precision
= TYPE_PRECISION (TREE_TYPE (res
));
1354 type
= lang_hooks
.types
.type_for_size (precision
, 1);
1356 bsi
= bsi_last (loop
->latch
);
1357 create_iv (build_int_cst_type (type
, 0), build_int_cst (type
, 1), NULL_TREE
,
1358 loop
, &bsi
, true, &var_before
, NULL
);
1360 bsi
= bsi_after_labels (loop
->header
);
1362 for (phi
= phi_nodes (loop
->header
); phi
; phi
= next
)
1364 next
= PHI_CHAIN (phi
);
1365 res
= PHI_RESULT (phi
);
1367 if (!is_gimple_reg (res
) || res
== var_before
)
1373 ok
= simple_iv (loop
, phi
, res
, &iv
, true);
1374 red
= reduction_phi (reduction_list
, phi
);
1375 /* We preserve the reduction phi nodes. */
1383 remove_phi_node (phi
, prev
, false);
1385 atype
= TREE_TYPE (res
);
1386 val
= fold_build2 (PLUS_EXPR
, atype
,
1387 unshare_expr (iv
.base
),
1388 fold_build2 (MULT_EXPR
, atype
,
1389 unshare_expr (iv
.step
),
1390 fold_convert (atype
, var_before
)));
1391 val
= force_gimple_operand_bsi (&bsi
, val
, false, NULL_TREE
, true,
1393 t
= build_gimple_modify_stmt (res
, val
);
1394 bsi_insert_before (&bsi
, t
, BSI_SAME_STMT
);
1395 SSA_NAME_DEF_STMT (res
) = t
;
1398 t
= last_stmt (exit
->src
);
1399 /* Make the loop exit if the control condition is not satisfied. */
1400 if (exit
->flags
& EDGE_TRUE_VALUE
)
1404 extract_true_false_edges_from_block (exit
->src
, &te
, &fe
);
1405 te
->flags
= EDGE_FALSE_VALUE
;
1406 fe
->flags
= EDGE_TRUE_VALUE
;
1408 COND_EXPR_COND (t
) = build2 (LT_EXPR
, boolean_type_node
, var_before
, nit
);
1411 /* Moves the exit condition of LOOP to the beginning of its header, and
1412 duplicates the part of the last iteration that gets disabled to the
1413 exit of the loop. NIT is the number of iterations of the loop
1414 (used to initialize the variables in the duplicated part).
1416 TODO: the common case is that latch of the loop is empty and immediatelly
1417 follows the loop exit. In this case, it would be better not to copy the
1418 body of the loop, but only move the entry of the loop directly before the
1419 exit check and increase the number of iterations of the loop by one.
1420 This may need some additional preconditioning in case NIT = ~0.
1421 REDUCTION_LIST describes the reductions in LOOP. */
1424 transform_to_exit_first_loop (struct loop
*loop
, htab_t reduction_list
, tree nit
)
1426 basic_block
*bbs
, *nbbs
, ex_bb
, orig_header
;
1429 edge exit
= single_dom_exit (loop
), hpred
;
1430 tree phi
, nphi
, cond
, control
, control_name
, res
, t
, cond_stmt
;
1431 block_stmt_iterator bsi
;
1433 split_block_after_labels (loop
->header
);
1434 orig_header
= single_succ (loop
->header
);
1435 hpred
= single_succ_edge (loop
->header
);
1437 cond_stmt
= last_stmt (exit
->src
);
1438 cond
= COND_EXPR_COND (cond_stmt
);
1439 control
= TREE_OPERAND (cond
, 0);
1440 gcc_assert (TREE_OPERAND (cond
, 1) == nit
);
1442 /* Make sure that we have phi nodes on exit for all loop header phis
1443 (create_parallel_loop requires that). */
1444 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
1446 res
= PHI_RESULT (phi
);
1447 t
= make_ssa_name (SSA_NAME_VAR (res
), phi
);
1448 SET_PHI_RESULT (phi
, t
);
1450 nphi
= create_phi_node (res
, orig_header
);
1451 SSA_NAME_DEF_STMT (res
) = nphi
;
1452 add_phi_arg (nphi
, t
, hpred
);
1456 TREE_OPERAND (cond
, 0) = t
;
1457 update_stmt (cond_stmt
);
1462 bbs
= get_loop_body_in_dom_order (loop
);
1463 for (n
= 0; bbs
[n
] != exit
->src
; n
++)
1465 nbbs
= XNEWVEC (basic_block
, n
);
1466 ok
= tree_duplicate_sese_tail (single_succ_edge (loop
->header
), exit
,
1473 /* Other than reductions, the only gimple reg that should be copied
1474 out of the loop is the control variable. */
1476 control_name
= NULL_TREE
;
1477 for (phi
= phi_nodes (ex_bb
); phi
; phi
= PHI_CHAIN (phi
))
1479 res
= PHI_RESULT (phi
);
1480 if (!is_gimple_reg (res
))
1483 /* Check if it is a part of reduction. If it is,
1484 keep the phi at the reduction's keep_res field. The
1485 PHI_RESULT of this phi is the resulting value of the reduction
1486 variable when exiting the loop. */
1488 exit
= single_dom_exit (loop
);
1490 if (htab_elements (reduction_list
) > 0)
1492 struct reduction_info
*red
;
1494 tree val
= PHI_ARG_DEF_FROM_EDGE (phi
, exit
);
1496 red
= reduction_phi (reduction_list
, SSA_NAME_DEF_STMT (val
));
1498 red
->keep_res
= phi
;
1501 gcc_assert (control_name
== NULL_TREE
1502 && SSA_NAME_VAR (res
) == SSA_NAME_VAR (control
));
1505 gcc_assert (control_name
!= NULL_TREE
);
1506 phi
= SSA_NAME_DEF_STMT (control_name
);
1507 remove_phi_node (phi
, NULL_TREE
, false);
1509 /* Initialize the control variable to NIT. */
1510 bsi
= bsi_after_labels (ex_bb
);
1511 t
= build_gimple_modify_stmt (control_name
, nit
);
1512 bsi_insert_before (&bsi
, t
, BSI_NEW_STMT
);
1513 SSA_NAME_DEF_STMT (control_name
) = t
;
1516 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
1517 LOOP_FN and DATA are the arguments of OMP_PARALLEL.
1518 NEW_DATA is the variable that should be initialized from the argument
1519 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
1520 basic block containing OMP_PARALLEL tree. */
1523 create_parallel_loop (struct loop
*loop
, tree loop_fn
, tree data
,
1524 tree new_data
, unsigned n_threads
)
1526 block_stmt_iterator bsi
;
1527 basic_block bb
, paral_bb
, for_bb
, ex_bb
;
1528 tree t
, param
, res
, for_stmt
;
1529 tree cvar
, cvar_init
, initvar
, cvar_next
, cvar_base
, cond
, phi
, type
;
1530 edge exit
, nexit
, guard
, end
, e
;
1532 /* Prepare the OMP_PARALLEL statement. */
1533 bb
= loop_preheader_edge (loop
)->src
;
1534 paral_bb
= single_pred (bb
);
1535 bsi
= bsi_last (paral_bb
);
1537 t
= build_omp_clause (OMP_CLAUSE_NUM_THREADS
);
1538 OMP_CLAUSE_NUM_THREADS_EXPR (t
)
1539 = build_int_cst (integer_type_node
, n_threads
);
1540 t
= build4 (OMP_PARALLEL
, void_type_node
, NULL_TREE
, t
, loop_fn
, data
);
1542 bsi_insert_after (&bsi
, t
, BSI_NEW_STMT
);
1544 /* Initialize NEW_DATA. */
1547 bsi
= bsi_after_labels (bb
);
1549 param
= make_ssa_name (DECL_ARGUMENTS (loop_fn
), NULL_TREE
);
1550 t
= build_gimple_modify_stmt (param
, build_fold_addr_expr (data
));
1551 bsi_insert_before (&bsi
, t
, BSI_SAME_STMT
);
1552 SSA_NAME_DEF_STMT (param
) = t
;
1554 t
= build_gimple_modify_stmt (new_data
,
1555 fold_convert (TREE_TYPE (new_data
),
1557 bsi_insert_before (&bsi
, t
, BSI_SAME_STMT
);
1558 SSA_NAME_DEF_STMT (new_data
) = t
;
1561 /* Emit OMP_RETURN for OMP_PARALLEL. */
1562 bb
= split_loop_exit_edge (single_dom_exit (loop
));
1563 bsi
= bsi_last (bb
);
1564 bsi_insert_after (&bsi
, make_node (OMP_RETURN
), BSI_NEW_STMT
);
1566 /* Extract data for OMP_FOR. */
1567 gcc_assert (loop
->header
== single_dom_exit (loop
)->src
);
1568 cond
= COND_EXPR_COND (last_stmt (loop
->header
));
1570 cvar
= TREE_OPERAND (cond
, 0);
1571 cvar_base
= SSA_NAME_VAR (cvar
);
1572 phi
= SSA_NAME_DEF_STMT (cvar
);
1573 cvar_init
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_preheader_edge (loop
));
1574 initvar
= make_ssa_name (cvar_base
, NULL_TREE
);
1575 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi
, loop_preheader_edge (loop
)),
1577 cvar_next
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
1579 bsi
= bsi_last (loop
->latch
);
1580 gcc_assert (bsi_stmt (bsi
) == SSA_NAME_DEF_STMT (cvar_next
));
1581 bsi_remove (&bsi
, true);
1584 for_bb
= split_edge (loop_preheader_edge (loop
));
1585 ex_bb
= split_loop_exit_edge (single_dom_exit (loop
));
1586 extract_true_false_edges_from_block (loop
->header
, &nexit
, &exit
);
1587 gcc_assert (exit
== single_dom_exit (loop
));
1589 guard
= make_edge (for_bb
, ex_bb
, 0);
1590 single_succ_edge (loop
->latch
)->flags
= 0;
1591 end
= make_edge (loop
->latch
, ex_bb
, EDGE_FALLTHRU
);
1592 for (phi
= phi_nodes (ex_bb
); phi
; phi
= PHI_CHAIN (phi
))
1594 res
= PHI_RESULT (phi
);
1595 gcc_assert (!is_gimple_reg (phi
));
1596 t
= SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi
, exit
));
1597 add_phi_arg (phi
, PHI_ARG_DEF_FROM_EDGE (t
, loop_preheader_edge (loop
)),
1599 add_phi_arg (phi
, PHI_ARG_DEF_FROM_EDGE (t
, loop_latch_edge (loop
)),
1602 e
= redirect_edge_and_branch (exit
, nexit
->dest
);
1603 PENDING_STMT (e
) = NULL
;
1606 TREE_OPERAND (cond
, 0) = cvar_base
;
1607 type
= TREE_TYPE (cvar
);
1608 t
= build_omp_clause (OMP_CLAUSE_SCHEDULE
);
1609 OMP_CLAUSE_SCHEDULE_KIND (t
) = OMP_CLAUSE_SCHEDULE_STATIC
;
1611 for_stmt
= make_node (OMP_FOR
);
1612 TREE_TYPE (for_stmt
) = void_type_node
;
1613 OMP_FOR_CLAUSES (for_stmt
) = t
;
1614 OMP_FOR_INIT (for_stmt
) = build_gimple_modify_stmt (initvar
, cvar_init
);
1615 OMP_FOR_COND (for_stmt
) = cond
;
1616 OMP_FOR_INCR (for_stmt
) = build_gimple_modify_stmt (cvar_base
,
1617 build2 (PLUS_EXPR
, type
,
1621 OMP_FOR_BODY (for_stmt
) = NULL_TREE
;
1622 OMP_FOR_PRE_BODY (for_stmt
) = NULL_TREE
;
1624 bsi
= bsi_last (for_bb
);
1625 bsi_insert_after (&bsi
, for_stmt
, BSI_NEW_STMT
);
1626 SSA_NAME_DEF_STMT (initvar
) = for_stmt
;
1628 /* Emit OMP_CONTINUE. */
1629 bsi
= bsi_last (loop
->latch
);
1630 t
= build2 (OMP_CONTINUE
, void_type_node
, cvar_next
, cvar
);
1631 bsi_insert_after (&bsi
, t
, BSI_NEW_STMT
);
1632 SSA_NAME_DEF_STMT (cvar_next
) = t
;
1634 /* Emit OMP_RETURN for OMP_FOR. */
1635 bsi
= bsi_last (ex_bb
);
1636 bsi_insert_after (&bsi
, make_node (OMP_RETURN
), BSI_NEW_STMT
);
1641 /* Generates code to execute the iterations of LOOP in N_THREADS threads in
1642 parallel. NITER describes number of iterations of LOOP.
1643 REDUCTION_LIST describes the reductions existant in the LOOP. */
1646 gen_parallel_loop (struct loop
*loop
, htab_t reduction_list
,
1647 unsigned n_threads
, struct tree_niter_desc
*niter
)
1650 tree many_iterations_cond
, type
, nit
;
1651 tree stmts
, arg_struct
, new_arg_struct
;
1652 basic_block parallel_head
;
1653 struct clsn_data clsn_data
;
1658 ---------------------------------------------------------------------
1661 IV = phi (INIT, IV + STEP)
1667 ---------------------------------------------------------------------
1669 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1670 we generate the following code:
1672 ---------------------------------------------------------------------
1675 || NITER < MIN_PER_THREAD * N_THREADS)
1679 store all local loop-invariant variables used in body of the loop to DATA.
1680 OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1681 load the variables from DATA.
1682 OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1686 OMP_RETURN -- OMP_FOR
1687 OMP_RETURN -- OMP_PARALLEL
1693 IV = phi (INIT, IV + STEP)
1704 /* Create two versions of the loop -- in the old one, we know that the
1705 number of iterations is large enough, and we will transform it into the
1706 loop that will be split to loop_fn, the new one will be used for the
1707 remaining iterations. */
1709 type
= TREE_TYPE (niter
->niter
);
1710 nit
= force_gimple_operand (unshare_expr (niter
->niter
), &stmts
, true,
1713 bsi_insert_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
1715 many_iterations_cond
=
1716 fold_build2 (GE_EXPR
, boolean_type_node
,
1717 nit
, build_int_cst (type
, MIN_PER_THREAD
* n_threads
));
1718 many_iterations_cond
1719 = fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
1720 invert_truthvalue (unshare_expr (niter
->may_be_zero
)),
1721 many_iterations_cond
);
1722 many_iterations_cond
1723 = force_gimple_operand (many_iterations_cond
, &stmts
, false, NULL_TREE
);
1725 bsi_insert_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
1726 if (!is_gimple_condexpr (many_iterations_cond
))
1728 many_iterations_cond
1729 = force_gimple_operand (many_iterations_cond
, &stmts
,
1732 bsi_insert_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
1735 initialize_original_copy_tables ();
1737 /* We assume that the loop usually iterates a lot. */
1738 prob
= 4 * REG_BR_PROB_BASE
/ 5;
1739 nloop
= loop_version (loop
, many_iterations_cond
, NULL
,
1740 prob
, prob
, REG_BR_PROB_BASE
- prob
, true);
1741 update_ssa (TODO_update_ssa
);
1742 free_original_copy_tables ();
1744 /* Base all the induction variables in LOOP on a single control one. */
1745 canonicalize_loop_ivs (loop
, reduction_list
, nit
);
1747 /* Ensure that the exit condition is the first statement in the loop. */
1748 transform_to_exit_first_loop (loop
, reduction_list
, nit
);
1751 /* Generate intializations for reductions. */
1753 if (htab_elements (reduction_list
) > 0)
1754 htab_traverse (reduction_list
, initialize_reductions
, loop
);
1756 /* Eliminate the references to local variables from the loop. */
1757 eliminate_local_variables (loop
);
1759 /* In the old loop, move all variables non-local to the loop to a structure
1760 and back, and create separate decls for the variables used in loop. */
1761 separate_decls_in_loop (loop
, reduction_list
, &arg_struct
, &new_arg_struct
, &clsn_data
);
1763 /* Create the parallel constructs. */
1764 parallel_head
= create_parallel_loop (loop
, create_loop_fn (), arg_struct
,
1765 new_arg_struct
, n_threads
);
1766 if (htab_elements (reduction_list
) > 0)
1767 create_call_for_reduction (loop
, reduction_list
, &clsn_data
);
1771 /* Cancel the loop (it is simpler to do it here rather than to teach the
1772 expander to do it). */
1773 cancel_loop_tree (loop
);
1775 /* Expand the parallel constructs. We do it directly here instead of running
1776 a separate expand_omp pass, since it is more efficient, and less likely to
1777 cause troubles with further analyses not being able to deal with the
1780 omp_expand_local (parallel_head
);
1783 /* Detect parallel loops and generate parallel code using libgomp
1784 primitives. Returns true if some loop was parallelized, false
1788 parallelize_loops (void)
1790 unsigned n_threads
= flag_tree_parallelize_loops
;
1791 bool changed
= false;
1793 struct tree_niter_desc niter_desc
;
1795 htab_t reduction_list
;
1797 /* Do not parallelize loops in the functions created by parallelization. */
1798 if (parallelized_function_p (cfun
->decl
))
1801 reduction_list
= htab_create (10, reduction_info_hash
,
1802 reduction_info_eq
, free
);
1804 FOR_EACH_LOOP (li
, loop
, 0)
1806 htab_empty (reduction_list
);
1807 if (/* Do not bother with loops in cold areas. */
1808 !maybe_hot_bb_p (loop
->header
)
1809 /* Or loops that roll too little. */
1810 || expected_loop_iterations (loop
) <= n_threads
1811 /* And of course, the loop must be parallelizable. */
1812 || !can_duplicate_loop_p (loop
)
1813 || !loop_parallel_p (loop
, reduction_list
, &niter_desc
))
1817 gen_parallel_loop (loop
, reduction_list
, n_threads
, &niter_desc
);
1818 verify_flow_info ();
1819 verify_dominators (CDI_DOMINATORS
);
1820 verify_loop_structure ();
1821 verify_loop_closed_ssa ();
1824 htab_delete (reduction_list
);
1828 #include "gt-tree-parloops.h"