1 /* Loop autoparallelization.
2 Copyright (C) 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
25 #include "coretypes.h"
29 #include "tree-flow.h"
32 #include "tree-data-ref.h"
33 #include "diagnostic.h"
34 #include "tree-pass.h"
35 #include "tree-scalar-evolution.h"
37 #include "langhooks.h"
38 #include "tree-vectorizer.h"
40 /* This pass tries to distribute iterations of loops into several threads.
41 The implementation is straightforward -- for each loop we test whether its
42 iterations are independent, and if it is the case (and some additional
43 conditions regarding profitability and correctness are satisfied), we
44 add OMP_PARALLEL and OMP_FOR codes and let omp expansion machinery do
47 Most of the complexity is in bringing the code into shape expected
49 -- for OMP_FOR, ensuring that the loop has only one induction variable
50 and that the exit test is at the start of the loop body
51 -- for OMP_PARALLEL, replacing the references to local addressable
52 variables by accesses through pointers, and breaking up ssa chains
53 by storing the values incoming to the parallelized loop to a structure
54 passed to the new function as an argument (something similar is done
55 in omp gimplification, unfortunately only a small part of the code
59 -- if there are several parallelizable loops in a function, it may be
60 possible to generate the threads just once (using synchronization to
61 ensure that cross-loop dependences are obeyed).
62 -- handling of common scalar dependence patterns (accumulation, ...)
63 -- handling of non-innermost loops */
67 currently we use vect_is_simple_reduction() to detect reduction patterns.
68 The code transformation will be introduced by an example.
75 for (i = 0; i < N; i++)
85 # sum_29 = PHI <sum_11(5), 1(3)>
86 # i_28 = PHI <i_12(5), 0(3)>
89 sum_11 = D.1795_8 + sum_29;
97 # sum_21 = PHI <sum_11(4)>
98 printf (&"%d"[0], sum_21);
101 after reduction transformation (only relevant parts):
109 # Storing the initial value given by the user. #
111 .paral_data_store.32.sum.27 = 1;
113 #pragma omp parallel num_threads(4)
115 #pragma omp for schedule(static)
117 # The neutral element corresponding to the particular
118 reduction's operation, e.g. 0 for PLUS_EXPR,
119 1 for MULT_EXPR, etc. replaces the user's initial value. #
121 # sum.27_29 = PHI <sum.27_11, 0>
123 sum.27_11 = D.1827_8 + sum.27_29;
127 # Adding this reduction phi is done at create_phi_for_local_result() #
128 # sum.27_56 = PHI <sum.27_11, 0>
131 # Creating the atomic operation is done at
132 create_call_for_reduction_1() #
134 #pragma omp atomic_load
135 D.1839_59 = *&.paral_data_load.33_51->reduction.23;
136 D.1840_60 = sum.27_56 + D.1839_59;
137 #pragma omp atomic_store (D.1840_60);
141 # collecting the result after the join of the threads is done at
142 create_loads_for_reductions().
143 The value computed by the threads is loaded from the
147 .paral_data_load.33_52 = &.paral_data_store.32;
148 sum_37 = .paral_data_load.33_52->sum.27;
149 sum_43 = D.1795_41 + sum_37;
152 # sum_21 = PHI <sum_43, sum_26>
153 printf (&"%d"[0], sum_21);
161 /* Minimal number of iterations of a loop that should be executed in each
163 #define MIN_PER_THREAD 100
165 /* Element of the hashtable, representing a
166 reduction in the current loop. */
167 struct reduction_info
169 tree reduc_stmt
; /* reduction statement. */
170 tree reduc_phi
; /* The phi node defining the reduction. */
171 enum tree_code reduction_code
; /* code for the reduction operation. */
172 tree keep_res
; /* The PHI_RESULT of this phi is the resulting value
173 of the reduction variable when existing the loop. */
174 tree initial_value
; /* The initial value of the reduction var before entering the loop. */
175 tree field
; /* the name of the field in the parloop data structure intended for reduction. */
176 tree init
; /* reduction initialization value. */
177 tree new_phi
; /* (helper field) Newly created phi node whose result
178 will be passed to the atomic operation. Represents
179 the local result each thread computed for the reduction
183 /* Equality and hash functions for hashtab code. */
186 reduction_info_eq (const void *aa
, const void *bb
)
188 const struct reduction_info
*a
= (const struct reduction_info
*) aa
;
189 const struct reduction_info
*b
= (const struct reduction_info
*) bb
;
191 return (a
->reduc_phi
== b
->reduc_phi
);
195 reduction_info_hash (const void *aa
)
197 const struct reduction_info
*a
= (const struct reduction_info
*) aa
;
199 return htab_hash_pointer (a
->reduc_phi
);
202 static struct reduction_info
*
203 reduction_phi (htab_t reduction_list
, tree phi
)
205 struct reduction_info tmpred
, *red
;
207 if (htab_elements (reduction_list
) == 0)
210 tmpred
.reduc_phi
= phi
;
211 red
= htab_find (reduction_list
, &tmpred
);
216 /* Element of hashtable of names to copy. */
218 struct name_to_copy_elt
220 unsigned version
; /* The version of the name to copy. */
221 tree new_name
; /* The new name used in the copy. */
222 tree field
; /* The field of the structure used to pass the
226 /* Equality and hash functions for hashtab code. */
229 name_to_copy_elt_eq (const void *aa
, const void *bb
)
231 const struct name_to_copy_elt
*a
= (const struct name_to_copy_elt
*) aa
;
232 const struct name_to_copy_elt
*b
= (const struct name_to_copy_elt
*) bb
;
234 return a
->version
== b
->version
;
238 name_to_copy_elt_hash (const void *aa
)
240 const struct name_to_copy_elt
*a
= (const struct name_to_copy_elt
*) aa
;
242 return (hashval_t
) a
->version
;
245 /* Returns true if the iterations of LOOP are independent on each other (that
246 is, if we can execute them in parallel), and if LOOP satisfies other
247 conditions that we need to be able to parallelize it. Description of number
248 of iterations is stored to NITER. Reduction analysis is done, if
249 reductions are found, they are inserted to the REDUCTION_LIST. */
252 loop_parallel_p (struct loop
*loop
, htab_t reduction_list
, struct tree_niter_desc
*niter
)
254 edge exit
= single_dom_exit (loop
);
255 VEC (ddr_p
, heap
) * dependence_relations
;
256 VEC (data_reference_p
, heap
) * datarefs
;
257 lambda_trans_matrix trans
;
260 loop_vec_info simple_loop_info
;
262 /* Only consider innermost loops with just one exit. The innermost-loop
263 restriction is not necessary, but it makes things simpler. */
264 if (loop
->inner
|| !exit
)
267 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
268 fprintf (dump_file
, "\nConsidering loop %d\n", loop
->num
);
270 /* We need to know # of iterations, and there should be no uses of values
271 defined inside loop outside of it, unless the values are invariants of
273 if (!number_of_iterations_exit (loop
, exit
, niter
, false))
275 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
276 fprintf (dump_file
, " FAILED: number of iterations not known\n");
280 simple_loop_info
= vect_analyze_loop_form (loop
);
282 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
284 tree reduc_stmt
= NULL
, operation
;
286 /* ??? TODO: Change this into a generic function that
287 recognizes reductions. */
288 if (!is_gimple_reg (PHI_RESULT (phi
)))
290 if (simple_loop_info
)
291 reduc_stmt
= vect_is_simple_reduction (simple_loop_info
, phi
);
293 /* Create a reduction_info struct, initialize it and insert it to
294 the reduction list. */
299 struct reduction_info
*new_reduction
;
301 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
304 "Detected reduction. reduction stmt is: \n");
305 print_generic_stmt (dump_file
, reduc_stmt
, 0);
306 fprintf (dump_file
, "\n");
309 new_reduction
= XCNEW (struct reduction_info
);
311 new_reduction
->reduc_stmt
= reduc_stmt
;
312 new_reduction
->reduc_phi
= phi
;
313 operation
= GIMPLE_STMT_OPERAND (reduc_stmt
, 1);
314 new_reduction
->reduction_code
= TREE_CODE (operation
);
315 slot
= htab_find_slot (reduction_list
, new_reduction
, INSERT
);
316 *slot
= new_reduction
;
320 for (phi
= phi_nodes (exit
->dest
); phi
; phi
= PHI_CHAIN (phi
))
322 struct reduction_info
*red
;
323 imm_use_iterator imm_iter
;
327 tree val
= PHI_ARG_DEF_FROM_EDGE (phi
, exit
);
329 if (is_gimple_reg (val
))
331 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
333 fprintf (dump_file
, "phi is ");
334 print_generic_expr (dump_file
, phi
, 0);
335 fprintf (dump_file
, "arg of phi to exit: value ");
336 print_generic_expr (dump_file
, val
, 0);
337 fprintf (dump_file
, " used outside loop\n");
339 " checking if it a part of reduction pattern: \n");
341 if (htab_elements (reduction_list
) == 0)
343 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
345 " FAILED: it is not a part of reduction.\n");
349 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, val
)
351 if (flow_bb_inside_loop_p (loop
, bb_for_stmt (USE_STMT (use_p
))))
353 reduc_phi
= USE_STMT (use_p
);
357 red
= reduction_phi (reduction_list
, reduc_phi
);
360 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
362 " FAILED: it is not a part of reduction.\n");
365 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
367 fprintf (dump_file
, "reduction phi is ");
368 print_generic_expr (dump_file
, red
->reduc_phi
, 0);
369 fprintf (dump_file
, "reduction stmt is ");
370 print_generic_expr (dump_file
, red
->reduc_stmt
, 0);
376 /* The iterations of the loop may communicate only through bivs whose
377 iteration space can be distributed efficiently. */
378 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
380 tree def
= PHI_RESULT (phi
);
383 if (is_gimple_reg (def
) && !simple_iv (loop
, phi
, def
, &iv
, true))
385 struct reduction_info
*red
;
387 red
= reduction_phi (reduction_list
, phi
);
390 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
392 " FAILED: scalar dependency between iterations\n");
398 /* We need to version the loop to verify assumptions in runtime. */
399 if (!can_duplicate_loop_p (loop
))
401 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
402 fprintf (dump_file
, " FAILED: cannot be duplicated\n");
406 /* Check for problems with dependences. If the loop can be reversed,
407 the iterations are independent. */
408 datarefs
= VEC_alloc (data_reference_p
, heap
, 10);
409 dependence_relations
= VEC_alloc (ddr_p
, heap
, 10 * 10);
410 compute_data_dependences_for_loop (loop
, true, &datarefs
,
411 &dependence_relations
);
412 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
413 dump_data_dependence_relations (dump_file
, dependence_relations
);
415 trans
= lambda_trans_matrix_new (1, 1);
416 LTM_MATRIX (trans
)[0][0] = -1;
418 if (lambda_transform_legal_p (trans
, 1, dependence_relations
))
421 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
422 fprintf (dump_file
, " SUCCESS: may be parallelized\n");
424 else if (dump_file
&& (dump_flags
& TDF_DETAILS
))
426 " FAILED: data dependencies exist across iterations\n");
428 free_dependence_relations (dependence_relations
);
429 free_data_refs (datarefs
);
434 /* Return true when LOOP contains basic blocks marked with the
435 BB_IRREDUCIBLE_LOOP flag. */
438 loop_has_blocks_with_irreducible_flag (struct loop
*loop
)
441 basic_block
*bbs
= get_loop_body_in_dom_order (loop
);
444 for (i
= 0; i
< loop
->num_nodes
; i
++)
445 if (bbs
[i
]->flags
& BB_IRREDUCIBLE_LOOP
)
454 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
455 The assignment statement is placed before LOOP. DECL_ADDRESS maps decls
456 to their addresses that can be reused. The address of OBJ is known to
457 be invariant in the whole function. */
460 take_address_of (tree obj
, tree type
, struct loop
*loop
, htab_t decl_address
)
464 struct int_tree_map ielt
, *nielt
;
465 tree
*var_p
, name
, bvar
, stmt
, addr
;
466 edge entry
= loop_preheader_edge (loop
);
468 /* Since the address of OBJ is invariant, the trees may be shared.
469 Avoid rewriting unrelated parts of the code. */
470 obj
= unshare_expr (obj
);
472 handled_component_p (*var_p
);
473 var_p
= &TREE_OPERAND (*var_p
, 0))
475 uid
= DECL_UID (*var_p
);
478 dslot
= htab_find_slot_with_hash (decl_address
, &ielt
, uid
, INSERT
);
481 addr
= build_addr (*var_p
, current_function_decl
);
482 bvar
= create_tmp_var (TREE_TYPE (addr
), get_name (*var_p
));
483 add_referenced_var (bvar
);
484 stmt
= build_gimple_modify_stmt (bvar
, addr
);
485 name
= make_ssa_name (bvar
, stmt
);
486 GIMPLE_STMT_OPERAND (stmt
, 0) = name
;
487 bsi_insert_on_edge_immediate (entry
, stmt
);
489 nielt
= XNEW (struct int_tree_map
);
495 name
= ((struct int_tree_map
*) *dslot
)->to
;
499 *var_p
= build1 (INDIRECT_REF
, TREE_TYPE (*var_p
), name
);
500 name
= force_gimple_operand (build_addr (obj
, current_function_decl
),
501 &stmt
, true, NULL_TREE
);
503 bsi_insert_on_edge_immediate (entry
, stmt
);
506 if (TREE_TYPE (name
) != type
)
508 name
= force_gimple_operand (fold_convert (type
, name
), &stmt
, true,
511 bsi_insert_on_edge_immediate (entry
, stmt
);
517 /* Callback for htab_traverse. Create the initialization statement
518 for reduction described in SLOT, and place it at the preheader of
519 the loop described in DATA. */
522 initialize_reductions (void **slot
, void *data
)
525 tree bvar
, type
, arg
;
528 struct reduction_info
*reduc
= *slot
;
529 struct loop
*loop
= (struct loop
*) data
;
531 /* Create initialization in preheader:
532 reduction_variable = initialization value of reduction. */
534 /* In the phi node at the header, replace the argument coming
535 from the preheader with the reduction initialization value. */
537 /* Create a new variable to initialize the reduction. */
538 type
= TREE_TYPE (PHI_RESULT (reduc
->reduc_phi
));
539 bvar
= create_tmp_var (type
, "reduction");
540 add_referenced_var (bvar
);
542 c
= build_omp_clause (OMP_CLAUSE_REDUCTION
);
543 OMP_CLAUSE_REDUCTION_CODE (c
) = reduc
->reduction_code
;
544 OMP_CLAUSE_DECL (c
) =
545 SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc
->reduc_stmt
, 0));
547 init
= omp_reduction_init (c
, TREE_TYPE (bvar
));
550 /* Replace the argument representing the initialization value
551 with the initialization value for the reduction (neutral
552 element for the particular operation, e.g. 0 for PLUS_EXPR,
553 1 for MULT_EXPR, etc).
554 Keep the old value in a new variable "reduction_initial",
555 that will be taken in consideration after the parallel
556 computing is done. */
558 e
= loop_preheader_edge (loop
);
559 arg
= PHI_ARG_DEF_FROM_EDGE (reduc
->reduc_phi
, e
);
560 /* Create new variable to hold the initial value. */
562 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
563 (reduc
->reduc_phi
, loop_preheader_edge (loop
)), init
);
564 reduc
->initial_value
= arg
;
575 /* Eliminates references to local variables in *TP out of LOOP. DECL_ADDRESS
576 contains addresses of the references that had their address taken already.
577 If the expression is changed, CHANGED is set to true. Callback for
581 eliminate_local_variables_1 (tree
*tp
, int *walk_subtrees
, void *data
)
583 struct elv_data
*dta
= data
;
584 tree t
= *tp
, var
, addr
, addr_type
, type
, obj
;
590 if (!SSA_VAR_P (t
) || DECL_EXTERNAL (t
))
593 type
= TREE_TYPE (t
);
594 addr_type
= build_pointer_type (type
);
595 addr
= take_address_of (t
, addr_type
, dta
->loop
, dta
->decl_address
);
596 *tp
= build1 (INDIRECT_REF
, TREE_TYPE (*tp
), addr
);
602 if (TREE_CODE (t
) == ADDR_EXPR
)
604 /* ADDR_EXPR may appear in two contexts:
605 -- as a gimple operand, when the address taken is a function invariant
606 -- as gimple rhs, when the resulting address in not a function
608 We do not need to do anything special in the latter case (the base of
609 the memory reference whose address is taken may be replaced in the
610 DECL_P case). The former case is more complicated, as we need to
611 ensure that the new address is still a gimple operand. Thus, it
612 is not sufficient to replace just the base of the memory reference --
613 we need to move the whole computation of the address out of the
615 if (!is_gimple_val (t
))
619 obj
= TREE_OPERAND (t
, 0);
620 var
= get_base_address (obj
);
621 if (!var
|| !SSA_VAR_P (var
) || DECL_EXTERNAL (var
))
624 addr_type
= TREE_TYPE (t
);
625 addr
= take_address_of (obj
, addr_type
, dta
->loop
, dta
->decl_address
);
632 if (!EXPR_P (t
) && !GIMPLE_STMT_P (t
))
638 /* Moves the references to local variables in STMT from LOOP. DECL_ADDRESS
639 contains addresses for the references for that we have already taken
643 eliminate_local_variables_stmt (struct loop
*loop
, tree stmt
,
649 dta
.decl_address
= decl_address
;
652 walk_tree (&stmt
, eliminate_local_variables_1
, &dta
, NULL
);
658 /* Eliminates the references to local variables from LOOP.
660 1) Taking address of a local variable -- these are moved out of the
661 loop (and temporary variable is created to hold the address if
663 2) Dereferencing a local variable -- these are replaced with indirect
667 eliminate_local_variables (struct loop
*loop
)
669 basic_block bb
, *body
= get_loop_body (loop
);
671 block_stmt_iterator bsi
;
672 htab_t decl_address
= htab_create (10, int_tree_map_hash
, int_tree_map_eq
,
675 /* Find and rename the ssa names defined outside of loop. */
676 for (i
= 0; i
< loop
->num_nodes
; i
++)
680 for (bsi
= bsi_start (bb
); !bsi_end_p (bsi
); bsi_next (&bsi
))
681 eliminate_local_variables_stmt (loop
, bsi_stmt (bsi
), decl_address
);
684 htab_delete (decl_address
);
687 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
688 The copies are stored to NAME_COPIES, if NAME was already duplicated,
689 its duplicate stored in NAME_COPIES is returned.
691 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
692 duplicated, storing the copies in DECL_COPIES. */
695 separate_decls_in_loop_name (tree name
,
696 htab_t name_copies
, htab_t decl_copies
,
699 tree copy
, var
, var_copy
;
700 unsigned idx
, uid
, nuid
;
701 struct int_tree_map ielt
, *nielt
;
702 struct name_to_copy_elt elt
, *nelt
;
703 void **slot
, **dslot
;
705 if (TREE_CODE (name
) != SSA_NAME
)
708 idx
= SSA_NAME_VERSION (name
);
710 slot
= htab_find_slot_with_hash (name_copies
, &elt
, idx
,
711 copy_name_p
? INSERT
: NO_INSERT
);
713 return ((struct name_to_copy_elt
*) *slot
)->new_name
;
715 var
= SSA_NAME_VAR (name
);
716 uid
= DECL_UID (var
);
718 dslot
= htab_find_slot_with_hash (decl_copies
, &ielt
, uid
, INSERT
);
721 var_copy
= create_tmp_var (TREE_TYPE (var
), get_name (var
));
722 DECL_GIMPLE_REG_P (var_copy
) = DECL_GIMPLE_REG_P (var
);
723 add_referenced_var (var_copy
);
724 nielt
= XNEW (struct int_tree_map
);
726 nielt
->to
= var_copy
;
729 /* Ensure that when we meet this decl next time, we won't duplicate
731 nuid
= DECL_UID (var_copy
);
733 dslot
= htab_find_slot_with_hash (decl_copies
, &ielt
, nuid
, INSERT
);
734 gcc_assert (!*dslot
);
735 nielt
= XNEW (struct int_tree_map
);
737 nielt
->to
= var_copy
;
741 var_copy
= ((struct int_tree_map
*) *dslot
)->to
;
745 copy
= duplicate_ssa_name (name
, NULL_TREE
);
746 nelt
= XNEW (struct name_to_copy_elt
);
748 nelt
->new_name
= copy
;
749 nelt
->field
= NULL_TREE
;
758 SSA_NAME_VAR (copy
) = var_copy
;
762 /* Finds the ssa names used in STMT that are defined outside of LOOP and
763 replaces such ssa names with their duplicates. The duplicates are stored to
764 NAME_COPIES. Base decls of all ssa names used in STMT
765 (including those defined in LOOP) are replaced with the new temporary
766 variables; the replacement decls are stored in DECL_COPIES. */
769 separate_decls_in_loop_stmt (struct loop
*loop
, tree stmt
,
770 htab_t name_copies
, htab_t decl_copies
)
778 mark_virtual_ops_for_renaming (stmt
);
780 FOR_EACH_PHI_OR_STMT_DEF (def
, stmt
, oi
, SSA_OP_DEF
)
782 name
= DEF_FROM_PTR (def
);
783 gcc_assert (TREE_CODE (name
) == SSA_NAME
);
784 copy
= separate_decls_in_loop_name (name
, name_copies
, decl_copies
,
786 gcc_assert (copy
== name
);
789 FOR_EACH_PHI_OR_STMT_USE (use
, stmt
, oi
, SSA_OP_USE
)
791 name
= USE_FROM_PTR (use
);
792 if (TREE_CODE (name
) != SSA_NAME
)
795 copy_name_p
= expr_invariant_in_loop_p (loop
, name
);
796 copy
= separate_decls_in_loop_name (name
, name_copies
, decl_copies
,
802 /* Callback for htab_traverse. Adds a field corresponding to the reduction
803 specified in SLOT. The type is passed in DATA. */
806 add_field_for_reduction (void **slot
, void *data
)
809 struct reduction_info
*red
= *slot
;
811 tree var
= SSA_NAME_VAR (GIMPLE_STMT_OPERAND (red
->reduc_stmt
, 0));
812 tree field
= build_decl (FIELD_DECL
, DECL_NAME (var
), TREE_TYPE (var
));
814 insert_field_into_struct (type
, field
);
821 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
822 described in SLOT. The type is passed in DATA. */
825 add_field_for_name (void **slot
, void *data
)
827 struct name_to_copy_elt
*elt
= *slot
;
829 tree name
= ssa_name (elt
->version
);
830 tree var
= SSA_NAME_VAR (name
);
831 tree field
= build_decl (FIELD_DECL
, DECL_NAME (var
), TREE_TYPE (var
));
833 insert_field_into_struct (type
, field
);
839 /* Callback for htab_traverse. A local result is the intermediate result
841 thread, or the intial value in case no iteration was executed.
842 This function creates a phi node reflecting these values.
843 The phi's result will be stored in NEW_PHI field of the
844 reduction's data structure. */
847 create_phi_for_local_result (void **slot
, void *data
)
849 struct reduction_info
*reduc
= *slot
;
850 struct loop
*loop
= data
;
853 basic_block store_bb
;
856 /* STORE_BB is the block where the phi
857 should be stored. It is the destination of the loop exit.
858 (Find the fallthru edge from OMP_CONTINUE). */
859 store_bb
= FALLTHRU_EDGE (loop
->latch
)->dest
;
861 /* STORE_BB has two predecessors. One coming from the loop
862 (the reduction's result is computed at the loop),
863 and another coming from a block preceding the loop,
865 are executed (the initial value should be taken). */
866 if (EDGE_PRED (store_bb
, 0) == FALLTHRU_EDGE (loop
->latch
))
867 e
= EDGE_PRED (store_bb
, 1);
869 e
= EDGE_PRED (store_bb
, 0);
870 local_res
= make_ssa_name (SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc
->reduc_stmt
, 0)), NULL_TREE
);
871 new_phi
= create_phi_node (local_res
, store_bb
);
872 SSA_NAME_DEF_STMT (local_res
) = new_phi
;
873 add_phi_arg (new_phi
, reduc
->init
, e
);
874 add_phi_arg (new_phi
, GIMPLE_STMT_OPERAND (reduc
->reduc_stmt
, 0),
875 FALLTHRU_EDGE (loop
->latch
));
876 reduc
->new_phi
= new_phi
;
886 basic_block store_bb
;
890 /* Callback for htab_traverse. Create an atomic instruction for the
891 reduction described in SLOT.
892 DATA annotates the place in memory the atomic operation relates to,
893 and the basic block it needs to be generated in. */
896 create_call_for_reduction_1 (void **slot
, void *data
)
898 struct reduction_info
*reduc
= *slot
;
899 struct clsn_data
*clsn_data
= data
;
900 block_stmt_iterator bsi
;
901 tree type
= TREE_TYPE (PHI_RESULT (reduc
->reduc_phi
));
902 tree struct_type
= TREE_TYPE (TREE_TYPE (clsn_data
->load
));
907 tree t
, addr
, addr_type
, ref
, x
;
908 tree tmp_load
, load
, name
;
910 load_struct
= fold_build1 (INDIRECT_REF
, struct_type
, clsn_data
->load
);
911 t
= build3 (COMPONENT_REF
, type
, load_struct
, reduc
->field
, NULL_TREE
);
912 addr_type
= build_pointer_type (type
);
914 addr
= build_addr (t
, current_function_decl
);
916 /* Create phi node. */
917 bb
= clsn_data
->load_bb
;
919 e
= split_block (bb
, t
);
922 tmp_load
= create_tmp_var (TREE_TYPE (TREE_TYPE (addr
)), NULL
);
923 add_referenced_var (tmp_load
);
924 tmp_load
= make_ssa_name (tmp_load
, NULL
);
925 load
= build2 (OMP_ATOMIC_LOAD
, void_type_node
, tmp_load
, addr
);
926 SSA_NAME_DEF_STMT (tmp_load
) = load
;
927 bsi
= bsi_start (new_bb
);
928 bsi_insert_after (&bsi
, load
, BSI_NEW_STMT
);
930 e
= split_block (new_bb
, load
);
932 bsi
= bsi_start (new_bb
);
935 fold_build2 (reduc
->reduction_code
,
936 TREE_TYPE (PHI_RESULT (reduc
->new_phi
)), ref
,
937 PHI_RESULT (reduc
->new_phi
));
940 force_gimple_operand_bsi (&bsi
, x
, true, NULL_TREE
, true,
941 BSI_CONTINUE_LINKING
);
943 x
= build1 (OMP_ATOMIC_STORE
, void_type_node
, name
);
945 bsi_insert_after (&bsi
, x
, BSI_NEW_STMT
);
949 /* Create the atomic operation at the join point of the threads.
950 REDUCTION_LIST describes the reductions in the LOOP.
951 LD_ST_DATA describes the shared data structure where
952 shared data is stored in and loaded from. */
954 create_call_for_reduction (struct loop
*loop
, htab_t reduction_list
,
955 struct clsn_data
*ld_st_data
)
957 htab_traverse (reduction_list
, create_phi_for_local_result
, loop
);
958 /* Find the fallthru edge from OMP_CONTINUE. */
959 ld_st_data
->load_bb
= FALLTHRU_EDGE (loop
->latch
)->dest
;
960 htab_traverse (reduction_list
, create_call_for_reduction_1
, ld_st_data
);
963 /* Callback for htab_traverse. Loads the final reduction value at the
964 join point of all threads, and inserts it in the right place. */
967 create_loads_for_reductions (void **slot
, void *data
)
969 struct reduction_info
*red
= *slot
;
970 struct clsn_data
*clsn_data
= data
;
972 block_stmt_iterator bsi
;
973 tree type
= TREE_TYPE (GIMPLE_STMT_OPERAND (red
->reduc_stmt
, 0));
974 tree struct_type
= TREE_TYPE (TREE_TYPE (clsn_data
->load
));
979 bsi
= bsi_after_labels (clsn_data
->load_bb
);
980 load_struct
= fold_build1 (INDIRECT_REF
, struct_type
, clsn_data
->load
);
981 load_struct
= build3 (COMPONENT_REF
, type
, load_struct
, red
->field
,
985 name
= PHI_RESULT (red
->keep_res
);
986 stmt
= build_gimple_modify_stmt (name
, x
);
987 GIMPLE_STMT_OPERAND (stmt
, 0) = name
;
988 SSA_NAME_DEF_STMT (name
) = stmt
;
990 bsi_insert_after (&bsi
, stmt
, BSI_NEW_STMT
);
992 remove_phi_node (red
->keep_res
, NULL_TREE
, false);
997 /* Load the reduction result that was stored in LD_ST_DATA.
998 REDUCTION_LIST describes the list of reductions that the
999 loades should be generated for. */
1001 create_final_loads_for_reduction (htab_t reduction_list
,
1002 struct clsn_data
*ld_st_data
)
1004 block_stmt_iterator bsi
;
1007 bsi
= bsi_after_labels (ld_st_data
->load_bb
);
1008 t
= build_fold_addr_expr (ld_st_data
->store
);
1010 build_gimple_modify_stmt (ld_st_data
->load
,
1011 build_fold_addr_expr (ld_st_data
->store
));
1013 bsi_insert_before (&bsi
, t
, BSI_NEW_STMT
);
1014 SSA_NAME_DEF_STMT (ld_st_data
->load
) = t
;
1015 GIMPLE_STMT_OPERAND (t
, 0) = ld_st_data
->load
;
1017 htab_traverse (reduction_list
, create_loads_for_reductions
, ld_st_data
);
1021 /* Callback for htab_traverse. Store the neutral value for the
1022 particular reduction's operation, e.g. 0 for PLUS_EXPR,
1023 1 for MULT_EXPR, etc. into the reduction field.
1024 The reduction is specified in SLOT. The store information is
1028 create_stores_for_reduction (void **slot
, void *data
)
1030 struct reduction_info
*red
= *slot
;
1031 struct clsn_data
*clsn_data
= data
;
1033 block_stmt_iterator bsi
;
1034 tree type
= TREE_TYPE (GIMPLE_STMT_OPERAND (red
->reduc_stmt
, 0));
1036 bsi
= bsi_last (clsn_data
->store_bb
);
1038 build_gimple_modify_stmt (build3
1039 (COMPONENT_REF
, type
, clsn_data
->store
,
1040 red
->field
, NULL_TREE
),
1041 red
->initial_value
);
1042 mark_virtual_ops_for_renaming (stmt
);
1043 bsi_insert_after (&bsi
, stmt
, BSI_NEW_STMT
);
1048 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
1049 store to a field of STORE in STORE_BB for the ssa name and its duplicate
1050 specified in SLOT. */
1053 create_loads_and_stores_for_name (void **slot
, void *data
)
1055 struct name_to_copy_elt
*elt
= *slot
;
1056 struct clsn_data
*clsn_data
= data
;
1058 block_stmt_iterator bsi
;
1059 tree type
= TREE_TYPE (elt
->new_name
);
1060 tree struct_type
= TREE_TYPE (TREE_TYPE (clsn_data
->load
));
1063 bsi
= bsi_last (clsn_data
->store_bb
);
1065 build_gimple_modify_stmt (build3
1066 (COMPONENT_REF
, type
, clsn_data
->store
,
1067 elt
->field
, NULL_TREE
),
1068 ssa_name (elt
->version
));
1069 mark_virtual_ops_for_renaming (stmt
);
1070 bsi_insert_after (&bsi
, stmt
, BSI_NEW_STMT
);
1072 bsi
= bsi_last (clsn_data
->load_bb
);
1073 load_struct
= fold_build1 (INDIRECT_REF
, struct_type
, clsn_data
->load
);
1074 stmt
= build_gimple_modify_stmt (elt
->new_name
,
1075 build3 (COMPONENT_REF
, type
, load_struct
,
1076 elt
->field
, NULL_TREE
));
1077 SSA_NAME_DEF_STMT (elt
->new_name
) = stmt
;
1078 bsi_insert_after (&bsi
, stmt
, BSI_NEW_STMT
);
1083 /* Moves all the variables used in LOOP and defined outside of it (including
1084 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1085 name) to a structure created for this purpose. The code
1093 is transformed this way:
1108 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
1109 pointer `new' is intentionally not initialized (the loop will be split to a
1110 separate function later, and `new' will be initialized from its arguments).
1111 LD_ST_DATA holds information about the shared data structure used to pass
1112 information among the threads. It is initialized here, and
1113 gen_parallel_loop will pass it to create_call_for_reduction that
1114 needs this information. REDUCTION_LIST describes the reductions
1118 separate_decls_in_loop (struct loop
*loop
, htab_t reduction_list
,
1119 tree
* arg_struct
, tree
* new_arg_struct
,
1120 struct clsn_data
*ld_st_data
)
1123 basic_block bb1
= split_edge (loop_preheader_edge (loop
));
1124 basic_block bb0
= single_pred (bb1
);
1125 htab_t name_copies
= htab_create (10, name_to_copy_elt_hash
,
1126 name_to_copy_elt_eq
, free
);
1127 htab_t decl_copies
= htab_create (10, int_tree_map_hash
, int_tree_map_eq
,
1129 basic_block bb
, *body
= get_loop_body (loop
);
1131 tree phi
, type
, type_name
, nvar
;
1132 block_stmt_iterator bsi
;
1133 struct clsn_data clsn_data
;
1135 /* Find and rename the ssa names defined outside of loop. */
1136 for (i
= 0; i
< loop
->num_nodes
; i
++)
1140 for (phi
= phi_nodes (bb
); phi
; phi
= PHI_CHAIN (phi
))
1141 separate_decls_in_loop_stmt (loop
, phi
, name_copies
, decl_copies
);
1143 for (bsi
= bsi_start (bb
); !bsi_end_p (bsi
); bsi_next (&bsi
))
1144 separate_decls_in_loop_stmt (loop
, bsi_stmt (bsi
), name_copies
,
1149 if (htab_elements (name_copies
) == 0)
1151 /* It may happen that there is nothing to copy (if there are only
1152 loop carried and external variables in the loop). */
1154 *new_arg_struct
= NULL
;
1158 /* Create the type for the structure to store the ssa names to. */
1159 type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
1160 type_name
= build_decl (TYPE_DECL
, create_tmp_var_name (".paral_data"),
1162 TYPE_NAME (type
) = type_name
;
1164 htab_traverse (name_copies
, add_field_for_name
, type
);
1165 if (htab_elements (reduction_list
) > 0)
1167 /* Create the fields for reductions. */
1168 htab_traverse (reduction_list
, add_field_for_reduction
,
1173 /* Create the loads and stores. */
1174 *arg_struct
= create_tmp_var (type
, ".paral_data_store");
1175 add_referenced_var (*arg_struct
);
1176 nvar
= create_tmp_var (build_pointer_type (type
), ".paral_data_load");
1177 add_referenced_var (nvar
);
1178 *new_arg_struct
= make_ssa_name (nvar
, NULL_TREE
);
1180 ld_st_data
->store
= *arg_struct
;
1181 ld_st_data
->load
= *new_arg_struct
;
1182 ld_st_data
->store_bb
= bb0
;
1183 ld_st_data
->load_bb
= bb1
;
1185 htab_traverse (name_copies
, create_loads_and_stores_for_name
,
1188 /* Load the calculation from memory (after the join of the threads). */
1190 if (htab_elements (reduction_list
) > 0)
1192 htab_traverse (reduction_list
, create_stores_for_reduction
,
1194 clsn_data
.load
= make_ssa_name (nvar
, NULL_TREE
);
1195 clsn_data
.load_bb
= single_dom_exit (loop
)->dest
;
1196 clsn_data
.store
= ld_st_data
->store
;
1197 create_final_loads_for_reduction (reduction_list
, &clsn_data
);
1201 htab_delete (decl_copies
);
1202 htab_delete (name_copies
);
1205 /* Bitmap containing uids of functions created by parallelization. We cannot
1206 allocate it from the default obstack, as it must live across compilation
1207 of several functions; we make it gc allocated instead. */
1209 static GTY(()) bitmap parallelized_functions
;
1211 /* Returns true if FN was created by create_loop_fn. */
1214 parallelized_function_p (tree fn
)
1216 if (!parallelized_functions
|| !DECL_ARTIFICIAL (fn
))
1219 return bitmap_bit_p (parallelized_functions
, DECL_UID (fn
));
1222 /* Creates and returns an empty function that will receive the body of
1223 a parallelized loop. */
1226 create_loop_fn (void)
1230 tree decl
, type
, name
, t
;
1231 struct function
*act_cfun
= cfun
;
1232 static unsigned loopfn_num
;
1234 snprintf (buf
, 100, "%s.$loopfn", current_function_name ());
1235 ASM_FORMAT_PRIVATE_NAME (tname
, buf
, loopfn_num
++);
1236 clean_symbol_name (tname
);
1237 name
= get_identifier (tname
);
1238 type
= build_function_type_list (void_type_node
, ptr_type_node
, NULL_TREE
);
1240 decl
= build_decl (FUNCTION_DECL
, name
, type
);
1241 if (!parallelized_functions
)
1242 parallelized_functions
= BITMAP_GGC_ALLOC ();
1243 bitmap_set_bit (parallelized_functions
, DECL_UID (decl
));
1245 TREE_STATIC (decl
) = 1;
1246 TREE_USED (decl
) = 1;
1247 DECL_ARTIFICIAL (decl
) = 1;
1248 DECL_IGNORED_P (decl
) = 0;
1249 TREE_PUBLIC (decl
) = 0;
1250 DECL_UNINLINABLE (decl
) = 1;
1251 DECL_EXTERNAL (decl
) = 0;
1252 DECL_CONTEXT (decl
) = NULL_TREE
;
1253 DECL_INITIAL (decl
) = make_node (BLOCK
);
1255 t
= build_decl (RESULT_DECL
, NULL_TREE
, void_type_node
);
1256 DECL_ARTIFICIAL (t
) = 1;
1257 DECL_IGNORED_P (t
) = 1;
1258 DECL_RESULT (decl
) = t
;
1260 t
= build_decl (PARM_DECL
, get_identifier (".paral_data_param"),
1262 DECL_ARTIFICIAL (t
) = 1;
1263 DECL_ARG_TYPE (t
) = ptr_type_node
;
1264 DECL_CONTEXT (t
) = decl
;
1266 DECL_ARGUMENTS (decl
) = t
;
1268 allocate_struct_function (decl
, false);
1270 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1272 set_cfun (act_cfun
);
1277 /* Bases all the induction variables in LOOP on a single induction variable
1278 (unsigned with base 0 and step 1), whose final value is compared with
1279 NIT. The induction variable is incremented in the loop latch.
1280 REDUCTION_LIST describes the reductions in LOOP. */
1283 canonicalize_loop_ivs (struct loop
*loop
, htab_t reduction_list
, tree nit
)
1285 unsigned precision
= TYPE_PRECISION (TREE_TYPE (nit
));
1286 tree phi
, prev
, res
, type
, var_before
, val
, atype
, mtype
, t
, next
;
1287 block_stmt_iterator bsi
;
1290 edge exit
= single_dom_exit (loop
);
1291 struct reduction_info
*red
;
1293 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
1295 res
= PHI_RESULT (phi
);
1297 if (is_gimple_reg (res
) && TYPE_PRECISION (TREE_TYPE (res
)) > precision
)
1298 precision
= TYPE_PRECISION (TREE_TYPE (res
));
1301 type
= lang_hooks
.types
.type_for_size (precision
, 1);
1303 bsi
= bsi_last (loop
->latch
);
1304 create_iv (build_int_cst_type (type
, 0), build_int_cst (type
, 1), NULL_TREE
,
1305 loop
, &bsi
, true, &var_before
, NULL
);
1307 bsi
= bsi_after_labels (loop
->header
);
1309 for (phi
= phi_nodes (loop
->header
); phi
; phi
= next
)
1311 next
= PHI_CHAIN (phi
);
1312 res
= PHI_RESULT (phi
);
1314 if (!is_gimple_reg (res
) || res
== var_before
)
1320 ok
= simple_iv (loop
, phi
, res
, &iv
, true);
1321 red
= reduction_phi (reduction_list
, phi
);
1322 /* We preserve the reduction phi nodes. */
1330 remove_phi_node (phi
, prev
, false);
1332 atype
= TREE_TYPE (res
);
1333 mtype
= POINTER_TYPE_P (atype
) ? sizetype
: atype
;
1334 val
= fold_build2 (MULT_EXPR
, mtype
, unshare_expr (iv
.step
),
1335 fold_convert (mtype
, var_before
));
1336 val
= fold_build2 (POINTER_TYPE_P (atype
)
1337 ? POINTER_PLUS_EXPR
: PLUS_EXPR
,
1338 atype
, unshare_expr (iv
.base
), val
);
1339 val
= force_gimple_operand_bsi (&bsi
, val
, false, NULL_TREE
, true,
1341 t
= build_gimple_modify_stmt (res
, val
);
1342 bsi_insert_before (&bsi
, t
, BSI_SAME_STMT
);
1343 SSA_NAME_DEF_STMT (res
) = t
;
1346 t
= last_stmt (exit
->src
);
1347 /* Make the loop exit if the control condition is not satisfied. */
1348 if (exit
->flags
& EDGE_TRUE_VALUE
)
1352 extract_true_false_edges_from_block (exit
->src
, &te
, &fe
);
1353 te
->flags
= EDGE_FALSE_VALUE
;
1354 fe
->flags
= EDGE_TRUE_VALUE
;
1356 COND_EXPR_COND (t
) = build2 (LT_EXPR
, boolean_type_node
, var_before
, nit
);
1359 /* Moves the exit condition of LOOP to the beginning of its header, and
1360 duplicates the part of the last iteration that gets disabled to the
1361 exit of the loop. NIT is the number of iterations of the loop
1362 (used to initialize the variables in the duplicated part).
1364 TODO: the common case is that latch of the loop is empty and immediatelly
1365 follows the loop exit. In this case, it would be better not to copy the
1366 body of the loop, but only move the entry of the loop directly before the
1367 exit check and increase the number of iterations of the loop by one.
1368 This may need some additional preconditioning in case NIT = ~0.
1369 REDUCTION_LIST describes the reductions in LOOP. */
1372 transform_to_exit_first_loop (struct loop
*loop
, htab_t reduction_list
, tree nit
)
1374 basic_block
*bbs
, *nbbs
, ex_bb
, orig_header
;
1377 edge exit
= single_dom_exit (loop
), hpred
;
1378 tree phi
, nphi
, cond
, control
, control_name
, res
, t
, cond_stmt
;
1379 block_stmt_iterator bsi
;
1381 split_block_after_labels (loop
->header
);
1382 orig_header
= single_succ (loop
->header
);
1383 hpred
= single_succ_edge (loop
->header
);
1385 cond_stmt
= last_stmt (exit
->src
);
1386 cond
= COND_EXPR_COND (cond_stmt
);
1387 control
= TREE_OPERAND (cond
, 0);
1388 gcc_assert (TREE_OPERAND (cond
, 1) == nit
);
1390 /* Make sure that we have phi nodes on exit for all loop header phis
1391 (create_parallel_loop requires that). */
1392 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
1394 res
= PHI_RESULT (phi
);
1395 t
= make_ssa_name (SSA_NAME_VAR (res
), phi
);
1396 SET_PHI_RESULT (phi
, t
);
1398 nphi
= create_phi_node (res
, orig_header
);
1399 SSA_NAME_DEF_STMT (res
) = nphi
;
1400 add_phi_arg (nphi
, t
, hpred
);
1404 TREE_OPERAND (cond
, 0) = t
;
1405 update_stmt (cond_stmt
);
1410 bbs
= get_loop_body_in_dom_order (loop
);
1411 for (n
= 0; bbs
[n
] != exit
->src
; n
++)
1413 nbbs
= XNEWVEC (basic_block
, n
);
1414 ok
= tree_duplicate_sese_tail (single_succ_edge (loop
->header
), exit
,
1421 /* Other than reductions, the only gimple reg that should be copied
1422 out of the loop is the control variable. */
1424 control_name
= NULL_TREE
;
1425 for (phi
= phi_nodes (ex_bb
); phi
; phi
= PHI_CHAIN (phi
))
1427 res
= PHI_RESULT (phi
);
1428 if (!is_gimple_reg (res
))
1431 /* Check if it is a part of reduction. If it is,
1432 keep the phi at the reduction's keep_res field. The
1433 PHI_RESULT of this phi is the resulting value of the reduction
1434 variable when exiting the loop. */
1436 exit
= single_dom_exit (loop
);
1438 if (htab_elements (reduction_list
) > 0)
1440 struct reduction_info
*red
;
1442 tree val
= PHI_ARG_DEF_FROM_EDGE (phi
, exit
);
1444 red
= reduction_phi (reduction_list
, SSA_NAME_DEF_STMT (val
));
1446 red
->keep_res
= phi
;
1449 gcc_assert (control_name
== NULL_TREE
1450 && SSA_NAME_VAR (res
) == SSA_NAME_VAR (control
));
1453 gcc_assert (control_name
!= NULL_TREE
);
1454 phi
= SSA_NAME_DEF_STMT (control_name
);
1455 remove_phi_node (phi
, NULL_TREE
, false);
1457 /* Initialize the control variable to NIT. */
1458 bsi
= bsi_after_labels (ex_bb
);
1459 nit
= force_gimple_operand_bsi (&bsi
,
1460 fold_convert (TREE_TYPE (control_name
), nit
),
1461 false, NULL_TREE
, false, BSI_SAME_STMT
);
1462 t
= build_gimple_modify_stmt (control_name
, nit
);
1463 bsi_insert_before (&bsi
, t
, BSI_NEW_STMT
);
1464 SSA_NAME_DEF_STMT (control_name
) = t
;
1467 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
1468 LOOP_FN and DATA are the arguments of OMP_PARALLEL.
1469 NEW_DATA is the variable that should be initialized from the argument
1470 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
1471 basic block containing OMP_PARALLEL tree. */
1474 create_parallel_loop (struct loop
*loop
, tree loop_fn
, tree data
,
1475 tree new_data
, unsigned n_threads
)
1477 block_stmt_iterator bsi
;
1478 basic_block bb
, paral_bb
, for_bb
, ex_bb
;
1479 tree t
, param
, res
, for_stmt
;
1480 tree cvar
, cvar_init
, initvar
, cvar_next
, cvar_base
, cond
, phi
, type
;
1481 edge exit
, nexit
, guard
, end
, e
;
1483 /* Prepare the OMP_PARALLEL statement. */
1484 bb
= loop_preheader_edge (loop
)->src
;
1485 paral_bb
= single_pred (bb
);
1486 bsi
= bsi_last (paral_bb
);
1488 t
= build_omp_clause (OMP_CLAUSE_NUM_THREADS
);
1489 OMP_CLAUSE_NUM_THREADS_EXPR (t
)
1490 = build_int_cst (integer_type_node
, n_threads
);
1491 t
= build4 (OMP_PARALLEL
, void_type_node
, NULL_TREE
, t
, loop_fn
, data
);
1493 bsi_insert_after (&bsi
, t
, BSI_NEW_STMT
);
1495 /* Initialize NEW_DATA. */
1498 bsi
= bsi_after_labels (bb
);
1500 param
= make_ssa_name (DECL_ARGUMENTS (loop_fn
), NULL_TREE
);
1501 t
= build_gimple_modify_stmt (param
, build_fold_addr_expr (data
));
1502 bsi_insert_before (&bsi
, t
, BSI_SAME_STMT
);
1503 SSA_NAME_DEF_STMT (param
) = t
;
1505 t
= build_gimple_modify_stmt (new_data
,
1506 fold_convert (TREE_TYPE (new_data
),
1508 bsi_insert_before (&bsi
, t
, BSI_SAME_STMT
);
1509 SSA_NAME_DEF_STMT (new_data
) = t
;
1512 /* Emit OMP_RETURN for OMP_PARALLEL. */
1513 bb
= split_loop_exit_edge (single_dom_exit (loop
));
1514 bsi
= bsi_last (bb
);
1515 bsi_insert_after (&bsi
, make_node (OMP_RETURN
), BSI_NEW_STMT
);
1517 /* Extract data for OMP_FOR. */
1518 gcc_assert (loop
->header
== single_dom_exit (loop
)->src
);
1519 cond
= COND_EXPR_COND (last_stmt (loop
->header
));
1521 cvar
= TREE_OPERAND (cond
, 0);
1522 cvar_base
= SSA_NAME_VAR (cvar
);
1523 phi
= SSA_NAME_DEF_STMT (cvar
);
1524 cvar_init
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_preheader_edge (loop
));
1525 initvar
= make_ssa_name (cvar_base
, NULL_TREE
);
1526 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi
, loop_preheader_edge (loop
)),
1528 cvar_next
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
1530 bsi
= bsi_last (loop
->latch
);
1531 gcc_assert (bsi_stmt (bsi
) == SSA_NAME_DEF_STMT (cvar_next
));
1532 bsi_remove (&bsi
, true);
1535 for_bb
= split_edge (loop_preheader_edge (loop
));
1536 ex_bb
= split_loop_exit_edge (single_dom_exit (loop
));
1537 extract_true_false_edges_from_block (loop
->header
, &nexit
, &exit
);
1538 gcc_assert (exit
== single_dom_exit (loop
));
1540 guard
= make_edge (for_bb
, ex_bb
, 0);
1541 single_succ_edge (loop
->latch
)->flags
= 0;
1542 end
= make_edge (loop
->latch
, ex_bb
, EDGE_FALLTHRU
);
1543 for (phi
= phi_nodes (ex_bb
); phi
; phi
= PHI_CHAIN (phi
))
1545 res
= PHI_RESULT (phi
);
1546 gcc_assert (!is_gimple_reg (phi
));
1547 t
= SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi
, exit
));
1548 add_phi_arg (phi
, PHI_ARG_DEF_FROM_EDGE (t
, loop_preheader_edge (loop
)),
1550 add_phi_arg (phi
, PHI_ARG_DEF_FROM_EDGE (t
, loop_latch_edge (loop
)),
1553 e
= redirect_edge_and_branch (exit
, nexit
->dest
);
1554 PENDING_STMT (e
) = NULL
;
1557 TREE_OPERAND (cond
, 0) = cvar_base
;
1558 type
= TREE_TYPE (cvar
);
1559 t
= build_omp_clause (OMP_CLAUSE_SCHEDULE
);
1560 OMP_CLAUSE_SCHEDULE_KIND (t
) = OMP_CLAUSE_SCHEDULE_STATIC
;
1562 for_stmt
= make_node (OMP_FOR
);
1563 TREE_TYPE (for_stmt
) = void_type_node
;
1564 OMP_FOR_CLAUSES (for_stmt
) = t
;
1565 OMP_FOR_INIT (for_stmt
) = build_gimple_modify_stmt (initvar
, cvar_init
);
1566 OMP_FOR_COND (for_stmt
) = cond
;
1567 OMP_FOR_INCR (for_stmt
) = build_gimple_modify_stmt (cvar_base
,
1568 build2 (PLUS_EXPR
, type
,
1572 OMP_FOR_BODY (for_stmt
) = NULL_TREE
;
1573 OMP_FOR_PRE_BODY (for_stmt
) = NULL_TREE
;
1575 bsi
= bsi_last (for_bb
);
1576 bsi_insert_after (&bsi
, for_stmt
, BSI_NEW_STMT
);
1577 SSA_NAME_DEF_STMT (initvar
) = for_stmt
;
1579 /* Emit OMP_CONTINUE. */
1580 bsi
= bsi_last (loop
->latch
);
1581 t
= build2 (OMP_CONTINUE
, void_type_node
, cvar_next
, cvar
);
1582 bsi_insert_after (&bsi
, t
, BSI_NEW_STMT
);
1583 SSA_NAME_DEF_STMT (cvar_next
) = t
;
1585 /* Emit OMP_RETURN for OMP_FOR. */
1586 bsi
= bsi_last (ex_bb
);
1587 bsi_insert_after (&bsi
, make_node (OMP_RETURN
), BSI_NEW_STMT
);
1592 /* Generates code to execute the iterations of LOOP in N_THREADS threads in
1593 parallel. NITER describes number of iterations of LOOP.
1594 REDUCTION_LIST describes the reductions existant in the LOOP. */
1597 gen_parallel_loop (struct loop
*loop
, htab_t reduction_list
,
1598 unsigned n_threads
, struct tree_niter_desc
*niter
)
1602 tree many_iterations_cond
, type
, nit
;
1603 tree stmts
, arg_struct
, new_arg_struct
;
1604 basic_block parallel_head
;
1605 struct clsn_data clsn_data
;
1610 ---------------------------------------------------------------------
1613 IV = phi (INIT, IV + STEP)
1619 ---------------------------------------------------------------------
1621 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1622 we generate the following code:
1624 ---------------------------------------------------------------------
1627 || NITER < MIN_PER_THREAD * N_THREADS)
1631 store all local loop-invariant variables used in body of the loop to DATA.
1632 OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1633 load the variables from DATA.
1634 OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1638 OMP_RETURN -- OMP_FOR
1639 OMP_RETURN -- OMP_PARALLEL
1645 IV = phi (INIT, IV + STEP)
1656 /* Create two versions of the loop -- in the old one, we know that the
1657 number of iterations is large enough, and we will transform it into the
1658 loop that will be split to loop_fn, the new one will be used for the
1659 remaining iterations. */
1661 type
= TREE_TYPE (niter
->niter
);
1662 nit
= force_gimple_operand (unshare_expr (niter
->niter
), &stmts
, true,
1665 bsi_insert_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
1667 many_iterations_cond
=
1668 fold_build2 (GE_EXPR
, boolean_type_node
,
1669 nit
, build_int_cst (type
, MIN_PER_THREAD
* n_threads
));
1670 many_iterations_cond
1671 = fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
1672 invert_truthvalue (unshare_expr (niter
->may_be_zero
)),
1673 many_iterations_cond
);
1674 many_iterations_cond
1675 = force_gimple_operand (many_iterations_cond
, &stmts
, false, NULL_TREE
);
1677 bsi_insert_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
1678 if (!is_gimple_condexpr (many_iterations_cond
))
1680 many_iterations_cond
1681 = force_gimple_operand (many_iterations_cond
, &stmts
,
1684 bsi_insert_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
1687 initialize_original_copy_tables ();
1689 /* We assume that the loop usually iterates a lot. */
1690 prob
= 4 * REG_BR_PROB_BASE
/ 5;
1691 nloop
= loop_version (loop
, many_iterations_cond
, NULL
,
1692 prob
, prob
, REG_BR_PROB_BASE
- prob
, true);
1693 update_ssa (TODO_update_ssa
);
1694 free_original_copy_tables ();
1696 /* Base all the induction variables in LOOP on a single control one. */
1697 canonicalize_loop_ivs (loop
, reduction_list
, nit
);
1699 /* Ensure that the exit condition is the first statement in the loop. */
1700 transform_to_exit_first_loop (loop
, reduction_list
, nit
);
1703 /* Generate intializations for reductions. */
1705 if (htab_elements (reduction_list
) > 0)
1706 htab_traverse (reduction_list
, initialize_reductions
, loop
);
1708 /* Eliminate the references to local variables from the loop. */
1709 eliminate_local_variables (loop
);
1711 /* In the old loop, move all variables non-local to the loop to a structure
1712 and back, and create separate decls for the variables used in loop. */
1713 separate_decls_in_loop (loop
, reduction_list
, &arg_struct
, &new_arg_struct
, &clsn_data
);
1715 /* Create the parallel constructs. */
1716 parallel_head
= create_parallel_loop (loop
, create_loop_fn (), arg_struct
,
1717 new_arg_struct
, n_threads
);
1718 if (htab_elements (reduction_list
) > 0)
1719 create_call_for_reduction (loop
, reduction_list
, &clsn_data
);
1723 /* Cancel the loop (it is simpler to do it here rather than to teach the
1724 expander to do it). */
1725 cancel_loop_tree (loop
);
1727 /* Free loop bound estimations that could contain references to
1728 removed statements. */
1729 FOR_EACH_LOOP (li
, loop
, 0)
1730 free_numbers_of_iterations_estimates_loop (loop
);
1732 /* Expand the parallel constructs. We do it directly here instead of running
1733 a separate expand_omp pass, since it is more efficient, and less likely to
1734 cause troubles with further analyses not being able to deal with the
1737 omp_expand_local (parallel_head
);
1740 /* Detect parallel loops and generate parallel code using libgomp
1741 primitives. Returns true if some loop was parallelized, false
1745 parallelize_loops (void)
1747 unsigned n_threads
= flag_tree_parallelize_loops
;
1748 bool changed
= false;
1750 struct tree_niter_desc niter_desc
;
1752 htab_t reduction_list
;
1754 /* Do not parallelize loops in the functions created by parallelization. */
1755 if (parallelized_function_p (cfun
->decl
))
1758 reduction_list
= htab_create (10, reduction_info_hash
,
1759 reduction_info_eq
, free
);
1761 FOR_EACH_LOOP (li
, loop
, 0)
1763 htab_empty (reduction_list
);
1764 if (/* Do not bother with loops in cold areas. */
1765 !maybe_hot_bb_p (loop
->header
)
1766 /* Or loops that roll too little. */
1767 || expected_loop_iterations (loop
) <= n_threads
1768 /* And of course, the loop must be parallelizable. */
1769 || !can_duplicate_loop_p (loop
)
1770 || loop_has_blocks_with_irreducible_flag (loop
)
1771 || !loop_parallel_p (loop
, reduction_list
, &niter_desc
))
1775 gen_parallel_loop (loop
, reduction_list
, n_threads
, &niter_desc
);
1776 verify_flow_info ();
1777 verify_dominators (CDI_DOMINATORS
);
1778 verify_loop_structure ();
1779 verify_loop_closed_ssa ();
1782 htab_delete (reduction_list
);
1786 #include "gt-tree-parloops.h"