gcc/tree-parloops.c
1 /* Loop autoparallelization.
2 Copyright (C) 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "rtl.h"
29 #include "tree-flow.h"
30 #include "cfgloop.h"
31 #include "ggc.h"
32 #include "tree-data-ref.h"
33 #include "diagnostic.h"
34 #include "tree-pass.h"
35 #include "tree-scalar-evolution.h"
36 #include "hashtab.h"
37 #include "langhooks.h"
38 #include "tree-vectorizer.h"
40 /* This pass tries to distribute iterations of loops into several threads.
41 The implementation is straightforward -- for each loop we test whether its
42    iterations are independent, and if that is the case (and some additional
43    conditions regarding profitability and correctness are satisfied), we
44    add OMP_PARALLEL and OMP_FOR codes and let the omp expansion machinery do
45 its job.
47    Most of the complexity is in bringing the code into the shape expected
48    by the omp expanders:
49 -- for OMP_FOR, ensuring that the loop has only one induction variable
50 and that the exit test is at the start of the loop body
51 -- for OMP_PARALLEL, replacing the references to local addressable
52 variables by accesses through pointers, and breaking up ssa chains
53 by storing the values incoming to the parallelized loop to a structure
54 passed to the new function as an argument (something similar is done
55 in omp gimplification, unfortunately only a small part of the code
56 can be shared).
58 TODO:
59 -- if there are several parallelizable loops in a function, it may be
60 possible to generate the threads just once (using synchronization to
61 ensure that cross-loop dependences are obeyed).
62 -- handling of common scalar dependence patterns (accumulation, ...)
63 -- handling of non-innermost loops */
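/* As a purely illustrative sketch (the code below is hypothetical user
   source, not part of this file), a loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   has independent iterations, so this pass wraps it in OMP_PARALLEL and
   OMP_FOR and lets the omp expander outline the body into a separate
   thread function.  The comment below walks through the more involved
   case of a reduction.  */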
65 /*
66 Reduction handling:
67 currently we use vect_is_simple_reduction() to detect reduction patterns.
68 The code transformation is illustrated below with an example.
71 parloop
73 int sum=1;
75 for (i = 0; i < N; i++)
77 x[i] = i + 3;
78 sum+=x[i];
82 gimple-like code:
83 header_bb:
85 # sum_29 = PHI <sum_11(5), 1(3)>
86 # i_28 = PHI <i_12(5), 0(3)>
87 D.1795_8 = i_28 + 3;
88 x[i_28] = D.1795_8;
89 sum_11 = D.1795_8 + sum_29;
90 i_12 = i_28 + 1;
91 if (N_6(D) > i_12)
92 goto header_bb;
95 exit_bb:
97 # sum_21 = PHI <sum_11(4)>
98 printf (&"%d"[0], sum_21);
101 after reduction transformation (only relevant parts):
103 parloop
106 ....
109   # Storing the initial value given by the user.  #
111 .paral_data_store.32.sum.27 = 1;
113 #pragma omp parallel num_threads(4)
115 #pragma omp for schedule(static)
117 # The neutral element corresponding to the particular
118 reduction's operation, e.g. 0 for PLUS_EXPR,
119 1 for MULT_EXPR, etc. replaces the user's initial value. #
121 # sum.27_29 = PHI <sum.27_11, 0>
123 sum.27_11 = D.1827_8 + sum.27_29;
125 OMP_CONTINUE
127 # Adding this reduction phi is done at create_phi_for_local_result() #
128 # sum.27_56 = PHI <sum.27_11, 0>
129 OMP_RETURN
131 # Creating the atomic operation is done at
132 create_call_for_reduction_1() #
134 #pragma omp atomic_load
135 D.1839_59 = *&.paral_data_load.33_51->reduction.23;
136 D.1840_60 = sum.27_56 + D.1839_59;
137 #pragma omp atomic_store (D.1840_60);
139 OMP_RETURN
141 # collecting the result after the join of the threads is done at
142 create_loads_for_reductions().
143 The value computed by the threads is loaded from the
144 shared struct. #
147 .paral_data_load.33_52 = &.paral_data_store.32;
148 sum_37 = .paral_data_load.33_52->sum.27;
149 sum_43 = D.1795_41 + sum_37;
151 exit bb:
152 # sum_21 = PHI <sum_43, sum_26>
153   printf (&"%d"[0], sum_21);
*/
161 /* Minimal number of iterations of a loop that should be executed in each
162 thread. */
163 #define MIN_PER_THREAD 100
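/* Illustrative note: gen_parallel_loop below only enters the parallel
   version when NITER >= MIN_PER_THREAD * N_THREADS holds at run time.
   For instance, assuming four threads are requested, a loop has to run
   at least 100 * 4 = 400 iterations for the parallel path to be taken.  */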
165 /* Element of the hashtable, representing a
166 reduction in the current loop. */
167 struct reduction_info
169 tree reduc_stmt; /* reduction statement. */
170 tree reduc_phi; /* The phi node defining the reduction. */
171 enum tree_code reduction_code; /* code for the reduction operation. */
172 tree keep_res; /* The PHI_RESULT of this phi is the resulting value
173                      of the reduction variable when exiting the loop. */
174 tree initial_value; /* The initial value of the reduction var before entering the loop. */
175 tree field; /* the name of the field in the parloop data structure intended for reduction. */
176 tree init; /* reduction initialization value. */
177 tree new_phi; /* (helper field) Newly created phi node whose result
178 will be passed to the atomic operation. Represents
179 the local result each thread computed for the reduction
180 operation. */
183 /* Equality and hash functions for hashtab code. */
185 static int
186 reduction_info_eq (const void *aa, const void *bb)
188 const struct reduction_info *a = (const struct reduction_info *) aa;
189 const struct reduction_info *b = (const struct reduction_info *) bb;
191 return (a->reduc_phi == b->reduc_phi);
194 static hashval_t
195 reduction_info_hash (const void *aa)
197 const struct reduction_info *a = (const struct reduction_info *) aa;
199 return htab_hash_pointer (a->reduc_phi);
202 static struct reduction_info *
203 reduction_phi (htab_t reduction_list, tree phi)
205 struct reduction_info tmpred, *red;
207 if (htab_elements (reduction_list) == 0)
208 return NULL;
210 tmpred.reduc_phi = phi;
211 red = htab_find (reduction_list, &tmpred);
213 return red;
216 /* Element of hashtable of names to copy. */
218 struct name_to_copy_elt
220 unsigned version; /* The version of the name to copy. */
221 tree new_name; /* The new name used in the copy. */
222 tree field; /* The field of the structure used to pass the
223 value. */
226 /* Equality and hash functions for hashtab code. */
228 static int
229 name_to_copy_elt_eq (const void *aa, const void *bb)
231 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
232 const struct name_to_copy_elt *b = (const struct name_to_copy_elt *) bb;
234 return a->version == b->version;
237 static hashval_t
238 name_to_copy_elt_hash (const void *aa)
240 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
242 return (hashval_t) a->version;
245 /* Returns true if the iterations of LOOP are independent of each other (that
246    is, if we can execute them in parallel), and if LOOP satisfies other
247    conditions that we need to be able to parallelize it.  The description of the
248    number of iterations is stored to NITER.  Reduction analysis is done; if
249    reductions are found, they are inserted into REDUCTION_LIST.  */
251 static bool
252 loop_parallel_p (struct loop *loop, htab_t reduction_list, struct tree_niter_desc *niter)
254 edge exit = single_dom_exit (loop);
255 VEC (ddr_p, heap) * dependence_relations;
256 VEC (data_reference_p, heap) * datarefs;
257 lambda_trans_matrix trans;
258 bool ret = false;
259 tree phi;
260 loop_vec_info simple_loop_info;
262 /* Only consider innermost loops with just one exit. The innermost-loop
263 restriction is not necessary, but it makes things simpler. */
264 if (loop->inner || !exit)
265 return false;
267 if (dump_file && (dump_flags & TDF_DETAILS))
268 fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
270   /* We need to know the # of iterations, and there should be no uses of values
271      defined inside the loop outside of it, unless the values are invariants of
272 the loop. */
273 if (!number_of_iterations_exit (loop, exit, niter, false))
275 if (dump_file && (dump_flags & TDF_DETAILS))
276 fprintf (dump_file, " FAILED: number of iterations not known\n");
277 return false;
280 simple_loop_info = vect_analyze_loop_form (loop);
282 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
284 tree reduc_stmt = NULL, operation;
286 /* ??? TODO: Change this into a generic function that
287 recognizes reductions. */
288 if (!is_gimple_reg (PHI_RESULT (phi)))
289 continue;
290 if (simple_loop_info)
291 reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi);
293       /* Create a reduction_info struct, initialize it and insert it into
294 the reduction list. */
296 if (reduc_stmt)
298 PTR *slot;
299 struct reduction_info *new_reduction;
301 if (dump_file && (dump_flags & TDF_DETAILS))
303 fprintf (dump_file,
304 "Detected reduction. reduction stmt is: \n");
305 print_generic_stmt (dump_file, reduc_stmt, 0);
306 fprintf (dump_file, "\n");
309 new_reduction = XCNEW (struct reduction_info);
311 new_reduction->reduc_stmt = reduc_stmt;
312 new_reduction->reduc_phi = phi;
313 operation = GIMPLE_STMT_OPERAND (reduc_stmt, 1);
314 new_reduction->reduction_code = TREE_CODE (operation);
315 slot = htab_find_slot (reduction_list, new_reduction, INSERT);
316 *slot = new_reduction;
320 /* Get rid of the information created by the vectorizer functions. */
321 destroy_loop_vec_info (simple_loop_info, true);
323 for (phi = phi_nodes (exit->dest); phi; phi = PHI_CHAIN (phi))
325 struct reduction_info *red;
326 imm_use_iterator imm_iter;
327 use_operand_p use_p;
328 tree reduc_phi;
330 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
332 if (is_gimple_reg (val))
334 if (dump_file && (dump_flags & TDF_DETAILS))
336 fprintf (dump_file, "phi is ");
337 print_generic_expr (dump_file, phi, 0);
338 fprintf (dump_file, "arg of phi to exit: value ");
339 print_generic_expr (dump_file, val, 0);
340 fprintf (dump_file, " used outside loop\n");
341 fprintf (dump_file,
342                      " checking if it is a part of reduction pattern: \n");
344 if (htab_elements (reduction_list) == 0)
346 if (dump_file && (dump_flags & TDF_DETAILS))
347 fprintf (dump_file,
348 " FAILED: it is not a part of reduction.\n");
349 return false;
351 reduc_phi = NULL;
352 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
354 if (flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
356 reduc_phi = USE_STMT (use_p);
357 break;
360 red = reduction_phi (reduction_list, reduc_phi);
361 if (red == NULL)
363 if (dump_file && (dump_flags & TDF_DETAILS))
364 fprintf (dump_file,
365 " FAILED: it is not a part of reduction.\n");
366 return false;
368 if (dump_file && (dump_flags & TDF_DETAILS))
370 fprintf (dump_file, "reduction phi is ");
371 print_generic_expr (dump_file, red->reduc_phi, 0);
372 fprintf (dump_file, "reduction stmt is ");
373 print_generic_expr (dump_file, red->reduc_stmt, 0);
379 /* The iterations of the loop may communicate only through bivs whose
380 iteration space can be distributed efficiently. */
381 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
383 tree def = PHI_RESULT (phi);
384 affine_iv iv;
386 if (is_gimple_reg (def) && !simple_iv (loop, phi, def, &iv, true))
388 struct reduction_info *red;
390 red = reduction_phi (reduction_list, phi);
391 if (red == NULL)
393 if (dump_file && (dump_flags & TDF_DETAILS))
394 fprintf (dump_file,
395 " FAILED: scalar dependency between iterations\n");
396 return false;
401   /* We need to version the loop to verify assumptions at run time.  */
402 if (!can_duplicate_loop_p (loop))
404 if (dump_file && (dump_flags & TDF_DETAILS))
405 fprintf (dump_file, " FAILED: cannot be duplicated\n");
406 return false;
409 /* Check for problems with dependences. If the loop can be reversed,
410 the iterations are independent. */
411 datarefs = VEC_alloc (data_reference_p, heap, 10);
412 dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
413 compute_data_dependences_for_loop (loop, true, &datarefs,
414 &dependence_relations);
415 if (dump_file && (dump_flags & TDF_DETAILS))
416 dump_data_dependence_relations (dump_file, dependence_relations);
418 trans = lambda_trans_matrix_new (1, 1);
419 LTM_MATRIX (trans)[0][0] = -1;
421 if (lambda_transform_legal_p (trans, 1, dependence_relations))
423 ret = true;
424 if (dump_file && (dump_flags & TDF_DETAILS))
425 fprintf (dump_file, " SUCCESS: may be parallelized\n");
427 else if (dump_file && (dump_flags & TDF_DETAILS))
428 fprintf (dump_file,
429 " FAILED: data dependencies exist across iterations\n");
431 free_dependence_relations (dependence_relations);
432 free_data_refs (datarefs);
434 return ret;
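/* Illustration of the dependence test used above: the 1x1 lambda matrix
   {{-1}} asks whether reversing the loop is a legal transformation.  If it
   is, no iteration depends on an earlier one, so the iterations may be
   executed in parallel.  With hypothetical user code, a loop whose body is

     a[i] = a[i] + 1;

   passes the test, while

     a[i] = a[i - 1] + 1;

   carries a dependence between consecutive iterations and is rejected.  */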
437 /* Return true when LOOP contains basic blocks marked with the
438 BB_IRREDUCIBLE_LOOP flag. */
440 static inline bool
441 loop_has_blocks_with_irreducible_flag (struct loop *loop)
443 unsigned i;
444 basic_block *bbs = get_loop_body_in_dom_order (loop);
445 bool res = true;
447 for (i = 0; i < loop->num_nodes; i++)
448 if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
449 goto end;
451 res = false;
452 end:
453 free (bbs);
454 return res;
457 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
458 The assignment statement is placed before LOOP. DECL_ADDRESS maps decls
459 to their addresses that can be reused. The address of OBJ is known to
460 be invariant in the whole function. */
462 static tree
463 take_address_of (tree obj, tree type, struct loop *loop, htab_t decl_address)
465 int uid;
466 void **dslot;
467 struct int_tree_map ielt, *nielt;
468 tree *var_p, name, bvar, stmt, addr;
469 edge entry = loop_preheader_edge (loop);
471 /* Since the address of OBJ is invariant, the trees may be shared.
472 Avoid rewriting unrelated parts of the code. */
473 obj = unshare_expr (obj);
474 for (var_p = &obj;
475 handled_component_p (*var_p);
476 var_p = &TREE_OPERAND (*var_p, 0))
477 continue;
478 uid = DECL_UID (*var_p);
480 ielt.uid = uid;
481 dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
482 if (!*dslot)
484 addr = build_addr (*var_p, current_function_decl);
485 bvar = create_tmp_var (TREE_TYPE (addr), get_name (*var_p));
486 add_referenced_var (bvar);
487 stmt = build_gimple_modify_stmt (bvar, addr);
488 name = make_ssa_name (bvar, stmt);
489 GIMPLE_STMT_OPERAND (stmt, 0) = name;
490 bsi_insert_on_edge_immediate (entry, stmt);
492 nielt = XNEW (struct int_tree_map);
493 nielt->uid = uid;
494 nielt->to = name;
495 *dslot = nielt;
497 else
498 name = ((struct int_tree_map *) *dslot)->to;
500 if (var_p != &obj)
502 *var_p = build1 (INDIRECT_REF, TREE_TYPE (*var_p), name);
503 name = force_gimple_operand (build_addr (obj, current_function_decl),
504 &stmt, true, NULL_TREE);
505 if (stmt)
506 bsi_insert_on_edge_immediate (entry, stmt);
509 if (TREE_TYPE (name) != type)
511 name = force_gimple_operand (fold_convert (type, name), &stmt, true,
512 NULL_TREE);
513 if (stmt)
514 bsi_insert_on_edge_immediate (entry, stmt);
517 return name;
520 /* Callback for htab_traverse. Create the initialization statement
521 for reduction described in SLOT, and place it at the preheader of
522 the loop described in DATA. */
524 static int
525 initialize_reductions (void **slot, void *data)
527 tree init, c;
528 tree bvar, type, arg;
529 edge e;
531 struct reduction_info *reduc = *slot;
532 struct loop *loop = (struct loop *) data;
534 /* Create initialization in preheader:
535 reduction_variable = initialization value of reduction. */
537 /* In the phi node at the header, replace the argument coming
538 from the preheader with the reduction initialization value. */
540 /* Create a new variable to initialize the reduction. */
541 type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
542 bvar = create_tmp_var (type, "reduction");
543 add_referenced_var (bvar);
545 c = build_omp_clause (OMP_CLAUSE_REDUCTION);
546 OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
547 OMP_CLAUSE_DECL (c) =
548 SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0));
550 init = omp_reduction_init (c, TREE_TYPE (bvar));
551 reduc->init = init;
553 /* Replace the argument representing the initialization value
554 with the initialization value for the reduction (neutral
555 element for the particular operation, e.g. 0 for PLUS_EXPR,
556 1 for MULT_EXPR, etc).
557 Keep the old value in a new variable "reduction_initial",
558      which will be taken into consideration after the parallel
559      computation is done.  */
561 e = loop_preheader_edge (loop);
562 arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
563 /* Create new variable to hold the initial value. */
565 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
566 (reduc->reduc_phi, loop_preheader_edge (loop)), init);
567 reduc->initial_value = arg;
568 return 1;
571 struct elv_data
573 struct loop *loop;
574 htab_t decl_address;
575 bool changed;
578 /* Eliminates references to local variables in *TP out of LOOP. DECL_ADDRESS
579 contains addresses of the references that had their address taken already.
580 If the expression is changed, CHANGED is set to true. Callback for
581 walk_tree. */
583 static tree
584 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
586 struct elv_data *dta = data;
587 tree t = *tp, var, addr, addr_type, type, obj;
589 if (DECL_P (t))
591 *walk_subtrees = 0;
593 if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
594 return NULL_TREE;
596 type = TREE_TYPE (t);
597 addr_type = build_pointer_type (type);
598 addr = take_address_of (t, addr_type, dta->loop, dta->decl_address);
599 *tp = build1 (INDIRECT_REF, TREE_TYPE (*tp), addr);
601 dta->changed = true;
602 return NULL_TREE;
605 if (TREE_CODE (t) == ADDR_EXPR)
607 /* ADDR_EXPR may appear in two contexts:
608 -- as a gimple operand, when the address taken is a function invariant
609      -- as gimple rhs, when the resulting address is not a function
610 invariant
611 We do not need to do anything special in the latter case (the base of
612 the memory reference whose address is taken may be replaced in the
613 DECL_P case). The former case is more complicated, as we need to
614 ensure that the new address is still a gimple operand. Thus, it
615 is not sufficient to replace just the base of the memory reference --
616 we need to move the whole computation of the address out of the
617 loop. */
618 if (!is_gimple_val (t))
619 return NULL_TREE;
621 *walk_subtrees = 0;
622 obj = TREE_OPERAND (t, 0);
623 var = get_base_address (obj);
624 if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
625 return NULL_TREE;
627 addr_type = TREE_TYPE (t);
628 addr = take_address_of (obj, addr_type, dta->loop, dta->decl_address);
629 *tp = addr;
631 dta->changed = true;
632 return NULL_TREE;
635 if (!EXPR_P (t) && !GIMPLE_STMT_P (t))
636 *walk_subtrees = 0;
638 return NULL_TREE;
641 /* Moves the references to local variables in STMT out of LOOP.  DECL_ADDRESS
642    contains the addresses of the references whose address has already been
643    taken.  */
645 static void
646 eliminate_local_variables_stmt (struct loop *loop, tree stmt,
647 htab_t decl_address)
649 struct elv_data dta;
651 dta.loop = loop;
652 dta.decl_address = decl_address;
653 dta.changed = false;
655 walk_tree (&stmt, eliminate_local_variables_1, &dta, NULL);
657 if (dta.changed)
658 update_stmt (stmt);
661 /* Eliminates the references to local variables from LOOP.
662 This includes:
663 1) Taking address of a local variable -- these are moved out of the
664 loop (and temporary variable is created to hold the address if
665 necessary).
666 2) Dereferencing a local variable -- these are replaced with indirect
667 references. */
669 static void
670 eliminate_local_variables (struct loop *loop)
672 basic_block bb, *body = get_loop_body (loop);
673 unsigned i;
674 block_stmt_iterator bsi;
675 htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
676 free);
678   /* Eliminate the references to local variables in each statement of the loop.  */
679 for (i = 0; i < loop->num_nodes; i++)
681 bb = body[i];
683 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
684 eliminate_local_variables_stmt (loop, bsi_stmt (bsi), decl_address);
687 htab_delete (decl_address);
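/* Illustrative sketch (variable names are hypothetical): if a local
   addressable variable x is assigned in the loop body as

     x = 3;

   then an address load such as

     x.addr_1 = &x;

   is emitted on the preheader edge (see take_address_of) and the body is
   rewritten to

     *x.addr_1 = 3;

   so the loop no longer refers to the local decl directly once it is
   outlined into a separate function.  */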
690 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
691    The copies are stored to NAME_COPIES; if NAME was already duplicated,
692 its duplicate stored in NAME_COPIES is returned.
694 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
695 duplicated, storing the copies in DECL_COPIES. */
697 static tree
698 separate_decls_in_loop_name (tree name,
699 htab_t name_copies, htab_t decl_copies,
700 bool copy_name_p)
702 tree copy, var, var_copy;
703 unsigned idx, uid, nuid;
704 struct int_tree_map ielt, *nielt;
705 struct name_to_copy_elt elt, *nelt;
706 void **slot, **dslot;
708 if (TREE_CODE (name) != SSA_NAME)
709 return name;
711 idx = SSA_NAME_VERSION (name);
712 elt.version = idx;
713 slot = htab_find_slot_with_hash (name_copies, &elt, idx,
714 copy_name_p ? INSERT : NO_INSERT);
715 if (slot && *slot)
716 return ((struct name_to_copy_elt *) *slot)->new_name;
718 var = SSA_NAME_VAR (name);
719 uid = DECL_UID (var);
720 ielt.uid = uid;
721 dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
722 if (!*dslot)
724 var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
725 DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
726 add_referenced_var (var_copy);
727 nielt = XNEW (struct int_tree_map);
728 nielt->uid = uid;
729 nielt->to = var_copy;
730 *dslot = nielt;
732 /* Ensure that when we meet this decl next time, we won't duplicate
733 it again. */
734 nuid = DECL_UID (var_copy);
735 ielt.uid = nuid;
736 dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
737 gcc_assert (!*dslot);
738 nielt = XNEW (struct int_tree_map);
739 nielt->uid = nuid;
740 nielt->to = var_copy;
741 *dslot = nielt;
743 else
744 var_copy = ((struct int_tree_map *) *dslot)->to;
746 if (copy_name_p)
748 copy = duplicate_ssa_name (name, NULL_TREE);
749 nelt = XNEW (struct name_to_copy_elt);
750 nelt->version = idx;
751 nelt->new_name = copy;
752 nelt->field = NULL_TREE;
753 *slot = nelt;
755 else
757 gcc_assert (!slot);
758 copy = name;
761 SSA_NAME_VAR (copy) = var_copy;
762 return copy;
765 /* Finds the ssa names used in STMT that are defined outside of LOOP and
766 replaces such ssa names with their duplicates. The duplicates are stored to
767 NAME_COPIES. Base decls of all ssa names used in STMT
768 (including those defined in LOOP) are replaced with the new temporary
769 variables; the replacement decls are stored in DECL_COPIES. */
771 static void
772 separate_decls_in_loop_stmt (struct loop *loop, tree stmt,
773 htab_t name_copies, htab_t decl_copies)
775 use_operand_p use;
776 def_operand_p def;
777 ssa_op_iter oi;
778 tree name, copy;
779 bool copy_name_p;
781 mark_virtual_ops_for_renaming (stmt);
783 FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
785 name = DEF_FROM_PTR (def);
786 gcc_assert (TREE_CODE (name) == SSA_NAME);
787 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
788 false);
789 gcc_assert (copy == name);
792 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
794 name = USE_FROM_PTR (use);
795 if (TREE_CODE (name) != SSA_NAME)
796 continue;
798 copy_name_p = expr_invariant_in_loop_p (loop, name);
799 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
800 copy_name_p);
801 SET_USE (use, copy);
805 /* Callback for htab_traverse. Adds a field corresponding to the reduction
806 specified in SLOT. The type is passed in DATA. */
808 static int
809 add_field_for_reduction (void **slot, void *data)
812 struct reduction_info *red = *slot;
813 tree type = data;
814 tree var = SSA_NAME_VAR (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
815 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
817 insert_field_into_struct (type, field);
819 red->field = field;
821 return 1;
824 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
825 described in SLOT. The type is passed in DATA. */
827 static int
828 add_field_for_name (void **slot, void *data)
830 struct name_to_copy_elt *elt = *slot;
831 tree type = data;
832 tree name = ssa_name (elt->version);
833 tree var = SSA_NAME_VAR (name);
834 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
836 insert_field_into_struct (type, field);
837 elt->field = field;
839 return 1;
842 /* Callback for htab_traverse. A local result is the intermediate result
843    computed by a single thread, or the initial value in case no iteration
844    was executed.
845 This function creates a phi node reflecting these values.
846 The phi's result will be stored in NEW_PHI field of the
847 reduction's data structure. */
849 static int
850 create_phi_for_local_result (void **slot, void *data)
852 struct reduction_info *reduc = *slot;
853 struct loop *loop = data;
854 edge e;
855 tree new_phi;
856 basic_block store_bb;
857 tree local_res;
859 /* STORE_BB is the block where the phi
860 should be stored. It is the destination of the loop exit.
861 (Find the fallthru edge from OMP_CONTINUE). */
862 store_bb = FALLTHRU_EDGE (loop->latch)->dest;
864   /* STORE_BB has two predecessors.  One comes from the loop
865      (where the reduction's result is computed),
866      and the other comes from a block preceding the loop,
867      which is taken when no iterations
868      are executed (in that case the initial value should be used).  */
869 if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch))
870 e = EDGE_PRED (store_bb, 1);
871 else
872 e = EDGE_PRED (store_bb, 0);
873 local_res = make_ssa_name (SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0)), NULL_TREE);
874 new_phi = create_phi_node (local_res, store_bb);
875 SSA_NAME_DEF_STMT (local_res) = new_phi;
876 add_phi_arg (new_phi, reduc->init, e);
877 add_phi_arg (new_phi, GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0),
878 FALLTHRU_EDGE (loop->latch));
879 reduc->new_phi = new_phi;
881 return 1;
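/* In the running example from the comment at the top of this file, this
   callback creates the phi node

     sum.27_56 = PHI <sum.27_11, 0>

   where one argument is the partial result the thread computed inside the
   loop and the other is the neutral value used when the thread executed no
   iterations.  */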
884 struct clsn_data
886 tree store;
887 tree load;
889 basic_block store_bb;
890 basic_block load_bb;
893 /* Callback for htab_traverse. Create an atomic instruction for the
894 reduction described in SLOT.
895 DATA annotates the place in memory the atomic operation relates to,
896 and the basic block it needs to be generated in. */
898 static int
899 create_call_for_reduction_1 (void **slot, void *data)
901 struct reduction_info *reduc = *slot;
902 struct clsn_data *clsn_data = data;
903 block_stmt_iterator bsi;
904 tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
905 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
906 tree load_struct;
907 basic_block bb;
908 basic_block new_bb;
909 edge e;
910 tree t, addr, addr_type, ref, x;
911 tree tmp_load, load, name;
913 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
914 t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
915 addr_type = build_pointer_type (type);
917 addr = build_addr (t, current_function_decl);
919 /* Create phi node. */
920 bb = clsn_data->load_bb;
922 e = split_block (bb, t);
923 new_bb = e->dest;
925 tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL);
926 add_referenced_var (tmp_load);
927 tmp_load = make_ssa_name (tmp_load, NULL);
928 load = build2 (OMP_ATOMIC_LOAD, void_type_node, tmp_load, addr);
929 SSA_NAME_DEF_STMT (tmp_load) = load;
930 bsi = bsi_start (new_bb);
931 bsi_insert_after (&bsi, load, BSI_NEW_STMT);
933 e = split_block (new_bb, load);
934 new_bb = e->dest;
935 bsi = bsi_start (new_bb);
936 ref = tmp_load;
937   x =
938     fold_build2 (reduc->reduction_code,
939 TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
940 PHI_RESULT (reduc->new_phi));
942 name =
943 force_gimple_operand_bsi (&bsi, x, true, NULL_TREE, true,
944 BSI_CONTINUE_LINKING);
946 x = build1 (OMP_ATOMIC_STORE, void_type_node, name);
948 bsi_insert_after (&bsi, x, BSI_NEW_STMT);
949 return 1;
952 /* Create the atomic operation at the join point of the threads.
953    REDUCTION_LIST describes the reductions in LOOP.
954    LD_ST_DATA describes the shared data structure that the
955    shared data is stored to and loaded from.  */
956 static void
957 create_call_for_reduction (struct loop *loop, htab_t reduction_list,
958 struct clsn_data *ld_st_data)
960 htab_traverse (reduction_list, create_phi_for_local_result, loop);
961 /* Find the fallthru edge from OMP_CONTINUE. */
962 ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest;
963 htab_traverse (reduction_list, create_call_for_reduction_1, ld_st_data);
966 /* Callback for htab_traverse. Loads the final reduction value at the
967 join point of all threads, and inserts it in the right place. */
969 static int
970 create_loads_for_reductions (void **slot, void *data)
972 struct reduction_info *red = *slot;
973 struct clsn_data *clsn_data = data;
974 tree stmt;
975 block_stmt_iterator bsi;
976 tree type = TREE_TYPE (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
977 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
978 tree load_struct;
979 tree name;
980 tree x;
982 bsi = bsi_after_labels (clsn_data->load_bb);
983 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
984 load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
985 NULL_TREE);
987 x = load_struct;
988 name = PHI_RESULT (red->keep_res);
989 stmt = build_gimple_modify_stmt (name, x);
990 GIMPLE_STMT_OPERAND (stmt, 0) = name;
991 SSA_NAME_DEF_STMT (name) = stmt;
993 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
995 remove_phi_node (red->keep_res, NULL_TREE, false);
997 return 1;
1000 /* Load the reduction result that was stored in LD_ST_DATA.
1001 REDUCTION_LIST describes the list of reductions that the
1002    loads should be generated for.  */
1003 static void
1004 create_final_loads_for_reduction (htab_t reduction_list,
1005 struct clsn_data *ld_st_data)
1007 block_stmt_iterator bsi;
1008 tree t;
1010 bsi = bsi_after_labels (ld_st_data->load_bb);
1011 t = build_fold_addr_expr (ld_st_data->store);
1012   t =
1013     build_gimple_modify_stmt (ld_st_data->load,
1014 build_fold_addr_expr (ld_st_data->store));
1016 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
1017 SSA_NAME_DEF_STMT (ld_st_data->load) = t;
1018 GIMPLE_STMT_OPERAND (t, 0) = ld_st_data->load;
1020 htab_traverse (reduction_list, create_loads_for_reductions, ld_st_data);
1024 /* Callback for htab_traverse.  Store the initial value of the reduction
1025    (the value supplied by the user before the loop, kept in the
1026    initial_value field) into the reduction field of the shared structure.
1027 The reduction is specified in SLOT. The store information is
1028 passed in DATA. */
1030 static int
1031 create_stores_for_reduction (void **slot, void *data)
1033 struct reduction_info *red = *slot;
1034 struct clsn_data *clsn_data = data;
1035 tree stmt;
1036 block_stmt_iterator bsi;
1037 tree type = TREE_TYPE (GIMPLE_STMT_OPERAND (red->reduc_stmt, 0));
1039 bsi = bsi_last (clsn_data->store_bb);
1040 stmt =
1041 build_gimple_modify_stmt (build3
1042 (COMPONENT_REF, type, clsn_data->store,
1043 red->field, NULL_TREE),
1044 red->initial_value);
1045 mark_virtual_ops_for_renaming (stmt);
1046 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1048 return 1;
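/* In the running example from the comment at the top of this file, this
   callback emits the store

     .paral_data_store.32.sum.27 = 1;

   i.e. the initial value of the reduction variable supplied by the user is
   written into the shared structure before the threads are started.  */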
1051 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
1052    a store to a field of STORE in STORE_BB for the ssa name and its duplicate
1053 specified in SLOT. */
1055 static int
1056 create_loads_and_stores_for_name (void **slot, void *data)
1058 struct name_to_copy_elt *elt = *slot;
1059 struct clsn_data *clsn_data = data;
1060 tree stmt;
1061 block_stmt_iterator bsi;
1062 tree type = TREE_TYPE (elt->new_name);
1063 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
1064 tree load_struct;
1066 bsi = bsi_last (clsn_data->store_bb);
1067 stmt =
1068 build_gimple_modify_stmt (build3
1069 (COMPONENT_REF, type, clsn_data->store,
1070 elt->field, NULL_TREE),
1071 ssa_name (elt->version));
1072 mark_virtual_ops_for_renaming (stmt);
1073 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1075 bsi = bsi_last (clsn_data->load_bb);
1076 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
1077 stmt = build_gimple_modify_stmt (elt->new_name,
1078 build3 (COMPONENT_REF, type, load_struct,
1079 elt->field, NULL_TREE));
1080 SSA_NAME_DEF_STMT (elt->new_name) = stmt;
1081 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1083 return 1;
1086 /* Moves all the variables used in LOOP and defined outside of it (including
1087 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1088 name) to a structure created for this purpose. The code
1090 while (1)
1092 use (a);
1093 use (b);
1096 is transformed this way:
1098 bb0:
1099 old.a = a;
1100 old.b = b;
1102 bb1:
1103 a' = new->a;
1104 b' = new->b;
1105 while (1)
1107 use (a');
1108 use (b');
1111 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
1112 pointer `new' is intentionally not initialized (the loop will be split to a
1113 separate function later, and `new' will be initialized from its arguments).
1114 LD_ST_DATA holds information about the shared data structure used to pass
1115 information among the threads. It is initialized here, and
1116 gen_parallel_loop will pass it to create_call_for_reduction that
1117 needs this information. REDUCTION_LIST describes the reductions
1118 in LOOP. */
1120 static void
1121 separate_decls_in_loop (struct loop *loop, htab_t reduction_list,
1122 tree * arg_struct, tree * new_arg_struct,
1123 struct clsn_data *ld_st_data)
1126 basic_block bb1 = split_edge (loop_preheader_edge (loop));
1127 basic_block bb0 = single_pred (bb1);
1128 htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
1129 name_to_copy_elt_eq, free);
1130 htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
1131 free);
1132 basic_block bb, *body = get_loop_body (loop);
1133 unsigned i;
1134 tree phi, type, type_name, nvar;
1135 block_stmt_iterator bsi;
1136 struct clsn_data clsn_data;
1138 /* Find and rename the ssa names defined outside of loop. */
1139 for (i = 0; i < loop->num_nodes; i++)
1141 bb = body[i];
1143 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1144 separate_decls_in_loop_stmt (loop, phi, name_copies, decl_copies);
1146 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
1147 separate_decls_in_loop_stmt (loop, bsi_stmt (bsi), name_copies,
1148 decl_copies);
1150 free (body);
1152 if (htab_elements (name_copies) == 0)
1154 /* It may happen that there is nothing to copy (if there are only
1155 loop carried and external variables in the loop). */
1156 *arg_struct = NULL;
1157 *new_arg_struct = NULL;
1159 else
1161 /* Create the type for the structure to store the ssa names to. */
1162 type = lang_hooks.types.make_type (RECORD_TYPE);
1163 type_name = build_decl (TYPE_DECL, create_tmp_var_name (".paral_data"),
1164 type);
1165 TYPE_NAME (type) = type_name;
1167 htab_traverse (name_copies, add_field_for_name, type);
1168 if (htab_elements (reduction_list) > 0)
1170 /* Create the fields for reductions. */
1171 htab_traverse (reduction_list, add_field_for_reduction,
1172 type);
1174 layout_type (type);
1176 /* Create the loads and stores. */
1177 *arg_struct = create_tmp_var (type, ".paral_data_store");
1178 add_referenced_var (*arg_struct);
1179 nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1180 add_referenced_var (nvar);
1181 *new_arg_struct = make_ssa_name (nvar, NULL_TREE);
1183 ld_st_data->store = *arg_struct;
1184 ld_st_data->load = *new_arg_struct;
1185 ld_st_data->store_bb = bb0;
1186 ld_st_data->load_bb = bb1;
1188 htab_traverse (name_copies, create_loads_and_stores_for_name,
1189 ld_st_data);
1191 /* Load the calculation from memory (after the join of the threads). */
1193 if (htab_elements (reduction_list) > 0)
1195 htab_traverse (reduction_list, create_stores_for_reduction,
1196 ld_st_data);
1197 clsn_data.load = make_ssa_name (nvar, NULL_TREE);
1198 clsn_data.load_bb = single_dom_exit (loop)->dest;
1199 clsn_data.store = ld_st_data->store;
1200 create_final_loads_for_reduction (reduction_list, &clsn_data);
1204 htab_delete (decl_copies);
1205 htab_delete (name_copies);
1208 /* Bitmap containing uids of functions created by parallelization. We cannot
1209 allocate it from the default obstack, as it must live across compilation
1210 of several functions; we make it gc allocated instead. */
1212 static GTY(()) bitmap parallelized_functions;
1214 /* Returns true if FN was created by create_loop_fn. */
1216 static bool
1217 parallelized_function_p (tree fn)
1219 if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
1220 return false;
1222 return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
1225 /* Creates and returns an empty function that will receive the body of
1226 a parallelized loop. */
1228 static tree
1229 create_loop_fn (void)
1231 char buf[100];
1232 char *tname;
1233 tree decl, type, name, t;
1234 struct function *act_cfun = cfun;
1235 static unsigned loopfn_num;
1237 snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1238 ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1239 clean_symbol_name (tname);
1240 name = get_identifier (tname);
1241 type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1243 decl = build_decl (FUNCTION_DECL, name, type);
1244 if (!parallelized_functions)
1245 parallelized_functions = BITMAP_GGC_ALLOC ();
1246 bitmap_set_bit (parallelized_functions, DECL_UID (decl));
1248 TREE_STATIC (decl) = 1;
1249 TREE_USED (decl) = 1;
1250 DECL_ARTIFICIAL (decl) = 1;
1251 DECL_IGNORED_P (decl) = 0;
1252 TREE_PUBLIC (decl) = 0;
1253 DECL_UNINLINABLE (decl) = 1;
1254 DECL_EXTERNAL (decl) = 0;
1255 DECL_CONTEXT (decl) = NULL_TREE;
1256 DECL_INITIAL (decl) = make_node (BLOCK);
1258 t = build_decl (RESULT_DECL, NULL_TREE, void_type_node);
1259 DECL_ARTIFICIAL (t) = 1;
1260 DECL_IGNORED_P (t) = 1;
1261 DECL_RESULT (decl) = t;
1263 t = build_decl (PARM_DECL, get_identifier (".paral_data_param"),
1264 ptr_type_node);
1265 DECL_ARTIFICIAL (t) = 1;
1266 DECL_ARG_TYPE (t) = ptr_type_node;
1267 DECL_CONTEXT (t) = decl;
1268 TREE_USED (t) = 1;
1269 DECL_ARGUMENTS (decl) = t;
1271 allocate_struct_function (decl, false);
1273 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1274 it. */
1275 set_cfun (act_cfun);
1277 return decl;
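/* Conceptually, the function created above has a signature along the
   lines of

     static void foo.$loopfn.0 (void *.paral_data_param);

   where "foo" stands for the function currently being compiled; the exact
   spelling of the name depends on ASM_FORMAT_PRIVATE_NAME and is therefore
   only an approximation here.  */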
1280 /* Bases all the induction variables in LOOP on a single induction variable
1281 (unsigned with base 0 and step 1), whose final value is compared with
1282 NIT. The induction variable is incremented in the loop latch.
1283 REDUCTION_LIST describes the reductions in LOOP. */
1285 static void
1286 canonicalize_loop_ivs (struct loop *loop, htab_t reduction_list, tree nit)
1288 unsigned precision = TYPE_PRECISION (TREE_TYPE (nit));
1289 tree phi, prev, res, type, var_before, val, atype, mtype, t, next;
1290 block_stmt_iterator bsi;
1291 bool ok;
1292 affine_iv iv;
1293 edge exit = single_dom_exit (loop);
1294 struct reduction_info *red;
1296 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
1298 res = PHI_RESULT (phi);
1300 if (is_gimple_reg (res) && TYPE_PRECISION (TREE_TYPE (res)) > precision)
1301 precision = TYPE_PRECISION (TREE_TYPE (res));
1304 type = lang_hooks.types.type_for_size (precision, 1);
1306 bsi = bsi_last (loop->latch);
1307 create_iv (build_int_cst_type (type, 0), build_int_cst (type, 1), NULL_TREE,
1308 loop, &bsi, true, &var_before, NULL);
1310 bsi = bsi_after_labels (loop->header);
1311 prev = NULL;
1312 for (phi = phi_nodes (loop->header); phi; phi = next)
1314 next = PHI_CHAIN (phi);
1315 res = PHI_RESULT (phi);
1317 if (!is_gimple_reg (res) || res == var_before)
1319 prev = phi;
1320 continue;
1323 ok = simple_iv (loop, phi, res, &iv, true);
1324 red = reduction_phi (reduction_list, phi);
1325 /* We preserve the reduction phi nodes. */
1326 if (!ok && red)
1328 prev = phi;
1329 continue;
1331 else
1332 gcc_assert (ok);
1333 remove_phi_node (phi, prev, false);
1335 atype = TREE_TYPE (res);
1336 mtype = POINTER_TYPE_P (atype) ? sizetype : atype;
1337 val = fold_build2 (MULT_EXPR, mtype, unshare_expr (iv.step),
1338 fold_convert (mtype, var_before));
1339 val = fold_build2 (POINTER_TYPE_P (atype)
1340 ? POINTER_PLUS_EXPR : PLUS_EXPR,
1341 atype, unshare_expr (iv.base), val);
1342 val = force_gimple_operand_bsi (&bsi, val, false, NULL_TREE, true,
1343 BSI_SAME_STMT);
1344 t = build_gimple_modify_stmt (res, val);
1345 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1346 SSA_NAME_DEF_STMT (res) = t;
1349 t = last_stmt (exit->src);
1350 /* Make the loop exit if the control condition is not satisfied. */
1351 if (exit->flags & EDGE_TRUE_VALUE)
1353 edge te, fe;
1355 extract_true_false_edges_from_block (exit->src, &te, &fe);
1356 te->flags = EDGE_FALSE_VALUE;
1357 fe->flags = EDGE_TRUE_VALUE;
1359 COND_EXPR_COND (t) = build2 (LT_EXPR, boolean_type_node, var_before, nit);
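/* Illustrative example (SSA names are hypothetical): for an induction
   variable described by the phi node

     j_1 = PHI <40(preheader), j_5(latch)>

   with step 4, the phi node is removed and the loop header instead computes

     j_1 = 40 + 4 * ivtmp_3;

   where ivtmp_3 is the canonical induction variable created above (base 0,
   step 1), and the exit condition is rewritten as ivtmp_3 < nit.  */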
1362 /* Moves the exit condition of LOOP to the beginning of its header, and
1363 duplicates the part of the last iteration that gets disabled to the
1364 exit of the loop. NIT is the number of iterations of the loop
1365 (used to initialize the variables in the duplicated part).
1367    TODO: the common case is that the latch of the loop is empty and immediately
1368 follows the loop exit. In this case, it would be better not to copy the
1369 body of the loop, but only move the entry of the loop directly before the
1370 exit check and increase the number of iterations of the loop by one.
1371 This may need some additional preconditioning in case NIT = ~0.
1372 REDUCTION_LIST describes the reductions in LOOP. */
1374 static void
1375 transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit)
1377 basic_block *bbs, *nbbs, ex_bb, orig_header;
1378 unsigned n;
1379 bool ok;
1380 edge exit = single_dom_exit (loop), hpred;
1381 tree phi, nphi, cond, control, control_name, res, t, cond_stmt;
1382 block_stmt_iterator bsi;
1384 split_block_after_labels (loop->header);
1385 orig_header = single_succ (loop->header);
1386 hpred = single_succ_edge (loop->header);
1388 cond_stmt = last_stmt (exit->src);
1389 cond = COND_EXPR_COND (cond_stmt);
1390 control = TREE_OPERAND (cond, 0);
1391 gcc_assert (TREE_OPERAND (cond, 1) == nit);
1393 /* Make sure that we have phi nodes on exit for all loop header phis
1394 (create_parallel_loop requires that). */
1395 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
1397 res = PHI_RESULT (phi);
1398 t = make_ssa_name (SSA_NAME_VAR (res), phi);
1399 SET_PHI_RESULT (phi, t);
1401 nphi = create_phi_node (res, orig_header);
1402 SSA_NAME_DEF_STMT (res) = nphi;
1403 add_phi_arg (nphi, t, hpred);
1405 if (res == control)
1407 TREE_OPERAND (cond, 0) = t;
1408 update_stmt (cond_stmt);
1409 control = t;
1413 bbs = get_loop_body_in_dom_order (loop);
1414 for (n = 0; bbs[n] != exit->src; n++)
1415 continue;
1416 nbbs = XNEWVEC (basic_block, n);
1417 ok = tree_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1418 bbs + 1, n, nbbs);
1419 gcc_assert (ok);
1420 free (bbs);
1421 ex_bb = nbbs[0];
1422 free (nbbs);
1424 /* Other than reductions, the only gimple reg that should be copied
1425 out of the loop is the control variable. */
1427 control_name = NULL_TREE;
1428 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
1430 res = PHI_RESULT (phi);
1431 if (!is_gimple_reg (res))
1432 continue;
1434 /* Check if it is a part of reduction. If it is,
1435          record the phi in the reduction's keep_res field.  The
1436 PHI_RESULT of this phi is the resulting value of the reduction
1437 variable when exiting the loop. */
1439 exit = single_dom_exit (loop);
1441 if (htab_elements (reduction_list) > 0)
1443 struct reduction_info *red;
1445 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1447 red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1448 if (red)
1449 red->keep_res = phi;
1451 else
1452 gcc_assert (control_name == NULL_TREE
1453 && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
1454 control_name = res;
1456 gcc_assert (control_name != NULL_TREE);
1457 phi = SSA_NAME_DEF_STMT (control_name);
1458 remove_phi_node (phi, NULL_TREE, false);
1460 /* Initialize the control variable to NIT. */
1461 bsi = bsi_after_labels (ex_bb);
1462 nit = force_gimple_operand_bsi (&bsi,
1463 fold_convert (TREE_TYPE (control_name), nit),
1464 false, NULL_TREE, false, BSI_SAME_STMT);
1465 t = build_gimple_modify_stmt (control_name, nit);
1466 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
1467 SSA_NAME_DEF_STMT (control_name) = t;
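/* Schematically (illustration only), a loop of the form

     header:  body; if (i < nit) goto header; else goto exit;

   is turned into

     header:  if (i < nit) goto body_bb; else goto exit;
     body_bb: body; goto header;

   with the part of the last iteration that the moved test now skips
   duplicated on the exit, where the control variable is also set to NIT.  */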
1470 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
1471 LOOP_FN and DATA are the arguments of OMP_PARALLEL.
1472 NEW_DATA is the variable that should be initialized from the argument
1473 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
1474 basic block containing OMP_PARALLEL tree. */
1476 static basic_block
1477 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
1478 tree new_data, unsigned n_threads)
1480 block_stmt_iterator bsi;
1481 basic_block bb, paral_bb, for_bb, ex_bb;
1482 tree t, param, res, for_stmt;
1483 tree cvar, cvar_init, initvar, cvar_next, cvar_base, cond, phi, type;
1484 edge exit, nexit, guard, end, e;
1486 /* Prepare the OMP_PARALLEL statement. */
1487 bb = loop_preheader_edge (loop)->src;
1488 paral_bb = single_pred (bb);
1489 bsi = bsi_last (paral_bb);
1491 t = build_omp_clause (OMP_CLAUSE_NUM_THREADS);
1492 OMP_CLAUSE_NUM_THREADS_EXPR (t)
1493 = build_int_cst (integer_type_node, n_threads);
1494 t = build4 (OMP_PARALLEL, void_type_node, NULL_TREE, t, loop_fn, data);
1496 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
1498 /* Initialize NEW_DATA. */
1499 if (data)
1501 bsi = bsi_after_labels (bb);
1503 param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL_TREE);
1504 t = build_gimple_modify_stmt (param, build_fold_addr_expr (data));
1505 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1506 SSA_NAME_DEF_STMT (param) = t;
1508 t = build_gimple_modify_stmt (new_data,
1509 fold_convert (TREE_TYPE (new_data),
1510 param));
1511 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1512 SSA_NAME_DEF_STMT (new_data) = t;
1515 /* Emit OMP_RETURN for OMP_PARALLEL. */
1516 bb = split_loop_exit_edge (single_dom_exit (loop));
1517 bsi = bsi_last (bb);
1518 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
1520 /* Extract data for OMP_FOR. */
1521 gcc_assert (loop->header == single_dom_exit (loop)->src);
1522 cond = COND_EXPR_COND (last_stmt (loop->header));
1524 cvar = TREE_OPERAND (cond, 0);
1525 cvar_base = SSA_NAME_VAR (cvar);
1526 phi = SSA_NAME_DEF_STMT (cvar);
1527 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1528 initvar = make_ssa_name (cvar_base, NULL_TREE);
1529 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
1530 initvar);
1531 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
1533 bsi = bsi_last (loop->latch);
1534 gcc_assert (bsi_stmt (bsi) == SSA_NAME_DEF_STMT (cvar_next));
1535 bsi_remove (&bsi, true);
1537 /* Prepare cfg. */
1538 for_bb = split_edge (loop_preheader_edge (loop));
1539 ex_bb = split_loop_exit_edge (single_dom_exit (loop));
1540 extract_true_false_edges_from_block (loop->header, &nexit, &exit);
1541 gcc_assert (exit == single_dom_exit (loop));
1543 guard = make_edge (for_bb, ex_bb, 0);
1544 single_succ_edge (loop->latch)->flags = 0;
1545 end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
1546 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
1548 res = PHI_RESULT (phi);
1549 gcc_assert (!is_gimple_reg (phi));
1550 t = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));
1551 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_preheader_edge (loop)),
1552 guard);
1553 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_latch_edge (loop)),
1554 end);
1556 e = redirect_edge_and_branch (exit, nexit->dest);
1557 PENDING_STMT (e) = NULL;
1559 /* Emit OMP_FOR. */
1560 TREE_OPERAND (cond, 0) = cvar_base;
1561 type = TREE_TYPE (cvar);
1562 t = build_omp_clause (OMP_CLAUSE_SCHEDULE);
1563 OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
1565 for_stmt = make_node (OMP_FOR);
1566 TREE_TYPE (for_stmt) = void_type_node;
1567 OMP_FOR_CLAUSES (for_stmt) = t;
1568 OMP_FOR_INIT (for_stmt) = build_gimple_modify_stmt (initvar, cvar_init);
1569 OMP_FOR_COND (for_stmt) = cond;
1570 OMP_FOR_INCR (for_stmt) = build_gimple_modify_stmt (cvar_base,
1571 build2 (PLUS_EXPR, type,
1572 cvar_base,
1573 build_int_cst
1574 (type, 1)));
1575 OMP_FOR_BODY (for_stmt) = NULL_TREE;
1576 OMP_FOR_PRE_BODY (for_stmt) = NULL_TREE;
1578 bsi = bsi_last (for_bb);
1579 bsi_insert_after (&bsi, for_stmt, BSI_NEW_STMT);
1580 SSA_NAME_DEF_STMT (initvar) = for_stmt;
1582 /* Emit OMP_CONTINUE. */
1583 bsi = bsi_last (loop->latch);
1584 t = build2 (OMP_CONTINUE, void_type_node, cvar_next, cvar);
1585 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
1586 SSA_NAME_DEF_STMT (cvar_next) = t;
1588 /* Emit OMP_RETURN for OMP_FOR. */
1589 bsi = bsi_last (ex_bb);
1590 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
1592 return paral_bb;
1595 /* Generates code to execute the iterations of LOOP in N_THREADS threads in
1596    parallel.  NITER describes the number of iterations of LOOP.
1597    REDUCTION_LIST describes the reductions existing in LOOP.  */
1599 static void
1600 gen_parallel_loop (struct loop *loop, htab_t reduction_list,
1601 unsigned n_threads, struct tree_niter_desc *niter)
1603 struct loop *nloop;
1604 loop_iterator li;
1605 tree many_iterations_cond, type, nit;
1606 tree stmts, arg_struct, new_arg_struct;
1607 basic_block parallel_head;
1608 struct clsn_data clsn_data;
1609 unsigned prob;
1611 /* From
1613 ---------------------------------------------------------------------
1614 loop
1616 IV = phi (INIT, IV + STEP)
1617 BODY1;
1618 if (COND)
1619 break;
1620 BODY2;
1622 ---------------------------------------------------------------------
1624 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1625 we generate the following code:
1627 ---------------------------------------------------------------------
1629 if (MAY_BE_ZERO
1630 || NITER < MIN_PER_THREAD * N_THREADS)
1631 goto original;
1633 BODY1;
1634 store all local loop-invariant variables used in body of the loop to DATA.
1635 OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1636 load the variables from DATA.
1637 OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1638 BODY2;
1639 BODY1;
1640 OMP_CONTINUE;
1641 OMP_RETURN -- OMP_FOR
1642 OMP_RETURN -- OMP_PARALLEL
1643 goto end;
1645 original:
1646 loop
1648 IV = phi (INIT, IV + STEP)
1649 BODY1;
1650 if (COND)
1651 break;
1652 BODY2;
1655  end:
*/
1659 /* Create two versions of the loop -- in the old one, we know that the
1660 number of iterations is large enough, and we will transform it into the
1661      loop that will be split to loop_fn; the new one will be used for the
1662 remaining iterations. */
1664 type = TREE_TYPE (niter->niter);
1665 nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
1666 NULL_TREE);
1667 if (stmts)
1668 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1670 many_iterations_cond =
1671 fold_build2 (GE_EXPR, boolean_type_node,
1672 nit, build_int_cst (type, MIN_PER_THREAD * n_threads));
1673 many_iterations_cond
1674 = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1675 invert_truthvalue (unshare_expr (niter->may_be_zero)),
1676 many_iterations_cond);
1677 many_iterations_cond
1678 = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
1679 if (stmts)
1680 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1681 if (!is_gimple_condexpr (many_iterations_cond))
1683 many_iterations_cond
1684 = force_gimple_operand (many_iterations_cond, &stmts,
1685 true, NULL_TREE);
1686 if (stmts)
1687 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1690 initialize_original_copy_tables ();
1692 /* We assume that the loop usually iterates a lot. */
1693 prob = 4 * REG_BR_PROB_BASE / 5;
1694 nloop = loop_version (loop, many_iterations_cond, NULL,
1695 prob, prob, REG_BR_PROB_BASE - prob, true);
1696 update_ssa (TODO_update_ssa);
1697 free_original_copy_tables ();
1699 /* Base all the induction variables in LOOP on a single control one. */
1700 canonicalize_loop_ivs (loop, reduction_list, nit);
1702 /* Ensure that the exit condition is the first statement in the loop. */
1703 transform_to_exit_first_loop (loop, reduction_list, nit);
1706   /* Generate initializations for reductions.  */
1708 if (htab_elements (reduction_list) > 0)
1709 htab_traverse (reduction_list, initialize_reductions, loop);
1711 /* Eliminate the references to local variables from the loop. */
1712 eliminate_local_variables (loop);
1714 /* In the old loop, move all variables non-local to the loop to a structure
1715      and back, and create separate decls for the variables used in the loop.  */
1716 separate_decls_in_loop (loop, reduction_list, &arg_struct, &new_arg_struct, &clsn_data);
1718 /* Create the parallel constructs. */
1719 parallel_head = create_parallel_loop (loop, create_loop_fn (), arg_struct,
1720 new_arg_struct, n_threads);
1721 if (htab_elements (reduction_list) > 0)
1722 create_call_for_reduction (loop, reduction_list, &clsn_data);
1724 scev_reset ();
1726 /* Cancel the loop (it is simpler to do it here rather than to teach the
1727 expander to do it). */
1728 cancel_loop_tree (loop);
1730 /* Free loop bound estimations that could contain references to
1731 removed statements. */
1732 FOR_EACH_LOOP (li, loop, 0)
1733 free_numbers_of_iterations_estimates_loop (loop);
1735 /* Expand the parallel constructs. We do it directly here instead of running
1736 a separate expand_omp pass, since it is more efficient, and less likely to
1737 cause troubles with further analyses not being able to deal with the
1738 OMP trees. */
1740 omp_expand_local (parallel_head);
1743 /* Detect parallel loops and generate parallel code using libgomp
1744 primitives. Returns true if some loop was parallelized, false
1745 otherwise. */
1747 bool
1748 parallelize_loops (void)
1750 unsigned n_threads = flag_tree_parallelize_loops;
1751 bool changed = false;
1752 struct loop *loop;
1753 struct tree_niter_desc niter_desc;
1754 loop_iterator li;
1755 htab_t reduction_list;
1757 /* Do not parallelize loops in the functions created by parallelization. */
1758 if (parallelized_function_p (cfun->decl))
1759 return false;
1761 reduction_list = htab_create (10, reduction_info_hash,
1762 reduction_info_eq, free);
1764 FOR_EACH_LOOP (li, loop, 0)
1766 htab_empty (reduction_list);
1767 if (/* Do not bother with loops in cold areas. */
1768 !maybe_hot_bb_p (loop->header)
1769 /* Or loops that roll too little. */
1770 || expected_loop_iterations (loop) <= n_threads
1771 /* And of course, the loop must be parallelizable. */
1772 || !can_duplicate_loop_p (loop)
1773 || loop_has_blocks_with_irreducible_flag (loop)
1774 || !loop_parallel_p (loop, reduction_list, &niter_desc))
1775 continue;
1777 changed = true;
1778 gen_parallel_loop (loop, reduction_list, n_threads, &niter_desc);
1779 verify_flow_info ();
1780 verify_dominators (CDI_DOMINATORS);
1781 verify_loop_structure ();
1782 verify_loop_closed_ssa ();
1785 htab_delete (reduction_list);
1786 return changed;
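/* Usage note: this pass is driven by flag_tree_parallelize_loops, i.e. the
   -ftree-parallelize-loops=N command-line option; its value N is used as
   the number of threads requested above.  */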
1789 #include "gt-tree-parloops.h"