PR c++/30897
[official-gcc.git] / gcc / tree-parloops.c
blob34c4639e570521a0cb276a5525d22349d9a6b65d
1 /* Loop autoparallelization.
2 Copyright (C) 2006 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "rtl.h"
29 #include "tree-flow.h"
30 #include "cfgloop.h"
31 #include "ggc.h"
32 #include "tree-data-ref.h"
33 #include "diagnostic.h"
34 #include "tree-pass.h"
35 #include "tree-scalar-evolution.h"
36 #include "hashtab.h"
37 #include "langhooks.h"
38 #include "tree-vectorizer.h"
40 /* This pass tries to distribute iterations of loops into several threads.
41 The implementation is straightforward -- for each loop we test whether its
42 iterations are independent, and if it is the case (and some additional
43 conditions regarding profitability and correctness are satisfied), we
44 add OMP_PARALLEL and OMP_FOR codes and let omp expansion machinery do
45 its job.
47 The most of the complexity is in bringing the code into shape expected
48 by the omp expanders:
49 -- for OMP_FOR, ensuring that the loop has only one induction variable
50 and that the exit test is at the start of the loop body
51 -- for OMP_PARALLEL, replacing the references to local addressable
52 variables by accesses through pointers, and breaking up ssa chains
53 by storing the values incoming to the parallelized loop to a structure
54 passed to the new function as an argument (something similar is done
55 in omp gimplification, unfortunately only a small part of the code
56 can be shared).
58 TODO:
59 -- if there are several parallelizable loops in a function, it may be
60 possible to generate the threads just once (using synchronization to
61 ensure that cross-loop dependences are obeyed).
62 -- handling of common scalar dependence patterns (accumulation, ...)
63 -- handling of non-innermost loops */
65 /*
66 Reduction handling:
67 currently we use vect_is_simple_reduction() to detect reduction patterns.
68 The code transformation will be introduced by an example.
70 source code:
72 parloop
74 int sum=1;
76 for (i = 0; i < N/1000; i++)
78 x[i] = i + 3;
79 sum+=x[i];
83 gimple code:
85 header_bb:
87 # sum_24 = PHI <sum_14(3), 1(2)>;
88 # i_21 = PHI <i_15(3), 0(2)>;
89 <L0>:;
90 D.2191_10 = i_21 + 3;
91 x[i_21] = D.2191_10;
92 sum_14 = D.2191_10 + sum_24;
93 i_15 = i_21 + 1;
94 if (N_8 > i_15) goto <L0>; else goto <L2>;
96 exit_bb:
98 # sum_25 = PHI <sum_14(3)>;
99 <L2>:;
102 after reduction transformation (only relevant parts):
104 parloop
107 ....
109 <L16>:;
110 D.2241_2 = (unsigned int) N_8;
111 D.2242_26 = D.2241_2 - 1;
112 if (D.2242_26 > 399) goto <L26>; else goto <L27>;
114 #two new variables are created for each reduction:
115 "reduction" is the variable holding the neutral element
116 for the particular operation, e.g. 0 for PLUS_EXPR,
117 1 for MULT_EXPR, etc.
118 "reduction_initial" is the initial value given by the user.
119 It is kept and will be used after the parallel computing
120 is done.#
122 <L26>:;
123 reduction.38_42 = 0;
124 reduction_initial.39_43 = 1;
125 x.40_44 = &x;
126 .paral_data_store.47.D.2261 = D.2242_26;
127 .paral_data_store.47.reduction.38 = reduction.38_42;
128 .paral_data_store.47.x.40 = x.40_44;
129 __builtin_GOMP_parallel_start (parloop._loopfn.0, &.paral_data_store.47, 4);
130 parloop._loopfn.0 (&.paral_data_store.47);
131 __builtin_GOMP_parallel_end ();
133 # collecting the result after the join of the threads is done at
134 create_loads_for_reductions().
135 a new variable "reduction_final" is created. It calculates the
136 final value from the initial value and the value computed by
137 the threads. #
139 .paral_data_load.48_49 = &.paral_data_store.47;
140 reduction_final.49_50 = .paral_data_load.48_49->reduction.38;
141 reduction_final.49_51 = reduction_initial.39_43 + reduction_final.49_50;
142 ivtmp.37_36 = D.2242_26;
143 i_37 = (int) ivtmp.37_36;
144 D.2191_38 = i_37 + 3;
145 x[i_37] = D.2191_38;
146 sum_40 = D.2191_38 + reduction_final.49_51;
147 i_41 = i_37 + 1;
148 goto <bb 8> (<L2>);
150 # sum_25 = PHI <sum_40(4), sum_9(6)>;
151 <L2>:;
152 printf (&"sum is %d\n"[0], sum_25);
158 parloop._loopfn.0 (.paral_data_param)
162 <L28>:;
163 .paral_data_param_52 = .paral_data_param_75;
164 .paral_data_load.48_48 = (struct .paral_data.46 *) .paral_data_param_52;
165 D.2289_46 = .paral_data_load.48_48->D.2261;
166 reduction.43_45 = .paral_data_load.48_48->reduction.38;
167 x.45_47 = .paral_data_load.48_48->x.40;
168 # SUCC: 23 [100.0%] (fallthru)
170 # BLOCK 23
171 # PRED: 21 [100.0%] (fallthru)
172 <L30>:;
173 D.2292_60 = __builtin_omp_get_num_threads ();
174 D.2293_61 = (unsigned int) D.2292_60;
175 D.2294_62 = __builtin_omp_get_thread_num ();
176 D.2295_63 = (unsigned int) D.2294_62;
177 D.2296_64 = D.2289_46 / D.2293_61;
178 D.2297_65 = D.2293_61 * D.2296_64;
179 D.2298_66 = D.2297_65 != D.2289_46;
180 D.2299_67 = D.2296_64 + D.2298_66;
181 D.2300_68 = D.2299_67 * D.2295_63;
182 D.2301_69 = D.2299_67 + D.2300_68;
183 D.2302_70 = MIN_EXPR <D.2301_69, D.2289_46>;
184 ivtmp.41_54 = D.2300_68;
185 if (D.2300_68 >= D.2302_70) goto <L31>; else goto <L32>;
186 # SUCC: 26 [100.0%] (false) 24 (true)
188 # BLOCK 26
189 # PRED: 23 [100.0%] (false)
190 <L32>:;
191 # SUCC: 4 [100.0%] (fallthru)
193 # BLOCK 4
194 # PRED: 5 [100.0%] (true) 26 [100.0%] (fallthru)
195 # ivtmp.41_31 = PHI <ivtmp.41_30(5), ivtmp.41_54(26)>;
196 # sum.42_32 = PHI <sum.42_14(5), reduction.43_45(26)>;
197 <L0>:;
198 # SUCC: 19 [100.0%] (fallthru)
200 # BLOCK 19
201 # PRED: 4 [100.0%] (fallthru)
202 # sum.42_24 = PHI <sum.42_32(4)>;
203 # ivtmp.41_17 = PHI <ivtmp.41_31(4)>;
204 i.44_21 = (int) ivtmp.41_17;
205 D.2310_10 = i.44_21 + 3;
206 (*x.45_47)[i.44_21] = D.2310_10;
207 sum.42_14 = D.2310_10 + sum.42_24;
208 i.44_15 = i.44_21 + 1;
209 # SUCC: 5 [100.0%] (fallthru)
211 # BLOCK 5
212 # PRED: 19 [100.0%] (fallthru)
213 <L17>:;
214 ivtmp.41_30 = ivtmp.41_31 + 1;
215 if (ivtmp.41_30 < D.2302_70) goto <L0>; else goto <L31>;
216 # SUCC: 4 [100.0%] (true) 24 (false)
218 # Adding this reduction phi is done at
219 create_phi_for_local_result() #
221 # BLOCK 24
222 # PRED: 5 (false) 23 (true)
223 # reduction.38_56 = PHI <sum.42_14(5), 0(23)>;
224 <L31>:;
225 __builtin_GOMP_barrier ();
226 # SUCC: 25 [100.0%] (fallthru)
228 # Creating the atomic operation is
229 done at create_call_for_reduction_1() #
231 # BLOCK 25
232 # PRED: 24 [100.0%] (fallthru)
233 D.2306_57 = &.paral_data_load.48_48->reduction.38;
234 D.2307_58 = (unsigned int) reduction.38_56;
235 D.2308_59 = __sync_fetch_and_add_4 (D.2306_57, D.2307_58);
236 # SUCC: 22 [100.0%] (fallthru)
238 # BLOCK 22
239 # PRED: 25 [100.0%] (fallthru)
240 <L29>:;
241 return;
242 # SUCC: EXIT
248 /* Minimal number of iterations of a loop that should be executed in each
249 thread. */
250 #define MIN_PER_THREAD 100
252 /* Element of the hashtable, representing a
253 reduction in the current loop. */
254 struct reduction_info
256 tree reduc_stmt; /* reduction statement. */
257 tree reduc_phi; /* The phi node defining the reduction. */
258 enum tree_code reduction_code; /* code for the reduction operation. */
259 tree keep_res; /* The PHI_RESULT of this phi is the resulting value
260 of the reduction variable when existing the loop. */
261 tree initial_value; /* An ssa name representing a new variable holding
262 the initial value of the reduction var before entering the loop. */
263 tree field; /* the name of the field in the parloop data structure intended for reduction. */
264 tree reduction_init; /* An ssa name representing a new variable which will be
265 assigned the proper reduction initialization value (init). */
266 tree init; /* reduction initialization value. */
267 tree new_phi; /* (helper field) Newly created phi node whose result
268 will be passed to the atomic operation. Represents
269 the local result each thread computed for the reduction
270 operation. */
273 /* Equality and hash functions for hashtab code. */
275 static int
276 reduction_info_eq (const void *aa, const void *bb)
278 const struct reduction_info *a = (const struct reduction_info *) aa;
279 const struct reduction_info *b = (const struct reduction_info *) bb;
281 return (a->reduc_phi == b->reduc_phi);
284 static hashval_t
285 reduction_info_hash (const void *aa)
287 const struct reduction_info *a = (const struct reduction_info *) aa;
289 return htab_hash_pointer (a->reduc_phi);
292 static struct reduction_info *
293 reduction_phi (htab_t reduction_list, tree phi)
295 struct reduction_info tmpred, *red;
297 if (htab_elements (reduction_list) == 0)
298 return NULL;
300 tmpred.reduc_phi = phi;
301 red = htab_find (reduction_list, &tmpred);
303 return red;
306 /* Element of hashtable of names to copy. */
308 struct name_to_copy_elt
310 unsigned version; /* The version of the name to copy. */
311 tree new_name; /* The new name used in the copy. */
312 tree field; /* The field of the structure used to pass the
313 value. */
316 /* Equality and hash functions for hashtab code. */
318 static int
319 name_to_copy_elt_eq (const void *aa, const void *bb)
321 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
322 const struct name_to_copy_elt *b = (const struct name_to_copy_elt *) bb;
324 return a->version == b->version;
327 static hashval_t
328 name_to_copy_elt_hash (const void *aa)
330 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
332 return (hashval_t) a->version;
335 /* Returns true if the iterations of LOOP are independent on each other (that
336 is, if we can execute them in parallel), and if LOOP satisfies other
337 conditions that we need to be able to parallelize it. Description of number
338 of iterations is stored to NITER. Reduction analysis is done, if
339 reductions are found, they are inserted to the REDUCTION_LIST. */
341 static bool
342 loop_parallel_p (struct loop *loop, htab_t reduction_list, struct tree_niter_desc *niter)
344 edge exit = single_dom_exit (loop);
345 VEC (ddr_p, heap) * dependence_relations;
346 VEC (data_reference_p, heap) * datarefs;
347 lambda_trans_matrix trans;
348 bool ret = false;
349 tree phi;
350 loop_vec_info simple_loop_info;
352 /* Only consider innermost loops with just one exit. The innermost-loop
353 restriction is not necessary, but it makes things simpler. */
354 if (loop->inner || !exit)
355 return false;
357 if (dump_file && (dump_flags & TDF_DETAILS))
358 fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
360 /* We need to know # of iterations, and there should be no uses of values
361 defined inside loop outside of it, unless the values are invariants of
362 the loop. */
363 if (!number_of_iterations_exit (loop, exit, niter, false))
365 if (dump_file && (dump_flags & TDF_DETAILS))
366 fprintf (dump_file, " FAILED: number of iterations not known\n");
367 return false;
370 simple_loop_info = vect_analyze_loop_form (loop);
372 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
374 tree reduc_stmt = NULL, operation;
376 /* ??? TODO: Change this into a generic function that
377 recognizes reductions. */
378 if (!is_gimple_reg (PHI_RESULT (phi)))
379 continue;
380 if (simple_loop_info)
381 reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi);
383 /* Create a reduction_info struct, initialize it and insert it to
384 the reduction list. */
386 if (reduc_stmt)
388 PTR *slot;
389 struct reduction_info *new_reduction;
391 if (dump_file && (dump_flags & TDF_DETAILS))
393 fprintf (dump_file,
394 "Detected reduction. reduction stmt is: \n");
395 print_generic_stmt (dump_file, reduc_stmt, 0);
396 fprintf (dump_file, "\n");
399 new_reduction = XCNEW (struct reduction_info);
401 new_reduction->reduc_stmt = reduc_stmt;
402 new_reduction->reduc_phi = phi;
403 operation = GIMPLE_STMT_OPERAND (reduc_stmt, 1);
404 new_reduction->reduction_code = TREE_CODE (operation);
405 slot = htab_find_slot (reduction_list, new_reduction, INSERT);
406 *slot = new_reduction;
410 for (phi = phi_nodes (exit->dest); phi; phi = PHI_CHAIN (phi))
412 struct reduction_info *red;
413 imm_use_iterator imm_iter;
414 use_operand_p use_p;
415 tree reduc_phi;
417 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
419 if (is_gimple_reg (val))
421 if (dump_file && (dump_flags & TDF_DETAILS))
423 fprintf (dump_file, "phi is ");
424 print_generic_expr (dump_file, phi, 0);
425 fprintf (dump_file, "arg of phi to exit: value ");
426 print_generic_expr (dump_file, val, 0);
427 fprintf (dump_file, " used outside loop\n");
428 fprintf (dump_file,
429 " checking if it a part of reduction pattern: \n");
431 if (htab_elements (reduction_list) == 0)
433 if (dump_file && (dump_flags & TDF_DETAILS))
434 fprintf (dump_file,
435 " FAILED: it is not a part of reduction.\n");
436 return false;
438 reduc_phi = NULL;
439 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
441 if (flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
443 reduc_phi = USE_STMT (use_p);
444 break;
447 red = reduction_phi (reduction_list, reduc_phi);
448 if (red == NULL)
450 if (dump_file && (dump_flags & TDF_DETAILS))
451 fprintf (dump_file,
452 " FAILED: it is not a part of reduction.\n");
453 return false;
455 if (dump_file && (dump_flags & TDF_DETAILS))
457 fprintf (dump_file, "reduction phi is ");
458 print_generic_expr (dump_file, red->reduc_phi, 0);
459 fprintf (dump_file, "reduction stmt is ");
460 print_generic_expr (dump_file, red->reduc_stmt, 0);
466 /* The iterations of the loop may communicate only through bivs whose
467 iteration space can be distributed efficiently. */
468 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
470 tree def = PHI_RESULT (phi);
471 affine_iv iv;
473 if (is_gimple_reg (def) && !simple_iv (loop, phi, def, &iv, true))
475 struct reduction_info *red;
477 red = reduction_phi (reduction_list, phi);
478 if (red == NULL)
480 if (dump_file && (dump_flags & TDF_DETAILS))
481 fprintf (dump_file,
482 " FAILED: scalar dependency between iterations\n");
483 return false;
488 /* We need to version the loop to verify assumptions in runtime. */
489 if (!can_duplicate_loop_p (loop))
491 if (dump_file && (dump_flags & TDF_DETAILS))
492 fprintf (dump_file, " FAILED: cannot be duplicated\n");
493 return false;
496 /* Check for problems with dependences. If the loop can be reversed,
497 the iterations are independent. */
498 datarefs = VEC_alloc (data_reference_p, heap, 10);
499 dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
500 compute_data_dependences_for_loop (loop, true, &datarefs,
501 &dependence_relations);
502 if (dump_file && (dump_flags & TDF_DETAILS))
503 dump_data_dependence_relations (dump_file, dependence_relations);
505 trans = lambda_trans_matrix_new (1, 1);
506 LTM_MATRIX (trans)[0][0] = -1;
508 if (lambda_transform_legal_p (trans, 1, dependence_relations))
510 ret = true;
511 if (dump_file && (dump_flags & TDF_DETAILS))
512 fprintf (dump_file, " SUCCESS: may be parallelized\n");
514 else if (dump_file && (dump_flags & TDF_DETAILS))
515 fprintf (dump_file,
516 " FAILED: data dependencies exist across iterations\n");
518 free_dependence_relations (dependence_relations);
519 free_data_refs (datarefs);
521 return ret;
524 /* Assigns the address of VAR in TYPE to an ssa name, and returns this name.
525 The assignment statement is placed before LOOP. DECL_ADDRESS maps decls
526 to their addresses that can be reused. */
528 static tree
529 take_address_of (tree var, tree type, struct loop *loop, htab_t decl_address)
531 int uid = DECL_UID (var);
532 void **dslot;
533 struct int_tree_map ielt, *nielt;
534 tree name, bvar, stmt;
535 edge entry = loop_preheader_edge (loop);
537 ielt.uid = uid;
538 dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
539 if (!*dslot)
541 bvar = create_tmp_var (type, get_name (var));
542 add_referenced_var (bvar);
543 stmt = build_gimple_modify_stmt (bvar,
544 fold_convert (type,
545 build_addr (var,
546 current_function_decl)));
547 name = make_ssa_name (bvar, stmt);
548 GIMPLE_STMT_OPERAND (stmt, 0) = name;
549 bsi_insert_on_edge_immediate (entry, stmt);
551 nielt = XNEW (struct int_tree_map);
552 nielt->uid = uid;
553 nielt->to = name;
554 *dslot = nielt;
556 return name;
559 name = ((struct int_tree_map *) *dslot)->to;
560 if (TREE_TYPE (name) == type)
561 return name;
563 bvar = SSA_NAME_VAR (name);
564 stmt = build_gimple_modify_stmt (bvar, fold_convert (type, name));
565 name = make_ssa_name (bvar, stmt);
566 GIMPLE_STMT_OPERAND (stmt, 0) = name;
567 bsi_insert_on_edge_immediate (entry, stmt);
569 return name;
572 /* Callback for htab_traverse. Create the initialization statement
573 for reduction described in SLOT, and place it at the preheader of
574 the loop described in DATA. */
576 static int
577 initialize_reductions (void **slot, void *data)
579 tree t, stmt;
580 tree init, c;
581 tree name, name1;
582 tree bvar, type, arg;
583 edge e;
585 struct reduction_info *reduc = *slot;
586 struct loop *loop = (struct loop *) data;
588 /* Create initialization in preheader:
589 reduction_variable = initialization value of reduction. */
591 /* In the phi node at the header, replace the argument coming
592 from the preheader with the reduction initialization value. */
594 /* Create a new variable to initialize the reduction. */
595 type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
596 bvar = create_tmp_var (type, "reduction");
597 add_referenced_var (bvar);
599 c = build_omp_clause (OMP_CLAUSE_REDUCTION);
600 OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
601 OMP_CLAUSE_DECL (c) =
602 SSA_NAME_VAR (GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0));
604 init = omp_reduction_init (c, TREE_TYPE (bvar));
605 reduc->init = init;
607 t = build_gimple_modify_stmt (bvar, init);
608 name = make_ssa_name (bvar, t);
610 GIMPLE_STMT_OPERAND (t, 0) = name;
611 SSA_NAME_DEF_STMT (name) = t;
613 /* Replace the argument
614 representing the initialization value. Keeping the old value
615 in a new variable "reduction_initial", that will be taken in
616 consideration after the parallel computing is done. */
618 e = loop_preheader_edge (loop);
619 arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
620 /* Create new variable to hold the initial value. */
621 type = TREE_TYPE (bvar);
622 bvar = create_tmp_var (type, "reduction_initial");
623 add_referenced_var (bvar);
625 stmt = build_gimple_modify_stmt (bvar, arg);
626 name1 = make_ssa_name (bvar, stmt);
627 GIMPLE_STMT_OPERAND (stmt, 0) = name1;
628 SSA_NAME_DEF_STMT (name1) = stmt;
630 bsi_insert_on_edge_immediate (e, stmt);
631 bsi_insert_on_edge_immediate (e, t);
632 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
633 (reduc->reduc_phi, loop_preheader_edge (loop)), name);
634 reduc->initial_value = name1;
635 reduc->reduction_init = name;
636 return 1;
639 struct elv_data
641 struct loop *loop;
642 htab_t decl_address;
643 bool changed;
646 /* Eliminates references to local variables in *TP out of LOOP. DECL_ADDRESS
647 contains addresses of the references that had their address taken already.
648 If the expression is changed, CHANGED is set to true. Callback for
649 walk_tree. */
651 static tree
652 eliminate_local_variables_1 (tree * tp, int *walk_subtrees, void *data)
654 struct elv_data *dta = data;
655 tree t = *tp, var, addr, addr_type, type;
657 if (DECL_P (t))
659 *walk_subtrees = 0;
661 if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
662 return NULL_TREE;
664 type = TREE_TYPE (t);
665 addr_type = build_pointer_type (type);
666 addr = take_address_of (t, addr_type, dta->loop, dta->decl_address);
667 *tp = build1 (INDIRECT_REF, TREE_TYPE (*tp), addr);
669 dta->changed = true;
670 return NULL_TREE;
673 if (TREE_CODE (t) == ADDR_EXPR)
675 var = TREE_OPERAND (t, 0);
676 if (!DECL_P (var))
677 return NULL_TREE;
679 *walk_subtrees = 0;
680 if (!SSA_VAR_P (var) || DECL_EXTERNAL (var))
681 return NULL_TREE;
683 addr_type = TREE_TYPE (t);
684 addr = take_address_of (var, addr_type, dta->loop, dta->decl_address);
685 *tp = addr;
687 dta->changed = true;
688 return NULL_TREE;
691 if (!EXPR_P (t) && !GIMPLE_STMT_P (t))
692 *walk_subtrees = 0;
694 return NULL_TREE;
697 /* Moves the references to local variables in STMT from LOOP. DECL_ADDRESS
698 contains addresses for the references for that we have already taken
699 them. */
701 static void
702 eliminate_local_variables_stmt (struct loop *loop, tree stmt,
703 htab_t decl_address)
705 struct elv_data dta;
707 dta.loop = loop;
708 dta.decl_address = decl_address;
709 dta.changed = false;
711 walk_tree (&stmt, eliminate_local_variables_1, &dta, NULL);
713 if (dta.changed)
714 update_stmt (stmt);
717 /* Eliminates the references to local variables from LOOP.
718 This includes:
719 1) Taking address of a local variable -- these are moved out of the
720 loop (and temporary variable is created to hold the address if
721 necessary).
722 2) Dereferencing a local variable -- these are replaced with indirect
723 references. */
725 static void
726 eliminate_local_variables (struct loop *loop)
728 basic_block bb, *body = get_loop_body (loop);
729 unsigned i;
730 block_stmt_iterator bsi;
731 htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
732 free);
734 /* Find and rename the ssa names defined outside of loop. */
735 for (i = 0; i < loop->num_nodes; i++)
737 bb = body[i];
739 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
740 eliminate_local_variables_stmt (loop, bsi_stmt (bsi), decl_address);
743 htab_delete (decl_address);
746 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
747 The copies are stored to NAME_COPIES, if NAME was already duplicated,
748 its duplicate stored in NAME_COPIES is returned.
750 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
751 duplicated, storing the copies in DECL_COPIES. */
753 static tree
754 separate_decls_in_loop_name (tree name,
755 htab_t name_copies, htab_t decl_copies,
756 bool copy_name_p)
758 tree copy, var, var_copy;
759 unsigned idx, uid, nuid;
760 struct int_tree_map ielt, *nielt;
761 struct name_to_copy_elt elt, *nelt;
762 void **slot, **dslot;
764 if (TREE_CODE (name) != SSA_NAME)
765 return name;
767 idx = SSA_NAME_VERSION (name);
768 elt.version = idx;
769 slot = htab_find_slot_with_hash (name_copies, &elt, idx,
770 copy_name_p ? INSERT : NO_INSERT);
771 if (slot && *slot)
772 return ((struct name_to_copy_elt *) *slot)->new_name;
774 var = SSA_NAME_VAR (name);
775 uid = DECL_UID (var);
776 ielt.uid = uid;
777 dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
778 if (!*dslot)
780 var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
781 add_referenced_var (var_copy);
782 nielt = XNEW (struct int_tree_map);
783 nielt->uid = uid;
784 nielt->to = var_copy;
785 *dslot = nielt;
787 /* Ensure that when we meet this decl next time, we won't duplicate
788 it again. */
789 nuid = DECL_UID (var_copy);
790 ielt.uid = nuid;
791 dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
792 gcc_assert (!*dslot);
793 nielt = XNEW (struct int_tree_map);
794 nielt->uid = nuid;
795 nielt->to = var_copy;
796 *dslot = nielt;
798 else
799 var_copy = ((struct int_tree_map *) *dslot)->to;
801 if (copy_name_p)
803 copy = duplicate_ssa_name (name, NULL_TREE);
804 nelt = XNEW (struct name_to_copy_elt);
805 nelt->version = idx;
806 nelt->new_name = copy;
807 nelt->field = NULL_TREE;
808 *slot = nelt;
810 else
812 gcc_assert (!slot);
813 copy = name;
816 SSA_NAME_VAR (copy) = var_copy;
817 return copy;
820 /* Finds the ssa names used in STMT that are defined outside of LOOP and
821 replaces such ssa names with their duplicates. The duplicates are stored to
822 NAME_COPIES. Base decls of all ssa names used in STMT
823 (including those defined in LOOP) are replaced with the new temporary
824 variables; the replacement decls are stored in DECL_COPIES. */
826 static void
827 separate_decls_in_loop_stmt (struct loop *loop, tree stmt,
828 htab_t name_copies, htab_t decl_copies)
830 use_operand_p use;
831 def_operand_p def;
832 ssa_op_iter oi;
833 tree name, copy;
834 bool copy_name_p;
836 mark_virtual_ops_for_renaming (stmt);
838 FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
840 name = DEF_FROM_PTR (def);
841 gcc_assert (TREE_CODE (name) == SSA_NAME);
842 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
843 false);
844 gcc_assert (copy == name);
847 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
849 name = USE_FROM_PTR (use);
850 if (TREE_CODE (name) != SSA_NAME)
851 continue;
853 copy_name_p = expr_invariant_in_loop_p (loop, name);
854 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
855 copy_name_p);
856 SET_USE (use, copy);
860 /* A helper structure for passing the TYPE and REDUCTION_LIST
861 to the DATA parameter of add_field_for_name. */
862 struct data_arg
864 tree type;
865 htab_t reduction_list;
868 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
869 described in SLOT. The type is passed in DATA. The Reduction list
870 is also passes in DATA. */
872 static int
873 add_field_for_name (void **slot, void *data)
875 tree stmt;
876 use_operand_p use_p = NULL;
878 struct name_to_copy_elt *elt = *slot;
879 struct data_arg *data_arg = (struct data_arg *) data;
880 tree type = data_arg->type;
881 tree name = ssa_name (elt->version);
882 tree var = SSA_NAME_VAR (name);
883 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
885 insert_field_into_struct (type, field);
886 elt->field = field;
888 /* Find uses of name to determine if this name is related to
889 a reduction phi, and if so, record the field in the reduction struct. */
891 if ((htab_elements (data_arg->reduction_list) > 0)
892 && single_imm_use (elt->new_name, &use_p, &stmt)
893 && TREE_CODE (stmt) == PHI_NODE)
895 /* check if STMT is a REDUC_PHI of some reduction. */
896 struct reduction_info *red;
898 red = reduction_phi (data_arg->reduction_list ,stmt);
899 if (red)
900 red->field = field;
903 return 1;
906 /* Callback for htab_traverse. A local result is the intermediate result
907 computed by a single
908 thread, or the intial value in case no iteration was executed.
909 This function creates a phi node reflecting these values.
910 The phi's result will be stored in NEW_PHI field of the
911 reduction's data structure. */
913 static int
914 create_phi_for_local_result (void **slot, void *data)
916 struct reduction_info *reduc = *slot;
917 struct loop *loop = data;
918 edge e;
919 tree new_phi;
920 basic_block store_bb;
921 tree local_res;
923 /* STORE_BB is the block where the phi
924 should be stored. It is the destination of the loop exit.
925 (Find the fallthru edge from OMP_CONTINUE). */
926 store_bb = FALLTHRU_EDGE (loop->latch)->dest;
928 /* STORE_BB has two predecessors. One coming from the loop
929 (the reduction's result is computed at the loop),
930 and another coming from a block preceding the loop,
931 when no iterations
932 are executed (the initial value should be taken). */
933 if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch))
934 e = EDGE_PRED (store_bb, 1);
935 else
936 e = EDGE_PRED (store_bb, 0);
937 local_res = make_ssa_name (SSA_NAME_VAR (reduc->reduction_init), NULL_TREE);
938 new_phi = create_phi_node (local_res, store_bb);
939 SSA_NAME_DEF_STMT (local_res) = new_phi;
940 add_phi_arg (new_phi, reduc->init, e);
941 add_phi_arg (new_phi, GIMPLE_STMT_OPERAND (reduc->reduc_stmt, 0),
942 FALLTHRU_EDGE (loop->latch));
943 reduc->new_phi = new_phi;
945 return 1;
948 struct clsn_data
950 tree store;
951 tree load;
953 basic_block store_bb;
954 basic_block load_bb;
957 /* Callback for htab_traverse. Create an atomic instruction for the
958 reduction described in SLOT.
959 DATA annotates the place in memory the atomic operation relates to,
960 and the basic block it needs to be generated in. */
962 static int
963 create_call_for_reduction_1 (void **slot, void *data)
965 struct reduction_info *reduc = *slot;
966 struct clsn_data *clsn_data = data;
967 block_stmt_iterator bsi;
968 tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
969 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
970 tree load_struct;
971 basic_block bb;
972 basic_block new_bb;
973 edge e;
974 tree t, addr, addr_type, ref, x;
975 tree tmp_load, load, name;
977 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
978 t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
979 addr_type = build_pointer_type (type);
981 addr = build_addr (t, current_function_decl);
983 /* Create phi node. */
984 bb = clsn_data->load_bb;
986 e = split_block (bb, t);
987 new_bb = e->dest;
989 tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL);
990 add_referenced_var (tmp_load);
991 tmp_load = make_ssa_name (tmp_load, NULL);
992 load = build2 (OMP_ATOMIC_LOAD, void_type_node, tmp_load, addr);
993 SSA_NAME_DEF_STMT (tmp_load) = load;
994 bsi = bsi_start (new_bb);
995 bsi_insert_after (&bsi, load, BSI_NEW_STMT);
997 e = split_block (new_bb, load);
998 new_bb = e->dest;
999 bsi = bsi_start (new_bb);
1000 ref = tmp_load;
1002 fold_build2 (reduc->reduction_code,
1003 TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
1004 PHI_RESULT (reduc->new_phi));
1006 name =
1007 force_gimple_operand_bsi (&bsi, x, true, NULL_TREE, true,
1008 BSI_CONTINUE_LINKING);
1010 x = build1 (OMP_ATOMIC_STORE, void_type_node, name);
1012 bsi_insert_after (&bsi, x, BSI_NEW_STMT);
1013 return 1;
1016 /* Create the atomic operation at the join point of the threads.
1017 REDUCTION_LIST describes the reductions in the LOOP.
1018 LD_ST_DATA describes the shared data structure where
1019 shared data is stored in and loaded from. */
1020 static void
1021 create_call_for_reduction (struct loop *loop, htab_t reduction_list,
1022 struct clsn_data *ld_st_data)
1024 htab_traverse (reduction_list, create_phi_for_local_result, loop);
1025 /* Find the fallthru edge from OMP_CONTINUE. */
1026 ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest;
1027 htab_traverse (reduction_list, create_call_for_reduction_1, ld_st_data);
1030 /* Callback for htab_traverse. Create a new variable that loads the
1031 final reduction value at the
1032 join point of all threads, adds the initial value the reduction
1033 variable had before the parallel computation started, and
1034 inserts it in the right place. */
1036 static int
1037 create_loads_for_reductions (void **slot, void *data)
1039 struct reduction_info *red = *slot;
1040 struct clsn_data *clsn_data = data;
1041 tree stmt;
1042 block_stmt_iterator bsi;
1043 tree type = TREE_TYPE (red->reduction_init);
1044 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
1045 tree load_struct;
1046 tree bvar, name;
1047 tree x;
1049 bsi = bsi_after_labels (clsn_data->load_bb);
1050 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
1051 load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
1052 NULL_TREE);
1053 bvar = create_tmp_var (type, "reduction_final");
1054 add_referenced_var (bvar);
1056 /* Apply operation between the new variable which is the result
1057 of computation all threads, and the initial value which is kept
1058 at reduction->inital_value. */
1060 stmt = build_gimple_modify_stmt (bvar, load_struct);
1061 name = make_ssa_name (bvar, stmt);
1062 GIMPLE_STMT_OPERAND (stmt, 0) = name;
1063 SSA_NAME_DEF_STMT (name) = stmt;
1065 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1068 fold_build2 (red->reduction_code, TREE_TYPE (load_struct),
1069 name, red->initial_value);
1070 name = PHI_RESULT (red->keep_res);
1071 stmt = build_gimple_modify_stmt (name, x);
1072 GIMPLE_STMT_OPERAND (stmt, 0) = name;
1073 SSA_NAME_DEF_STMT (name) = stmt;
1075 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1077 remove_phi_node (red->keep_res, NULL_TREE, false);
1079 return 1;
1082 /* Load the reduction result that was stored in LD_ST_DATA.
1083 REDUCTION_LIST describes the list of reductions that the
1084 loades should be generated for. */
1085 static void
1086 create_final_loads_for_reduction (htab_t reduction_list,
1087 struct clsn_data *ld_st_data)
1089 block_stmt_iterator bsi;
1090 tree t;
1092 bsi = bsi_after_labels (ld_st_data->load_bb);
1093 t = build_fold_addr_expr (ld_st_data->store);
1095 build_gimple_modify_stmt (ld_st_data->load,
1096 build_fold_addr_expr (ld_st_data->store));
1098 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
1099 SSA_NAME_DEF_STMT (ld_st_data->load) = t;
1100 GIMPLE_STMT_OPERAND (t, 0) = ld_st_data->load;
1102 htab_traverse (reduction_list, create_loads_for_reductions, ld_st_data);
1106 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
1107 store to a field of STORE in STORE_BB for the ssa name and its duplicate
1108 specified in SLOT. */
1110 static int
1111 create_loads_and_stores_for_name (void **slot, void *data)
1113 struct name_to_copy_elt *elt = *slot;
1114 struct clsn_data *clsn_data = data;
1115 tree stmt;
1116 block_stmt_iterator bsi;
1117 tree type = TREE_TYPE (elt->new_name);
1118 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
1119 tree load_struct;
1121 bsi = bsi_last (clsn_data->store_bb);
1122 stmt =
1123 build_gimple_modify_stmt (build3
1124 (COMPONENT_REF, type, clsn_data->store,
1125 elt->field, NULL_TREE),
1126 ssa_name (elt->version));
1127 mark_virtual_ops_for_renaming (stmt);
1128 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1130 bsi = bsi_last (clsn_data->load_bb);
1131 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
1132 stmt = build_gimple_modify_stmt (elt->new_name,
1133 build3 (COMPONENT_REF, type, load_struct,
1134 elt->field, NULL_TREE));
1135 SSA_NAME_DEF_STMT (elt->new_name) = stmt;
1136 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
1138 return 1;
1141 /* Moves all the variables used in LOOP and defined outside of it (including
1142 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1143 name) to a structure created for this purpose. The code
1145 while (1)
1147 use (a);
1148 use (b);
1151 is transformed this way:
1153 bb0:
1154 old.a = a;
1155 old.b = b;
1157 bb1:
1158 a' = new->a;
1159 b' = new->b;
1160 while (1)
1162 use (a');
1163 use (b');
1166 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
1167 pointer `new' is intentionally not initialized (the loop will be split to a
1168 separate function later, and `new' will be initialized from its arguments).
1169 LD_ST_DATA holds information about the shared data structure used to pass
1170 information among the threads. It is initialized here, and
1171 gen_parallel_loop will pass it to create_call_for_reduction that
1172 needs this information. REDUCTION_LIST describes the reductions
1173 in LOOP. */
1175 static void
1176 separate_decls_in_loop (struct loop *loop, htab_t reduction_list,
1177 tree * arg_struct, tree * new_arg_struct,
1178 struct clsn_data *ld_st_data)
1181 basic_block bb1 = split_edge (loop_preheader_edge (loop));
1182 basic_block bb0 = single_pred (bb1);
1183 htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
1184 name_to_copy_elt_eq, free);
1185 htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
1186 free);
1187 basic_block bb, *body = get_loop_body (loop);
1188 unsigned i;
1189 tree phi, type, type_name, nvar;
1190 block_stmt_iterator bsi;
1191 struct clsn_data clsn_data;
1193 /* Find and rename the ssa names defined outside of loop. */
1194 for (i = 0; i < loop->num_nodes; i++)
1196 bb = body[i];
1198 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1199 separate_decls_in_loop_stmt (loop, phi, name_copies, decl_copies);
1201 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
1202 separate_decls_in_loop_stmt (loop, bsi_stmt (bsi), name_copies,
1203 decl_copies);
1205 free (body);
1207 if (htab_elements (name_copies) == 0)
1209 /* It may happen that there is nothing to copy (if there are only
1210 loop carried and external variables in the loop). */
1211 *arg_struct = NULL;
1212 *new_arg_struct = NULL;
1214 else
1216 struct data_arg data_arg;
1218 /* Create the type for the structure to store the ssa names to. */
1219 type = lang_hooks.types.make_type (RECORD_TYPE);
1220 type_name = build_decl (TYPE_DECL, create_tmp_var_name (".paral_data"),
1221 type);
1222 TYPE_NAME (type) = type_name;
1224 data_arg.type = type;
1225 data_arg.reduction_list = reduction_list;
1226 htab_traverse (name_copies, add_field_for_name, &data_arg);
1227 layout_type (type);
1229 /* Create the loads and stores. */
1230 *arg_struct = create_tmp_var (type, ".paral_data_store");
1231 add_referenced_var (*arg_struct);
1232 nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1233 add_referenced_var (nvar);
1234 *new_arg_struct = make_ssa_name (nvar, NULL_TREE);
1236 ld_st_data->store = *arg_struct;
1237 ld_st_data->load = *new_arg_struct;
1238 ld_st_data->store_bb = bb0;
1239 ld_st_data->load_bb = bb1;
1240 htab_traverse (name_copies, create_loads_and_stores_for_name,
1241 ld_st_data);
1243 /* Load the calculation from memory into a new
1244 reduction variable (after the join of the threads). */
1245 if (htab_elements (reduction_list) > 0)
1247 clsn_data.load = make_ssa_name (nvar, NULL_TREE);
1248 clsn_data.load_bb = single_dom_exit (loop)->dest;
1249 clsn_data.store = ld_st_data->store;
1250 create_final_loads_for_reduction (reduction_list, &clsn_data);
1254 htab_delete (decl_copies);
1255 htab_delete (name_copies);
1258 /* Bitmap containing uids of functions created by parallelization. We cannot
1259 allocate it from the default obstack, as it must live across compilation
1260 of several functions; we make it gc allocated instead. */
1262 static GTY(()) bitmap parallelized_functions;
1264 /* Returns true if FN was created by create_loop_fn. */
1266 static bool
1267 parallelized_function_p (tree fn)
1269 if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
1270 return false;
1272 return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
1275 /* Creates and returns an empty function that will receive the body of
1276 a parallelized loop. */
1278 static tree
1279 create_loop_fn (void)
1281 char buf[100];
1282 char *tname;
1283 tree decl, type, name, t;
1284 struct function *act_cfun = cfun;
1285 static unsigned loopfn_num;
1287 snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1288 ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1289 clean_symbol_name (tname);
1290 name = get_identifier (tname);
1291 type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1293 decl = build_decl (FUNCTION_DECL, name, type);
1294 if (!parallelized_functions)
1295 parallelized_functions = BITMAP_GGC_ALLOC ();
1296 bitmap_set_bit (parallelized_functions, DECL_UID (decl));
1298 TREE_STATIC (decl) = 1;
1299 TREE_USED (decl) = 1;
1300 DECL_ARTIFICIAL (decl) = 1;
1301 DECL_IGNORED_P (decl) = 0;
1302 TREE_PUBLIC (decl) = 0;
1303 DECL_UNINLINABLE (decl) = 1;
1304 DECL_EXTERNAL (decl) = 0;
1305 DECL_CONTEXT (decl) = NULL_TREE;
1306 DECL_INITIAL (decl) = make_node (BLOCK);
1308 t = build_decl (RESULT_DECL, NULL_TREE, void_type_node);
1309 DECL_ARTIFICIAL (t) = 1;
1310 DECL_IGNORED_P (t) = 1;
1311 DECL_RESULT (decl) = t;
1313 t = build_decl (PARM_DECL, get_identifier (".paral_data_param"),
1314 ptr_type_node);
1315 DECL_ARTIFICIAL (t) = 1;
1316 DECL_ARG_TYPE (t) = ptr_type_node;
1317 DECL_CONTEXT (t) = decl;
1318 TREE_USED (t) = 1;
1319 DECL_ARGUMENTS (decl) = t;
1321 allocate_struct_function (decl);
1323 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1324 it. */
1325 cfun = act_cfun;
1327 return decl;
1330 /* Bases all the induction variables in LOOP on a single induction variable
1331 (unsigned with base 0 and step 1), whose final value is compared with
1332 NIT. The induction variable is incremented in the loop latch.
1333 REDUCTION_LIST describes the reductions in LOOP. */
1335 static void
1336 canonicalize_loop_ivs (struct loop *loop, htab_t reduction_list, tree nit)
1338 unsigned precision = TYPE_PRECISION (TREE_TYPE (nit));
1339 tree phi, prev, res, type, var_before, val, atype, t, next;
1340 block_stmt_iterator bsi;
1341 bool ok;
1342 affine_iv iv;
1343 edge exit = single_dom_exit (loop);
1344 struct reduction_info *red;
1346 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
1348 res = PHI_RESULT (phi);
1350 if (is_gimple_reg (res) && TYPE_PRECISION (TREE_TYPE (res)) > precision)
1351 precision = TYPE_PRECISION (TREE_TYPE (res));
1354 type = lang_hooks.types.type_for_size (precision, 1);
1356 bsi = bsi_last (loop->latch);
1357 create_iv (build_int_cst_type (type, 0), build_int_cst (type, 1), NULL_TREE,
1358 loop, &bsi, true, &var_before, NULL);
1360 bsi = bsi_after_labels (loop->header);
1361 prev = NULL;
1362 for (phi = phi_nodes (loop->header); phi; phi = next)
1364 next = PHI_CHAIN (phi);
1365 res = PHI_RESULT (phi);
1367 if (!is_gimple_reg (res) || res == var_before)
1369 prev = phi;
1370 continue;
1373 ok = simple_iv (loop, phi, res, &iv, true);
1374 red = reduction_phi (reduction_list, phi);
1375 /* We preserve the reduction phi nodes. */
1376 if (!ok && red)
1378 prev = phi;
1379 continue;
1381 else
1382 gcc_assert (ok);
1383 remove_phi_node (phi, prev, false);
1385 atype = TREE_TYPE (res);
1386 val = fold_build2 (PLUS_EXPR, atype,
1387 unshare_expr (iv.base),
1388 fold_build2 (MULT_EXPR, atype,
1389 unshare_expr (iv.step),
1390 fold_convert (atype, var_before)));
1391 val = force_gimple_operand_bsi (&bsi, val, false, NULL_TREE, true,
1392 BSI_SAME_STMT);
1393 t = build_gimple_modify_stmt (res, val);
1394 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1395 SSA_NAME_DEF_STMT (res) = t;
1398 t = last_stmt (exit->src);
1399 /* Make the loop exit if the control condition is not satisfied. */
1400 if (exit->flags & EDGE_TRUE_VALUE)
1402 edge te, fe;
1404 extract_true_false_edges_from_block (exit->src, &te, &fe);
1405 te->flags = EDGE_FALSE_VALUE;
1406 fe->flags = EDGE_TRUE_VALUE;
1408 COND_EXPR_COND (t) = build2 (LT_EXPR, boolean_type_node, var_before, nit);
1411 /* Moves the exit condition of LOOP to the beginning of its header, and
1412 duplicates the part of the last iteration that gets disabled to the
1413 exit of the loop. NIT is the number of iterations of the loop
1414 (used to initialize the variables in the duplicated part).
1416 TODO: the common case is that latch of the loop is empty and immediatelly
1417 follows the loop exit. In this case, it would be better not to copy the
1418 body of the loop, but only move the entry of the loop directly before the
1419 exit check and increase the number of iterations of the loop by one.
1420 This may need some additional preconditioning in case NIT = ~0.
1421 REDUCTION_LIST describes the reductions in LOOP. */
1423 static void
1424 transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit)
1426 basic_block *bbs, *nbbs, ex_bb, orig_header;
1427 unsigned n;
1428 bool ok;
1429 edge exit = single_dom_exit (loop), hpred;
1430 tree phi, nphi, cond, control, control_name, res, t, cond_stmt;
1431 block_stmt_iterator bsi;
1433 split_block_after_labels (loop->header);
1434 orig_header = single_succ (loop->header);
1435 hpred = single_succ_edge (loop->header);
1437 cond_stmt = last_stmt (exit->src);
1438 cond = COND_EXPR_COND (cond_stmt);
1439 control = TREE_OPERAND (cond, 0);
1440 gcc_assert (TREE_OPERAND (cond, 1) == nit);
1442 /* Make sure that we have phi nodes on exit for all loop header phis
1443 (create_parallel_loop requires that). */
1444 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
1446 res = PHI_RESULT (phi);
1447 t = make_ssa_name (SSA_NAME_VAR (res), phi);
1448 SET_PHI_RESULT (phi, t);
1450 nphi = create_phi_node (res, orig_header);
1451 SSA_NAME_DEF_STMT (res) = nphi;
1452 add_phi_arg (nphi, t, hpred);
1454 if (res == control)
1456 TREE_OPERAND (cond, 0) = t;
1457 update_stmt (cond_stmt);
1458 control = t;
1462 bbs = get_loop_body_in_dom_order (loop);
1463 for (n = 0; bbs[n] != exit->src; n++)
1464 continue;
1465 nbbs = XNEWVEC (basic_block, n);
1466 ok = tree_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1467 bbs + 1, n, nbbs);
1468 gcc_assert (ok);
1469 free (bbs);
1470 ex_bb = nbbs[0];
1471 free (nbbs);
1473 /* Other than reductions, the only gimple reg that should be copied
1474 out of the loop is the control variable. */
1476 control_name = NULL_TREE;
1477 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
1479 res = PHI_RESULT (phi);
1480 if (!is_gimple_reg (res))
1481 continue;
1483 /* Check if it is a part of reduction. If it is,
1484 keep the phi at the reduction's keep_res field. The
1485 PHI_RESULT of this phi is the resulting value of the reduction
1486 variable when exiting the loop. */
1488 exit = single_dom_exit (loop);
1490 if (htab_elements (reduction_list) > 0)
1492 struct reduction_info *red;
1494 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1496 red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1497 if (red)
1498 red->keep_res = phi;
1500 else
1501 gcc_assert (control_name == NULL_TREE
1502 && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
1503 control_name = res;
1505 gcc_assert (control_name != NULL_TREE);
1506 phi = SSA_NAME_DEF_STMT (control_name);
1507 remove_phi_node (phi, NULL_TREE, false);
1509 /* Initialize the control variable to NIT. */
1510 bsi = bsi_after_labels (ex_bb);
1511 t = build_gimple_modify_stmt (control_name, nit);
1512 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
1513 SSA_NAME_DEF_STMT (control_name) = t;
1516 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
1517 LOOP_FN and DATA are the arguments of OMP_PARALLEL.
1518 NEW_DATA is the variable that should be initialized from the argument
1519 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
1520 basic block containing OMP_PARALLEL tree. */
1522 static basic_block
1523 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
1524 tree new_data, unsigned n_threads)
1526 block_stmt_iterator bsi;
1527 basic_block bb, paral_bb, for_bb, ex_bb;
1528 tree t, param, res, for_stmt;
1529 tree cvar, cvar_init, initvar, cvar_next, cvar_base, cond, phi, type;
1530 edge exit, nexit, guard, end, e;
1532 /* Prepare the OMP_PARALLEL statement. */
1533 bb = loop_preheader_edge (loop)->src;
1534 paral_bb = single_pred (bb);
1535 bsi = bsi_last (paral_bb);
1537 t = build_omp_clause (OMP_CLAUSE_NUM_THREADS);
1538 OMP_CLAUSE_NUM_THREADS_EXPR (t)
1539 = build_int_cst (integer_type_node, n_threads);
1540 t = build4 (OMP_PARALLEL, void_type_node, NULL_TREE, t, loop_fn, data);
1542 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
1544 /* Initialize NEW_DATA. */
1545 if (data)
1547 bsi = bsi_after_labels (bb);
1549 param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL_TREE);
1550 t = build_gimple_modify_stmt (param, build_fold_addr_expr (data));
1551 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1552 SSA_NAME_DEF_STMT (param) = t;
1554 t = build_gimple_modify_stmt (new_data,
1555 fold_convert (TREE_TYPE (new_data),
1556 param));
1557 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
1558 SSA_NAME_DEF_STMT (new_data) = t;
1561 /* Emit OMP_RETURN for OMP_PARALLEL. */
1562 bb = split_loop_exit_edge (single_dom_exit (loop));
1563 bsi = bsi_last (bb);
1564 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
1566 /* Extract data for OMP_FOR. */
1567 gcc_assert (loop->header == single_dom_exit (loop)->src);
1568 cond = COND_EXPR_COND (last_stmt (loop->header));
1570 cvar = TREE_OPERAND (cond, 0);
1571 cvar_base = SSA_NAME_VAR (cvar);
1572 phi = SSA_NAME_DEF_STMT (cvar);
1573 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1574 initvar = make_ssa_name (cvar_base, NULL_TREE);
1575 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
1576 initvar);
1577 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
1579 bsi = bsi_last (loop->latch);
1580 gcc_assert (bsi_stmt (bsi) == SSA_NAME_DEF_STMT (cvar_next));
1581 bsi_remove (&bsi, true);
1583 /* Prepare cfg. */
1584 for_bb = split_edge (loop_preheader_edge (loop));
1585 ex_bb = split_loop_exit_edge (single_dom_exit (loop));
1586 extract_true_false_edges_from_block (loop->header, &nexit, &exit);
1587 gcc_assert (exit == single_dom_exit (loop));
1589 guard = make_edge (for_bb, ex_bb, 0);
1590 single_succ_edge (loop->latch)->flags = 0;
1591 end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
1592 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
1594 res = PHI_RESULT (phi);
1595 gcc_assert (!is_gimple_reg (phi));
1596 t = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));
1597 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_preheader_edge (loop)),
1598 guard);
1599 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_latch_edge (loop)),
1600 end);
1602 e = redirect_edge_and_branch (exit, nexit->dest);
1603 PENDING_STMT (e) = NULL;
1605 /* Emit OMP_FOR. */
1606 TREE_OPERAND (cond, 0) = cvar_base;
1607 type = TREE_TYPE (cvar);
1608 t = build_omp_clause (OMP_CLAUSE_SCHEDULE);
1609 OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
1611 for_stmt = make_node (OMP_FOR);
1612 TREE_TYPE (for_stmt) = void_type_node;
1613 OMP_FOR_CLAUSES (for_stmt) = t;
1614 OMP_FOR_INIT (for_stmt) = build_gimple_modify_stmt (initvar, cvar_init);
1615 OMP_FOR_COND (for_stmt) = cond;
1616 OMP_FOR_INCR (for_stmt) = build_gimple_modify_stmt (cvar_base,
1617 build2 (PLUS_EXPR, type,
1618 cvar_base,
1619 build_int_cst
1620 (type, 1)));
1621 OMP_FOR_BODY (for_stmt) = NULL_TREE;
1622 OMP_FOR_PRE_BODY (for_stmt) = NULL_TREE;
1624 bsi = bsi_last (for_bb);
1625 bsi_insert_after (&bsi, for_stmt, BSI_NEW_STMT);
1626 SSA_NAME_DEF_STMT (initvar) = for_stmt;
1628 /* Emit OMP_CONTINUE. */
1629 bsi = bsi_last (loop->latch);
1630 t = build2 (OMP_CONTINUE, void_type_node, cvar_next, cvar);
1631 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
1632 SSA_NAME_DEF_STMT (cvar_next) = t;
1634 /* Emit OMP_RETURN for OMP_FOR. */
1635 bsi = bsi_last (ex_bb);
1636 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
1638 return paral_bb;
1641 /* Generates code to execute the iterations of LOOP in N_THREADS threads in
1642 parallel. NITER describes number of iterations of LOOP.
1643 REDUCTION_LIST describes the reductions existant in the LOOP. */
1645 static void
1646 gen_parallel_loop (struct loop *loop, htab_t reduction_list,
1647 unsigned n_threads, struct tree_niter_desc *niter)
1649 struct loop *nloop;
1650 tree many_iterations_cond, type, nit;
1651 tree stmts, arg_struct, new_arg_struct;
1652 basic_block parallel_head;
1653 struct clsn_data clsn_data;
1654 unsigned prob;
1656 /* From
1658 ---------------------------------------------------------------------
1659 loop
1661 IV = phi (INIT, IV + STEP)
1662 BODY1;
1663 if (COND)
1664 break;
1665 BODY2;
1667 ---------------------------------------------------------------------
1669 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1670 we generate the following code:
1672 ---------------------------------------------------------------------
1674 if (MAY_BE_ZERO
1675 || NITER < MIN_PER_THREAD * N_THREADS)
1676 goto original;
1678 BODY1;
1679 store all local loop-invariant variables used in body of the loop to DATA.
1680 OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1681 load the variables from DATA.
1682 OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1683 BODY2;
1684 BODY1;
1685 OMP_CONTINUE;
1686 OMP_RETURN -- OMP_FOR
1687 OMP_RETURN -- OMP_PARALLEL
1688 goto end;
1690 original:
1691 loop
1693 IV = phi (INIT, IV + STEP)
1694 BODY1;
1695 if (COND)
1696 break;
1697 BODY2;
1700 end:
1704 /* Create two versions of the loop -- in the old one, we know that the
1705 number of iterations is large enough, and we will transform it into the
1706 loop that will be split to loop_fn, the new one will be used for the
1707 remaining iterations. */
1709 type = TREE_TYPE (niter->niter);
1710 nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
1711 NULL_TREE);
1712 if (stmts)
1713 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1715 many_iterations_cond =
1716 fold_build2 (GE_EXPR, boolean_type_node,
1717 nit, build_int_cst (type, MIN_PER_THREAD * n_threads));
1718 many_iterations_cond
1719 = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1720 invert_truthvalue (unshare_expr (niter->may_be_zero)),
1721 many_iterations_cond);
1722 many_iterations_cond
1723 = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
1724 if (stmts)
1725 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1726 if (!is_gimple_condexpr (many_iterations_cond))
1728 many_iterations_cond
1729 = force_gimple_operand (many_iterations_cond, &stmts,
1730 true, NULL_TREE);
1731 if (stmts)
1732 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1735 initialize_original_copy_tables ();
1737 /* We assume that the loop usually iterates a lot. */
1738 prob = 4 * REG_BR_PROB_BASE / 5;
1739 nloop = loop_version (loop, many_iterations_cond, NULL,
1740 prob, prob, REG_BR_PROB_BASE - prob, true);
1741 update_ssa (TODO_update_ssa);
1742 free_original_copy_tables ();
1744 /* Base all the induction variables in LOOP on a single control one. */
1745 canonicalize_loop_ivs (loop, reduction_list, nit);
1747 /* Ensure that the exit condition is the first statement in the loop. */
1748 transform_to_exit_first_loop (loop, reduction_list, nit);
1751 /* Generate intializations for reductions. */
1753 if (htab_elements (reduction_list) > 0)
1754 htab_traverse (reduction_list, initialize_reductions, loop);
1756 /* Eliminate the references to local variables from the loop. */
1757 eliminate_local_variables (loop);
1759 /* In the old loop, move all variables non-local to the loop to a structure
1760 and back, and create separate decls for the variables used in loop. */
1761 separate_decls_in_loop (loop, reduction_list, &arg_struct, &new_arg_struct, &clsn_data);
1763 /* Create the parallel constructs. */
1764 parallel_head = create_parallel_loop (loop, create_loop_fn (), arg_struct,
1765 new_arg_struct, n_threads);
1766 if (htab_elements (reduction_list) > 0)
1767 create_call_for_reduction (loop, reduction_list, &clsn_data);
1769 scev_reset ();
1771 /* Cancel the loop (it is simpler to do it here rather than to teach the
1772 expander to do it). */
1773 cancel_loop_tree (loop);
1775 /* Expand the parallel constructs. We do it directly here instead of running
1776 a separate expand_omp pass, since it is more efficient, and less likely to
1777 cause troubles with further analyses not being able to deal with the
1778 OMP trees. */
1780 omp_expand_local (parallel_head);
1783 /* Detect parallel loops and generate parallel code using libgomp
1784 primitives. Returns true if some loop was parallelized, false
1785 otherwise. */
1787 bool
1788 parallelize_loops (void)
1790 unsigned n_threads = flag_tree_parallelize_loops;
1791 bool changed = false;
1792 struct loop *loop;
1793 struct tree_niter_desc niter_desc;
1794 loop_iterator li;
1795 htab_t reduction_list;
1797 /* Do not parallelize loops in the functions created by parallelization. */
1798 if (parallelized_function_p (cfun->decl))
1799 return false;
1801 reduction_list = htab_create (10, reduction_info_hash,
1802 reduction_info_eq, free);
1804 FOR_EACH_LOOP (li, loop, 0)
1806 htab_empty (reduction_list);
1807 if (/* Do not bother with loops in cold areas. */
1808 !maybe_hot_bb_p (loop->header)
1809 /* Or loops that roll too little. */
1810 || expected_loop_iterations (loop) <= n_threads
1811 /* And of course, the loop must be parallelizable. */
1812 || !can_duplicate_loop_p (loop)
1813 || !loop_parallel_p (loop, reduction_list, &niter_desc))
1814 continue;
1816 changed = true;
1817 gen_parallel_loop (loop, reduction_list, n_threads, &niter_desc);
1818 verify_flow_info ();
1819 verify_dominators (CDI_DOMINATORS);
1820 verify_loop_structure ();
1821 verify_loop_closed_ssa ();
1824 htab_delete (reduction_list);
1825 return changed;
1828 #include "gt-tree-parloops.h"