* tree-ssa-loop-manip.c (split_loop_exit_edge): Return the new block.
[official-gcc.git] / gcc / tree-parloops.c
blobacf0881adabc99d09dce77e2cdbe9e6408425f0a
1 /* Loop autoparallelization.
2 Copyright (C) 2006 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "rtl.h"
29 #include "tree-flow.h"
30 #include "cfgloop.h"
31 #include "ggc.h"
32 #include "tree-data-ref.h"
33 #include "diagnostic.h"
34 #include "tree-pass.h"
35 #include "tree-scalar-evolution.h"
36 #include "hashtab.h"
37 #include "langhooks.h"
39 /* This pass tries to distribute iterations of loops into several threads.
40 The implementation is straightforward -- for each loop we test whether its
41 iterations are independent, and if it is the case (and some additional
42 conditions regarding profitability and correctness are satisfied), we
43 split the loop to a separate function, and generate code to invoke the loop
44 in different threads for different parts of the iteration space; the most
45 complexity is in the code generation part.
47 TODO:
48 -- if there are several parallelizable loops in a function, it may be
49 possible to generate the threads just once (using synchronization to
50 ensure that cross-loop dependences are obeyed).
51 -- handling of common scalar dependence patterns (accumulation, ...)
52 -- handling of non-innermost loops */
54 /* Minimal number of iterations of a loop that should be executed in each
55 thread. */
56 #define MIN_PER_THREAD 100
58 /* Element of hashtable of names to copy. */
60 struct name_to_copy_elt
62 unsigned version; /* The version of the name to copy. */
63 tree new_name; /* The new name used in the copy. */
64 tree field; /* The field of the structure used to pass the
65 value. */
68 /* Equality and hash functions for hashtab code. */
70 static int
71 name_to_copy_elt_eq (const void *aa, const void *bb)
73 struct name_to_copy_elt *a = (struct name_to_copy_elt *) aa;
74 struct name_to_copy_elt *b = (struct name_to_copy_elt *) bb;
76 return a->version == b->version;
79 static hashval_t
80 name_to_copy_elt_hash (const void *aa)
82 struct name_to_copy_elt *a = (struct name_to_copy_elt *) aa;
84 return (hashval_t) a->version;
87 /* Returns true if the iterations of LOOP are independent on each other (that
88 is, if we can execute them in parallel), and if LOOP satisfies other
89 conditions that we need to be able to parallelize it. Description of number
90 of iterations is stored to NITER. */
92 static bool
93 loop_parallel_p (struct loop *loop, struct tree_niter_desc *niter)
95 edge exit = single_dom_exit (loop);
96 VEC (ddr_p, heap) *dependence_relations;
97 VEC (data_reference_p, heap) *datarefs;
98 lambda_trans_matrix trans;
99 bool ret = false;
100 tree phi;
102 /* Only consider innermost loops with just one exit. The innermost-loop
103 restriction is not necessary, but it makes things simpler. */
104 if (loop->inner || !exit)
105 return false;
107 if (dump_file && (dump_flags & TDF_DETAILS))
108 fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
110 /* We need to know # of iterations, and there should be no uses of values
111 defined inside loop outside of it, unless the values are invariants of
112 the loop. */
113 if (!number_of_iterations_exit (loop, exit, niter, false))
115 if (dump_file && (dump_flags & TDF_DETAILS))
116 fprintf (dump_file, " FAILED: number of iterations not known\n");
117 return false;
120 for (phi = phi_nodes (exit->dest); phi; phi = PHI_CHAIN (phi))
122 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
124 if (is_gimple_reg (val))
126 if (dump_file && (dump_flags & TDF_DETAILS))
127 fprintf (dump_file, " FAILED: value used outside loop\n");
128 return false;
132 /* The iterations of the loop may comunicate only through bivs whose
133 iteration space can be distributed efficiently. */
134 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
136 tree def = PHI_RESULT (phi);
137 affine_iv iv;
139 if (is_gimple_reg (def)
140 && !simple_iv (loop, phi, def, &iv, true))
142 if (dump_file && (dump_flags & TDF_DETAILS))
143 fprintf (dump_file, " FAILED: non-biv phi node\n");
144 return false;
148 /* We need to version the loop to verify assumptions in runtime. */
149 if (!can_duplicate_loop_p (loop))
151 if (dump_file && (dump_flags & TDF_DETAILS))
152 fprintf (dump_file, " FAILED: cannot be duplicated\n");
153 return false;
156 /* Check for problems with dependences. If the loop can be reversed,
157 the iterations are indepent. */
158 datarefs = VEC_alloc (data_reference_p, heap, 10);
159 dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
160 compute_data_dependences_for_loop (loop, true, &datarefs,
161 &dependence_relations);
162 if (dump_file && (dump_flags & TDF_DETAILS))
163 dump_data_dependence_relations (dump_file, dependence_relations);
165 trans = lambda_trans_matrix_new (1, 1);
166 LTM_MATRIX (trans)[0][0] = -1;
168 if (lambda_transform_legal_p (trans, 1, dependence_relations))
170 ret = true;
171 if (dump_file && (dump_flags & TDF_DETAILS))
172 fprintf (dump_file, " SUCCESS: may be parallelized\n");
174 else if (dump_file && (dump_flags & TDF_DETAILS))
175 fprintf (dump_file, " FAILED: dependence check failed\n");
177 free_dependence_relations (dependence_relations);
178 free_data_refs (datarefs);
180 return ret;
183 /* Assigns the address of VAR in TYPE to an ssa name, and returns this name.
184 The assignment statement is placed before LOOP. DECL_ADDRESS maps decls
185 to their addresses that can be reused. */
187 static tree
188 take_address_of (tree var, tree type, struct loop *loop, htab_t decl_address)
190 int uid = DECL_UID (var);
191 void **dslot;
192 struct int_tree_map ielt, *nielt;
193 tree name, bvar, stmt;
194 edge entry = loop_preheader_edge (loop);
196 ielt.uid = uid;
197 dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
198 if (!*dslot)
200 bvar = create_tmp_var (type, get_name (var));
201 add_referenced_var (bvar);
202 stmt = build2 (MODIFY_EXPR, void_type_node, bvar,
203 fold_convert (type,
204 build_addr (var, current_function_decl)));
205 name = make_ssa_name (bvar, stmt);
206 TREE_OPERAND (stmt, 0) = name;
207 bsi_insert_on_edge_immediate_loop (entry, stmt);
209 nielt = XNEW (struct int_tree_map);
210 nielt->uid = uid;
211 nielt->to = name;
212 *dslot = nielt;
214 return name;
217 name = ((struct int_tree_map *) *dslot)->to;
218 if (TREE_TYPE (name) == type)
219 return name;
221 bvar = SSA_NAME_VAR (name);
222 stmt = build2 (MODIFY_EXPR, void_type_node, bvar,
223 fold_convert (type, name));
224 name = make_ssa_name (bvar, stmt);
225 TREE_OPERAND (stmt, 0) = name;
226 bsi_insert_on_edge_immediate_loop (entry, stmt);
228 return name;
231 /* Eliminates references to local variables in *TP from LOOP. DECL_ADDRESS
232 contains addresses for the references for that we have already taken
233 them. If the expression is changed, CHANGED is set to true. Callback for
234 walk_tree. */
236 struct elv_data
238 struct loop *loop;
239 htab_t decl_address;
240 bool changed;
243 static tree
244 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
246 struct elv_data *dta = data;
247 tree t = *tp, var, addr, addr_type, type;
249 if (DECL_P (t))
251 *walk_subtrees = 0;
253 if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
254 return NULL_TREE;
256 type = TREE_TYPE (t);
257 addr_type = build_pointer_type (type);
258 addr = take_address_of (t, addr_type, dta->loop, dta->decl_address);
259 *tp = build1 (INDIRECT_REF, TREE_TYPE (*tp), addr);
261 dta->changed = true;
262 return NULL_TREE;
265 if (TREE_CODE (t) == ADDR_EXPR)
267 var = TREE_OPERAND (t, 0);
268 if (!DECL_P (var))
269 return NULL_TREE;
271 *walk_subtrees = 0;
272 if (!SSA_VAR_P (var) || DECL_EXTERNAL (var))
273 return NULL_TREE;
275 addr_type = TREE_TYPE (t);
276 addr = take_address_of (var, addr_type, dta->loop, dta->decl_address);
277 *tp = addr;
279 dta->changed = true;
280 return NULL_TREE;
283 if (!EXPR_P (t))
284 *walk_subtrees = 0;
286 return NULL_TREE;
289 /* Moves the references to local variables in STMT from LOOP. DECL_ADDRESS
290 contains addresses for the references for that we have already taken
291 them. */
293 static void
294 eliminate_local_variables_stmt (struct loop *loop, tree stmt,
295 htab_t decl_address)
297 struct elv_data dta;
299 dta.loop = loop;
300 dta.decl_address = decl_address;
301 dta.changed = false;
303 walk_tree (&stmt, eliminate_local_variables_1, &dta, NULL);
305 if (dta.changed)
306 update_stmt (stmt);
309 /* Eliminates the references to local variables from LOOP. This includes:
311 1) Taking address of a local variable -- these are moved out of the loop
312 (and temporary variable is created to hold the address if necessary).
313 2) Dereferencing a local variable -- these are replaced with indirect
314 references. */
316 static void
317 eliminate_local_variables (struct loop *loop)
319 basic_block bb, *body = get_loop_body (loop);
320 unsigned i;
321 block_stmt_iterator bsi;
322 htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
323 free);
325 /* Find and rename the ssa names defined outside of loop. */
326 for (i = 0; i < loop->num_nodes; i++)
328 bb = body[i];
330 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
331 eliminate_local_variables_stmt (loop, bsi_stmt (bsi), decl_address);
334 htab_delete (decl_address);
337 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
338 The copies are stored to NAME_COPIES, if NAME was already duplicated,
339 its duplicate stored in NAME_COPIES is returned.
341 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
342 duplicated, storing the copies in DECL_COPIES. */
344 static tree
345 separate_decls_in_loop_name (tree name,
346 htab_t name_copies, htab_t decl_copies,
347 bool copy_name_p)
349 tree copy, var, var_copy;
350 unsigned idx, uid, nuid;
351 struct int_tree_map ielt, *nielt;
352 struct name_to_copy_elt elt, *nelt;
353 void **slot, **dslot;
355 if (TREE_CODE (name) != SSA_NAME)
356 return name;
358 idx = SSA_NAME_VERSION (name);
359 elt.version = idx;
360 slot = htab_find_slot_with_hash (name_copies, &elt, idx,
361 copy_name_p ? INSERT : NO_INSERT);
362 if (slot && *slot)
363 return ((struct name_to_copy_elt *) *slot)->new_name;
365 var = SSA_NAME_VAR (name);
366 uid = DECL_UID (var);
367 ielt.uid = uid;
368 dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
369 if (!*dslot)
371 var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
372 add_referenced_var (var_copy);
373 nielt = XNEW (struct int_tree_map);
374 nielt->uid = uid;
375 nielt->to = var_copy;
376 *dslot = nielt;
378 /* Ensure that when we meet this decl next time, we won't duplicate
379 it again. */
380 nuid = DECL_UID (var_copy);
381 ielt.uid = nuid;
382 dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
383 gcc_assert (!*dslot);
384 nielt = XNEW (struct int_tree_map);
385 nielt->uid = nuid;
386 nielt->to = var_copy;
387 *dslot = nielt;
389 else
390 var_copy = ((struct int_tree_map *) *dslot)->to;
392 if (copy_name_p)
394 copy = duplicate_ssa_name (name, NULL_TREE);
395 nelt = XNEW (struct name_to_copy_elt);
396 nelt->version = idx;
397 nelt->new_name = copy;
398 nelt->field = NULL_TREE;
399 *slot = nelt;
401 else
403 gcc_assert (!slot);
404 copy = name;
407 SSA_NAME_VAR (copy) = var_copy;
408 return copy;
411 /* Finds the ssa names used in STMT that are defined outside of LOOP and
412 replaces such ssa names with their duplicates. The duplicates are stored to
413 NAME_COPIES. Base decls of all ssa names used in STMT
414 (including those defined in LOOP) are replaced with the new temporary
415 variables; the replacement decls are stored in DECL_COPIES. */
417 static void
418 separate_decls_in_loop_stmt (struct loop *loop, tree stmt,
419 htab_t name_copies, htab_t decl_copies)
421 use_operand_p use;
422 def_operand_p def;
423 ssa_op_iter oi;
424 tree name, copy;
425 bool copy_name_p;
427 mark_virtual_ops_for_renaming (stmt);
429 FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
431 name = DEF_FROM_PTR (def);
432 gcc_assert (TREE_CODE (name) == SSA_NAME);
433 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
434 false);
435 gcc_assert (copy == name);
438 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
440 name = USE_FROM_PTR (use);
441 if (TREE_CODE (name) != SSA_NAME)
442 continue;
444 copy_name_p = expr_invariant_in_loop_p (loop, name);
445 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
446 copy_name_p);
447 SET_USE (use, copy);
451 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
452 described in SLOT to the type passed in DATA. */
454 static int
455 add_field_for_name (void **slot, void *data)
457 struct name_to_copy_elt *elt = *slot;
458 tree type = data;
459 tree name = ssa_name (elt->version);
460 tree var = SSA_NAME_VAR (name);
461 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
463 insert_field_into_struct (type, field);
464 elt->field = field;
465 return 1;
468 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
469 store to a field of STORE in STORE_BB for the ssa name and its duplicate
470 specified in SLOT. */
472 struct clsn_data
474 tree store;
475 tree load;
477 basic_block store_bb;
478 basic_block load_bb;
481 static int
482 create_loads_and_stores_for_name (void **slot, void *data)
484 struct name_to_copy_elt *elt = *slot;
485 struct clsn_data *clsn_data = data;
486 tree stmt;
487 block_stmt_iterator bsi;
488 tree type = TREE_TYPE (elt->new_name);
489 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
490 tree load_struct;
492 bsi = bsi_last (clsn_data->store_bb);
493 stmt = build2 (MODIFY_EXPR, void_type_node,
494 build3 (COMPONENT_REF, type, clsn_data->store, elt->field,
495 NULL_TREE),
496 ssa_name (elt->version));
497 mark_virtual_ops_for_renaming (stmt);
498 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
500 bsi = bsi_last (clsn_data->load_bb);
501 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
502 stmt = build2 (MODIFY_EXPR, void_type_node,
503 elt->new_name,
504 build3 (COMPONENT_REF, type, load_struct, elt->field,
505 NULL_TREE));
506 SSA_NAME_DEF_STMT (elt->new_name) = stmt;
507 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
509 return 1;
512 /* Moves all the variables used in LOOP and defined outside of it (including
513 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
514 name) to a structure created for this purpose. The code
516 while (1)
518 use (a);
519 use (b);
522 is transformed this way:
524 bb0:
525 old.a = a;
526 old.b = b;
528 bb1:
529 a' = new->a;
530 b' = new->b;
531 while (1)
533 use (a');
534 use (b');
537 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
538 pointer `new' is intentionally not initialized (the loop will be split to a
539 separate function later, and `new' will be initialized from its arguments).
542 static void
543 separate_decls_in_loop (struct loop *loop, tree *arg_struct,
544 tree *new_arg_struct)
546 basic_block bb1 = loop_split_edge_with (loop_preheader_edge (loop), NULL);
547 basic_block bb0 = single_pred (bb1);
548 htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
549 name_to_copy_elt_eq, free);
550 htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
551 free);
552 basic_block bb, *body = get_loop_body (loop);
553 unsigned i;
554 tree phi, type, type_name, nvar;
555 block_stmt_iterator bsi;
556 struct clsn_data clsn_data;
558 /* Find and rename the ssa names defined outside of loop. */
559 for (i = 0; i < loop->num_nodes; i++)
561 bb = body[i];
563 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
564 separate_decls_in_loop_stmt (loop, phi, name_copies, decl_copies);
566 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
567 separate_decls_in_loop_stmt (loop, bsi_stmt (bsi), name_copies,
568 decl_copies);
570 free (body);
572 if (htab_elements (name_copies) == 0)
574 /* It may happen that there is nothing to copy (if there are only
575 loop carried and external variables in the loop). */
576 *arg_struct = NULL;
577 *new_arg_struct = NULL;
579 else
581 /* Create the type for the structure to store the ssa names to. */
582 type = lang_hooks.types.make_type (RECORD_TYPE);
583 type_name = build_decl (TYPE_DECL, create_tmp_var_name (".paral_data"),
584 type);
585 TYPE_NAME (type) = type_name;
587 htab_traverse (name_copies, add_field_for_name, type);
588 layout_type (type);
590 /* Create the loads and stores. */
591 *arg_struct = create_tmp_var (type, ".paral_data_store");
592 add_referenced_var (*arg_struct);
593 nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
594 add_referenced_var (nvar);
595 *new_arg_struct = make_ssa_name (nvar, NULL_TREE);
597 /* We should mark *arg_struct call clobbered. However, this means
598 we would need to update virtual operands for every function call.
599 To avoid this, we pretend this variable is volatile. */
600 TREE_THIS_VOLATILE (*arg_struct) = 1;
602 clsn_data.store = *arg_struct;
603 clsn_data.load = *new_arg_struct;
604 clsn_data.store_bb = bb0;
605 clsn_data.load_bb = bb1;
606 htab_traverse (name_copies, create_loads_and_stores_for_name,
607 &clsn_data);
610 htab_delete (decl_copies);
611 htab_delete (name_copies);
614 /* Bitmap containing uids of functions created by parallelization. We cannot
615 allocate it from the default obstack, as it must live across compilation
616 of several functions; we make it gc allocated instead. */
618 static GTY(()) bitmap parallelized_functions;
620 /* Returns true if FN was created by create_loop_fn. */
622 static bool
623 parallelized_function_p (tree fn)
625 if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
626 return false;
628 return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
631 /* Creates and returns an empty function to that the body of the loop
632 is later split. */
634 static tree
635 create_loop_fn (void)
637 char buf[100];
638 char *tname;
639 tree decl, type, name, t;
640 struct function *act_cfun = cfun;
641 static unsigned loopfn_num;
643 snprintf (buf, 100, "%s.$loopfn", current_function_name ());
644 ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
645 clean_symbol_name (tname);
646 name = get_identifier (tname);
647 type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
649 decl = build_decl (FUNCTION_DECL, name, type);
650 if (!parallelized_functions)
651 parallelized_functions = BITMAP_GGC_ALLOC ();
652 bitmap_set_bit (parallelized_functions, DECL_UID (decl));
654 TREE_STATIC (decl) = 1;
655 TREE_USED (decl) = 1;
656 DECL_ARTIFICIAL (decl) = 1;
657 DECL_IGNORED_P (decl) = 0;
658 TREE_PUBLIC (decl) = 0;
659 DECL_UNINLINABLE (decl) = 1;
660 DECL_EXTERNAL (decl) = 0;
661 DECL_CONTEXT (decl) = NULL_TREE;
662 DECL_INITIAL (decl) = make_node (BLOCK);
664 t = build_decl (RESULT_DECL, NULL_TREE, void_type_node);
665 DECL_ARTIFICIAL (t) = 1;
666 DECL_IGNORED_P (t) = 1;
667 DECL_RESULT (decl) = t;
669 t = build_decl (PARM_DECL, get_identifier (".paral_data_param"),
670 ptr_type_node);
671 DECL_ARTIFICIAL (t) = 1;
672 DECL_ARG_TYPE (t) = ptr_type_node;
673 DECL_CONTEXT (t) = decl;
674 TREE_USED (t) = 1;
675 DECL_ARGUMENTS (decl) = t;
677 allocate_struct_function (decl);
679 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
680 it. */
681 cfun = act_cfun;
683 return decl;
686 /* Bases all the induction variables in LOOP on a single induction variable
687 (unsigned with base 0 and step 1), whose final value is compared with
688 NIT. The induction variable is incremented in the loop latch. */
690 static void
691 canonicalize_loop_ivs (struct loop *loop, tree nit)
693 unsigned precision = TYPE_PRECISION (TREE_TYPE (nit));
694 tree phi, prev, res, type, var_before, val, atype, t, next;
695 block_stmt_iterator bsi;
696 bool ok;
697 affine_iv iv;
698 edge exit = single_dom_exit (loop);
700 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
702 res = PHI_RESULT (phi);
704 if (is_gimple_reg (res)
705 && TYPE_PRECISION (TREE_TYPE (res)) > precision)
706 precision = TYPE_PRECISION (TREE_TYPE (res));
709 type = lang_hooks.types.type_for_size (precision, 1);
711 bsi = bsi_last (loop->latch);
712 create_iv (build_int_cst_type (type, 0), build_int_cst (type, 1), NULL_TREE,
713 loop, &bsi, true, &var_before, NULL);
715 bsi = bsi_after_labels (loop->header);
716 prev = NULL;
717 for (phi = phi_nodes (loop->header); phi; phi = next)
719 next = PHI_CHAIN (phi);
720 res = PHI_RESULT (phi);
722 if (!is_gimple_reg (res)
723 || res == var_before)
725 prev = phi;
726 continue;
729 ok = simple_iv (loop, phi, res, &iv, true);
730 gcc_assert (ok);
732 SET_PHI_RESULT (phi, NULL_TREE);
733 remove_phi_node (phi, prev);
735 atype = TREE_TYPE (res);
736 val = fold_build2 (PLUS_EXPR, atype,
737 unshare_expr (iv.base),
738 fold_build2 (MULT_EXPR, atype,
739 unshare_expr (iv.step),
740 fold_convert (atype, var_before)));
741 val = force_gimple_operand_bsi (&bsi, val, false, NULL_TREE, true,
742 BSI_SAME_STMT);
743 t = build2 (MODIFY_EXPR, void_type_node, res, val);
744 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
745 SSA_NAME_DEF_STMT (res) = t;
748 t = last_stmt (exit->src);
749 /* Make the loop exit if the control condition is not satisfied. */
750 if (exit->flags & EDGE_TRUE_VALUE)
752 edge te, fe;
754 extract_true_false_edges_from_block (exit->src, &te, &fe);
755 te->flags = EDGE_FALSE_VALUE;
756 fe->flags = EDGE_TRUE_VALUE;
758 COND_EXPR_COND (t) = build2 (LT_EXPR, boolean_type_node, var_before, nit);
761 /* Moves the exit condition of LOOP to the beginning of its header, and
762 duplicates the part of the last iteration that gets disabled to the
763 exit of the loop. NIT is the number of iterations of the loop
764 (used to initialize the variables in the duplicated part).
766 TODO: the common case is that latch of the loop is empty and immediatelly
767 follows the loop exit. In this case, it would be better not to copy the
768 body of the loop, but rather increase the number of iterations of the loop
769 by one. This may need some additional preconditioning in case NIT = ~0. */
771 static void
772 transform_to_exit_first_loop (struct loop *loop, tree nit)
774 basic_block *bbs, *nbbs, ex_bb, orig_header;
775 unsigned n;
776 bool ok;
777 edge exit = single_dom_exit (loop), hpred;
778 tree phi, nphi, cond, control, control_name, res, t, cond_stmt;
779 block_stmt_iterator bsi;
781 split_block_after_labels (loop->header);
782 orig_header = single_succ (loop->header);
783 hpred = single_succ_edge (loop->header);
784 add_bb_to_loop (orig_header, loop);
786 cond_stmt = last_stmt (exit->src);
787 cond = COND_EXPR_COND (cond_stmt);
788 control = TREE_OPERAND (cond, 0);
789 gcc_assert (TREE_OPERAND (cond, 1) == nit);
791 /* Make sure that we have phi nodes on exit for all loop header phis
792 (create_parallel_loop requires that). */
793 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
795 res = PHI_RESULT (phi);
796 t = make_ssa_name (SSA_NAME_VAR (res), phi);
797 SET_PHI_RESULT (phi, t);
799 nphi = create_phi_node (res, orig_header);
800 SSA_NAME_DEF_STMT (res) = nphi;
801 add_phi_arg (nphi, t, hpred);
803 if (res == control)
805 TREE_OPERAND (cond, 0) = t;
806 update_stmt (cond_stmt);
807 control = t;
811 bbs = get_loop_body_in_dom_order (loop);
812 for (n = 0; bbs[n] != exit->src; n++)
813 continue;
814 nbbs = XNEWVEC (basic_block, n);
815 ok = tree_duplicate_sese_tail (single_succ_edge (loop->header), exit,
816 bbs + 1, n, nbbs);
817 gcc_assert (ok);
818 free (bbs);
819 ex_bb = nbbs[0];
820 free (nbbs);
822 /* The only gimple reg that should be copied out of the loop is the
823 control variable. */
824 control_name = NULL_TREE;
825 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
827 res = PHI_RESULT (phi);
828 if (!is_gimple_reg (res))
829 continue;
831 gcc_assert (control_name == NULL_TREE
832 && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
833 control_name = res;
835 gcc_assert (control_name != NULL_TREE);
836 phi = SSA_NAME_DEF_STMT (control_name);
837 SET_PHI_RESULT (phi, NULL_TREE);
838 remove_phi_node (phi, NULL_TREE);
840 /* Initialize the control variable to NIT. */
841 bsi = bsi_after_labels (ex_bb);
842 t = build2 (MODIFY_EXPR, void_type_node, control_name, nit);
843 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
844 SSA_NAME_DEF_STMT (control_name) = t;
847 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
848 LOOP_FN and DATA are the arguments of OMP_PARALLEL.
849 NEW_DATA is the variable that should be initialized from the argument
850 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
851 basic block containing OMP_PARALLEL tree. */
853 static basic_block
854 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
855 tree new_data, unsigned n_threads)
857 block_stmt_iterator bsi;
858 basic_block bb, paral_bb, for_bb, ex_bb;
859 tree t, param, res;
860 tree cvar, cvar_init, initvar, cvar_next, cvar_base, cond, phi, type;
861 edge exit, nexit, guard, end, e;
863 /* Prepare the OMP_PARALLEL statement. */
864 bb = loop_preheader_edge (loop)->src;
865 paral_bb = single_pred (bb);
866 bsi = bsi_last (paral_bb);
868 t = build_omp_clause (OMP_CLAUSE_NUM_THREADS);
869 OMP_CLAUSE_NUM_THREADS_EXPR (t)
870 = build_int_cst (integer_type_node, n_threads);
871 t = build4 (OMP_PARALLEL, void_type_node, NULL_TREE, t,
872 loop_fn, data);
874 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
876 /* Initialize NEW_DATA. */
877 if (data)
879 bsi = bsi_after_labels (bb);
881 param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL_TREE);
882 t = build2 (MODIFY_EXPR, void_type_node, param, build_fold_addr_expr (data));
883 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
884 SSA_NAME_DEF_STMT (param) = t;
886 t = build2 (MODIFY_EXPR, void_type_node, new_data,
887 fold_convert (TREE_TYPE (new_data), param));
888 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
889 SSA_NAME_DEF_STMT (new_data) = t;
892 /* Emit OMP_RETURN for OMP_PARALLEL. */
893 bb = split_loop_exit_edge (single_dom_exit (loop));
894 bsi = bsi_last (bb);
895 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
897 /* Extract data for OMP_FOR. */
898 gcc_assert (loop->header == single_dom_exit (loop)->src);
899 cond = COND_EXPR_COND (last_stmt (loop->header));
901 cvar = TREE_OPERAND (cond, 0);
902 cvar_base = SSA_NAME_VAR (cvar);
903 phi = SSA_NAME_DEF_STMT (cvar);
904 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
905 initvar = make_ssa_name (cvar_base, NULL_TREE);
906 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
907 initvar);
908 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
910 bsi = bsi_last (loop->latch);
911 gcc_assert (bsi_stmt (bsi) == SSA_NAME_DEF_STMT (cvar_next));
912 bsi_remove (&bsi, true);
914 /* Prepare cfg. */
915 for_bb = loop_split_edge_with (loop_preheader_edge (loop), NULL);
916 ex_bb = split_loop_exit_edge (single_dom_exit (loop));
917 extract_true_false_edges_from_block (loop->header, &nexit, &exit);
918 gcc_assert (exit == single_dom_exit (loop));
920 guard = make_edge (for_bb, ex_bb, 0);
921 single_succ_edge (loop->latch)->flags = 0;
922 end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
923 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
925 res = PHI_RESULT (phi);
926 gcc_assert (!is_gimple_reg (phi));
927 t = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));
928 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_preheader_edge (loop)),
929 guard);
930 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_latch_edge (loop)),
931 end);
933 e = redirect_edge_and_branch (exit, nexit->dest);
934 PENDING_STMT (e) = NULL;
936 /* Emit OMP_FOR. */
937 TREE_OPERAND (cond, 0) = cvar_base;
938 type = TREE_TYPE (cvar);
939 t = build_omp_clause (OMP_CLAUSE_SCHEDULE);
940 OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
941 t = build6 (OMP_FOR, void_type_node, NULL_TREE, t,
942 build2 (MODIFY_EXPR, void_type_node, initvar, cvar_init),
943 cond,
944 build2 (MODIFY_EXPR, void_type_node,
945 cvar_base,
946 build2 (PLUS_EXPR, type,
947 cvar_base,
948 build_int_cst (type, 1))),
949 NULL_TREE);
950 bsi = bsi_last (for_bb);
951 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
952 SSA_NAME_DEF_STMT (initvar) = t;
954 /* Emit OMP_CONTINUE. */
955 bsi = bsi_last (loop->latch);
956 t = build2 (OMP_CONTINUE, void_type_node, cvar_next, cvar);
957 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
958 SSA_NAME_DEF_STMT (cvar_next) = t;
960 /* Emit OMP_RETURN for OMP_FOR. */
961 bsi = bsi_last (ex_bb);
962 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
964 return paral_bb;
967 /* Generates code to execute the iterations of LOOP in N_THREADS threads in
968 parallel. NITER describes number of iterations of LOOP. */
970 static void
971 gen_parallel_loop (struct loop *loop, unsigned n_threads,
972 struct tree_niter_desc *niter)
974 struct loop *nloop;
975 tree many_iterations_cond, type, nit;
976 tree stmts, arg_struct, new_arg_struct;
977 basic_block parallel_head;
979 /* From
981 ---------------------------------------------------------------------
982 loop
984 IV = phi (INIT, IV + STEP)
985 BODY1;
986 if (COND)
987 break;
988 BODY2;
990 ---------------------------------------------------------------------
992 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
993 we generate the following code:
995 ---------------------------------------------------------------------
997 if (MAY_BE_ZERO
998 || NITER < MIN_PER_THREAD * N_THREADS)
999 goto original;
1001 BODY1;
1002 store all local loop-invariant variables used in body of the loop to DATA.
1003 OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1004 load the variables from DATA.
1005 OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1006 BODY2;
1007 BODY1;
1008 OMP_CONTINUE;
1009 OMP_RETURN -- OMP_FOR
1010 OMP_RETURN -- OMP_PARALLEL
1011 goto end;
1013 original:
1014 loop
1016 IV = phi (INIT, IV + STEP)
1017 BODY1;
1018 if (COND)
1019 break;
1020 BODY2;
1023 end:
1027 /* Create two versions of the loop -- in the old one, we know that the
1028 number of iterations is large enough, and we will transform it into the
1029 loop that will be split to loop_fn, the new one will be used for the
1030 remaining iterations. */
1032 type = TREE_TYPE (niter->niter);
1033 nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
1034 NULL_TREE);
1035 if (stmts)
1036 bsi_insert_on_edge_immediate_loop (loop_preheader_edge (loop), stmts);
1038 many_iterations_cond =
1039 fold_build2 (GE_EXPR, boolean_type_node,
1040 nit, build_int_cst (type, MIN_PER_THREAD * n_threads));
1041 many_iterations_cond
1042 = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1043 invert_truthvalue (unshare_expr (niter->may_be_zero)),
1044 many_iterations_cond);
1045 many_iterations_cond
1046 = force_gimple_operand (many_iterations_cond, &stmts,
1047 false, NULL_TREE);
1048 if (stmts)
1049 bsi_insert_on_edge_immediate_loop (loop_preheader_edge (loop), stmts);
1050 if (!is_gimple_condexpr (many_iterations_cond))
1052 many_iterations_cond
1053 = force_gimple_operand (many_iterations_cond, &stmts,
1054 true, NULL_TREE);
1055 if (stmts)
1056 bsi_insert_on_edge_immediate_loop (loop_preheader_edge (loop), stmts);
1059 initialize_original_copy_tables ();
1060 nloop = loop_version (current_loops, loop, many_iterations_cond, NULL, true);
1061 update_ssa (TODO_update_ssa);
1062 free_original_copy_tables ();
1064 /* Base all the induction variables in LOOP on a single control one. */
1065 canonicalize_loop_ivs (loop, nit);
1067 /* Ensure that the exit condition is the first statement in the loop. */
1068 transform_to_exit_first_loop (loop, nit);
1070 /* Eliminate the references to local variables from the loop. */
1071 eliminate_local_variables (loop);
1073 /* In the old loop, move all variables non-local to the loop to a structure
1074 and back, and create separate decls for the variables used in loop. */
1075 separate_decls_in_loop (loop, &arg_struct, &new_arg_struct);
1077 /* Create the parallel constructs. */
1078 parallel_head = create_parallel_loop (loop, create_loop_fn (), arg_struct,
1079 new_arg_struct, n_threads);
1081 scev_reset ();
1083 /* Cancel the loop (it is simpler to do it here rather than to teach the
1084 expander to do it). */
1085 cancel_loop_tree (current_loops, loop);
1087 /* Expand the parallel constructs. We do it directly here instead of running
1088 a separate expand_omp pass, since it is more efficient, and less likely to
1089 cause troubles with further analyses not being able to deal with the
1090 OMP trees. */
1091 omp_expand_local (parallel_head);
1094 /* Detect parallel loops and generate parallel code using libgomp
1095 primitives. */
1097 bool
1098 parallelize_loops (void)
1100 unsigned i, n_threads = flag_tree_parallelize_loops;
1101 unsigned n = current_loops->num;
1102 bool changed = false;
1103 struct loop *loop;
1104 struct tree_niter_desc niter_desc;
1106 /* Do not parallelize loops in the functions created by parallelization. */
1107 if (parallelized_function_p (cfun->decl))
1108 return false;
1110 for (i = 1; i < n; i++)
1112 loop = current_loops->parray[i];
1113 if (!loop
1114 /* Do not bother with loops in cold areas. */
1115 || !maybe_hot_bb_p (loop->header)
1116 /* Or loops that roll too little. */
1117 || expected_loop_iterations (loop) <= n_threads
1118 /* And of course, the loop must be parallelizable. */
1119 || !can_duplicate_loop_p (loop)
1120 || !loop_parallel_p (loop, &niter_desc))
1121 continue;
1123 changed = true;
1124 gen_parallel_loop (loop, n_threads, &niter_desc);
1125 verify_flow_info ();
1126 verify_dominators (CDI_DOMINATORS);
1127 verify_loop_structure (current_loops);
1128 verify_loop_closed_ssa ();
1131 return changed;
1134 #include "gt-tree-parloops.h"