fwprop: Fix single_use_p calculation
[official-gcc.git] / gcc / omp-offload.c
blobba0937fba9405ebc432923b55499691d3b9a9464
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
4 Copyright (C) 2005-2021 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 #include "context.h"
57 /* Describe the OpenACC looping structure of a function. The entire
58 function is held in a 'NULL' loop. */
60 struct oacc_loop
62 oacc_loop *parent; /* Containing loop. */
64 oacc_loop *child; /* First inner loop. */
66 oacc_loop *sibling; /* Next loop within same parent. */
68 location_t loc; /* Location of the loop start. */
70 gcall *marker; /* Initial head marker. */
72 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
73 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
75 tree routine; /* Pseudo-loop enclosing a routine. */
77 unsigned mask; /* Partitioning mask. */
78 unsigned e_mask; /* Partitioning of element loops (when tiling). */
79 unsigned inner; /* Partitioning of inner loops. */
80 unsigned flags; /* Partitioning flags. */
81 vec<gcall *> ifns; /* Contained loop abstraction functions. */
82 tree chunk_size; /* Chunk size. */
83 gcall *head_end; /* Final marker of head sequence. */
86 /* Holds offload tables with decls. */
87 vec<tree, va_gc> *offload_funcs, *offload_vars;
89 /* Return level at which oacc routine may spawn a partitioned loop, or
90 -1 if it is not a routine (i.e. is an offload fn). */
92 int
93 oacc_fn_attrib_level (tree attr)
95 tree pos = TREE_VALUE (attr);
97 if (!TREE_PURPOSE (pos))
98 return -1;
100 int ix = 0;
101 for (ix = 0; ix != GOMP_DIM_MAX;
102 ix++, pos = TREE_CHAIN (pos))
103 if (!integer_zerop (TREE_PURPOSE (pos)))
104 break;
106 return ix;
109 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
110 adds their addresses and sizes to constructor-vector V_CTOR. */
112 static void
113 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
114 vec<constructor_elt, va_gc> *v_ctor)
116 unsigned len = vec_safe_length (v_decls);
117 for (unsigned i = 0; i < len; i++)
119 tree it = (*v_decls)[i];
120 bool is_var = VAR_P (it);
121 bool is_link_var
122 = is_var
123 #ifdef ACCEL_COMPILER
124 && DECL_HAS_VALUE_EXPR_P (it)
125 #endif
126 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
128 /* See also omp_finish_file and output_offload_tables in lto-cgraph.c. */
129 if (!in_lto_p && !symtab_node::get (it))
130 continue;
132 tree size = NULL_TREE;
133 if (is_var)
134 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
136 tree addr;
137 if (!is_link_var)
138 addr = build_fold_addr_expr (it);
139 else
141 #ifdef ACCEL_COMPILER
142 /* For "omp declare target link" vars add address of the pointer to
143 the target table, instead of address of the var. */
144 tree value_expr = DECL_VALUE_EXPR (it);
145 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
146 varpool_node::finalize_decl (link_ptr_decl);
147 addr = build_fold_addr_expr (link_ptr_decl);
148 #else
149 addr = build_fold_addr_expr (it);
150 #endif
152 /* Most significant bit of the size marks "omp declare target link"
153 vars in host and target tables. */
154 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
155 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
156 * BITS_PER_UNIT - 1);
157 size = wide_int_to_tree (const_ptr_type_node, isize);
160 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
161 if (is_var)
162 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
166 /* Return true if DECL is a function for which its references should be
167 analyzed. */
169 static bool
170 omp_declare_target_fn_p (tree decl)
172 return (TREE_CODE (decl) == FUNCTION_DECL
173 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
174 && !lookup_attribute ("omp declare target host",
175 DECL_ATTRIBUTES (decl))
176 && (!flag_openacc
177 || oacc_get_fn_attrib (decl) == NULL_TREE));
180 /* Return true if DECL Is a variable for which its initializer references
181 should be analyzed. */
183 static bool
184 omp_declare_target_var_p (tree decl)
186 return (VAR_P (decl)
187 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
188 && !lookup_attribute ("omp declare target link",
189 DECL_ATTRIBUTES (decl)));
192 /* Helper function for omp_discover_implicit_declare_target, called through
193 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
194 declare target to. */
196 static tree
197 omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
199 if (TREE_CODE (*tp) == CALL_EXPR
200 && CALL_EXPR_FN (*tp)
201 && TREE_CODE (CALL_EXPR_FN (*tp)) == ADDR_EXPR
202 && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp), 0)) == FUNCTION_DECL
203 && lookup_attribute ("omp declare variant base",
204 DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp),
205 0))))
207 tree fn = TREE_OPERAND (CALL_EXPR_FN (*tp), 0);
208 for (tree attr = DECL_ATTRIBUTES (fn); attr; attr = TREE_CHAIN (attr))
210 attr = lookup_attribute ("omp declare variant base", attr);
211 if (attr == NULL_TREE)
212 break;
213 tree purpose = TREE_PURPOSE (TREE_VALUE (attr));
214 if (TREE_CODE (purpose) == FUNCTION_DECL)
215 omp_discover_declare_target_tgt_fn_r (&purpose, walk_subtrees, data);
218 else if (TREE_CODE (*tp) == FUNCTION_DECL)
220 tree decl = *tp;
221 tree id = get_identifier ("omp declare target");
222 symtab_node *node = symtab_node::get (*tp);
223 if (node != NULL)
225 while (node->alias_target
226 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
228 if (!omp_declare_target_fn_p (node->decl)
229 && !lookup_attribute ("omp declare target host",
230 DECL_ATTRIBUTES (node->decl)))
232 node->offloadable = 1;
233 DECL_ATTRIBUTES (node->decl)
234 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
236 node = symtab_node::get (node->alias_target);
238 symtab_node *new_node = node->ultimate_alias_target ();
239 decl = new_node->decl;
240 while (node != new_node)
242 if (!omp_declare_target_fn_p (node->decl)
243 && !lookup_attribute ("omp declare target host",
244 DECL_ATTRIBUTES (node->decl)))
246 node->offloadable = 1;
247 DECL_ATTRIBUTES (node->decl)
248 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
250 gcc_assert (node->alias && node->analyzed);
251 node = node->get_alias_target ();
253 node->offloadable = 1;
254 if (ENABLE_OFFLOADING)
255 g->have_offload = true;
257 if (omp_declare_target_fn_p (decl)
258 || lookup_attribute ("omp declare target host",
259 DECL_ATTRIBUTES (decl)))
260 return NULL_TREE;
262 if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
263 ((vec<tree> *) data)->safe_push (decl);
264 DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
265 DECL_ATTRIBUTES (decl));
267 else if (TYPE_P (*tp))
268 *walk_subtrees = 0;
269 /* else if (TREE_CODE (*tp) == OMP_TARGET)
271 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
272 if (OMP_DEVICE_ANCESTOR (dev))
273 *walk_subtrees = 0;
274 } */
275 return NULL_TREE;
278 /* Similarly, but ignore references outside of OMP_TARGET regions. */
280 static tree
281 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
283 if (TREE_CODE (*tp) == OMP_TARGET)
285 /* And not OMP_DEVICE_ANCESTOR. */
286 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
287 omp_discover_declare_target_tgt_fn_r,
288 data);
289 *walk_subtrees = 0;
291 else if (TYPE_P (*tp))
292 *walk_subtrees = 0;
293 return NULL_TREE;
296 /* Helper function for omp_discover_implicit_declare_target, called through
297 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
298 declare target to. */
300 static tree
301 omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
303 if (TREE_CODE (*tp) == FUNCTION_DECL)
304 return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
305 else if (VAR_P (*tp)
306 && is_global_var (*tp)
307 && !omp_declare_target_var_p (*tp))
309 tree id = get_identifier ("omp declare target");
310 if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
312 error_at (DECL_SOURCE_LOCATION (*tp),
313 "%qD specified both in declare target %<link%> and "
314 "implicitly in %<to%> clauses", *tp);
315 DECL_ATTRIBUTES (*tp)
316 = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
318 if (TREE_STATIC (*tp) && lang_hooks.decls.omp_get_decl_init (*tp))
319 ((vec<tree> *) data)->safe_push (*tp);
320 DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
321 symtab_node *node = symtab_node::get (*tp);
322 if (node != NULL && !node->offloadable)
324 node->offloadable = 1;
325 if (ENABLE_OFFLOADING)
327 g->have_offload = true;
328 if (is_a <varpool_node *> (node))
329 vec_safe_push (offload_vars, node->decl);
333 else if (TYPE_P (*tp))
334 *walk_subtrees = 0;
335 return NULL_TREE;
338 /* Perform the OpenMP implicit declare target to discovery. */
340 void
341 omp_discover_implicit_declare_target (void)
343 cgraph_node *node;
344 varpool_node *vnode;
345 auto_vec<tree> worklist;
347 FOR_EACH_DEFINED_FUNCTION (node)
348 if (DECL_SAVED_TREE (node->decl))
350 struct cgraph_node *cgn;
351 if (omp_declare_target_fn_p (node->decl))
352 worklist.safe_push (node->decl);
353 else if (DECL_STRUCT_FUNCTION (node->decl)
354 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
355 worklist.safe_push (node->decl);
356 for (cgn = first_nested_function (node);
357 cgn; cgn = next_nested_function (cgn))
358 if (omp_declare_target_fn_p (cgn->decl))
359 worklist.safe_push (cgn->decl);
360 else if (DECL_STRUCT_FUNCTION (cgn->decl)
361 && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
362 worklist.safe_push (cgn->decl);
364 FOR_EACH_VARIABLE (vnode)
365 if (lang_hooks.decls.omp_get_decl_init (vnode->decl)
366 && omp_declare_target_var_p (vnode->decl))
367 worklist.safe_push (vnode->decl);
368 while (!worklist.is_empty ())
370 tree decl = worklist.pop ();
371 if (VAR_P (decl))
372 walk_tree_without_duplicates (lang_hooks.decls.omp_get_decl_init (decl),
373 omp_discover_declare_target_var_r,
374 &worklist);
375 else if (omp_declare_target_fn_p (decl))
376 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
377 omp_discover_declare_target_tgt_fn_r,
378 &worklist);
379 else
380 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
381 omp_discover_declare_target_fn_r,
382 &worklist);
385 lang_hooks.decls.omp_finish_decl_inits ();
389 /* Create new symbols containing (address, size) pairs for global variables,
390 marked with "omp declare target" attribute, as well as addresses for the
391 functions, which are outlined offloading regions. */
392 void
393 omp_finish_file (void)
395 unsigned num_funcs = vec_safe_length (offload_funcs);
396 unsigned num_vars = vec_safe_length (offload_vars);
398 if (num_funcs == 0 && num_vars == 0)
399 return;
401 if (targetm_common.have_named_sections)
403 vec<constructor_elt, va_gc> *v_f, *v_v;
404 vec_alloc (v_f, num_funcs);
405 vec_alloc (v_v, num_vars * 2);
407 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
408 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
410 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
411 vec_safe_length (v_v));
412 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
413 num_funcs);
414 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
415 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
416 tree ctor_v = build_constructor (vars_decl_type, v_v);
417 tree ctor_f = build_constructor (funcs_decl_type, v_f);
418 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
419 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
420 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
421 get_identifier (".offload_func_table"),
422 funcs_decl_type);
423 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
424 get_identifier (".offload_var_table"),
425 vars_decl_type);
426 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
427 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
428 otherwise a joint table in a binary will contain padding between
429 tables from multiple object files. */
430 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
431 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
432 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
433 DECL_INITIAL (funcs_decl) = ctor_f;
434 DECL_INITIAL (vars_decl) = ctor_v;
435 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
436 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
438 varpool_node::finalize_decl (vars_decl);
439 varpool_node::finalize_decl (funcs_decl);
441 else
443 for (unsigned i = 0; i < num_funcs; i++)
445 tree it = (*offload_funcs)[i];
446 /* See also add_decls_addresses_to_decl_constructor
447 and output_offload_tables in lto-cgraph.c. */
448 if (!in_lto_p && !symtab_node::get (it))
449 continue;
450 targetm.record_offload_symbol (it);
452 for (unsigned i = 0; i < num_vars; i++)
454 tree it = (*offload_vars)[i];
455 if (!in_lto_p && !symtab_node::get (it))
456 continue;
457 #ifdef ACCEL_COMPILER
458 if (DECL_HAS_VALUE_EXPR_P (it)
459 && lookup_attribute ("omp declare target link",
460 DECL_ATTRIBUTES (it)))
462 tree value_expr = DECL_VALUE_EXPR (it);
463 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
464 targetm.record_offload_symbol (link_ptr_decl);
465 varpool_node::finalize_decl (link_ptr_decl);
467 else
468 #endif
469 targetm.record_offload_symbol (it);
474 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
475 axis DIM. Return a tmp var holding the result. */
477 static tree
478 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
480 tree arg = build_int_cst (unsigned_type_node, dim);
481 tree size = create_tmp_var (integer_type_node);
482 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
483 gimple *call = gimple_build_call_internal (fn, 1, arg);
485 gimple_call_set_lhs (call, size);
486 gimple_seq_add_stmt (seq, call);
488 return size;
491 /* Find the number of threads (POS = false), or thread number (POS =
492 true) for an OpenACC region partitioned as MASK. Setup code
493 required for the calculation is added to SEQ. */
495 static tree
496 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
498 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
499 unsigned ix;
501 /* Start at gang level, and examine relevant dimension indices. */
502 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
503 if (GOMP_DIM_MASK (ix) & mask)
505 if (res)
507 /* We had an outer index, so scale that by the size of
508 this dimension. */
509 tree n = oacc_dim_call (false, ix, seq);
510 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
512 if (pos)
514 /* Determine index in this dimension. */
515 tree id = oacc_dim_call (true, ix, seq);
516 if (res)
517 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
518 else
519 res = id;
523 if (res == NULL_TREE)
524 res = integer_zero_node;
526 return res;
529 /* Transform IFN_GOACC_LOOP calls to actual code. See
530 expand_oacc_for for where these are generated. At the vector
531 level, we stride loops, such that each member of a warp will
532 operate on adjacent iterations. At the worker and gang level,
533 each gang/warp executes a set of contiguous iterations. Chunking
534 can override this such that each iteration engine executes a
535 contiguous chunk, and then moves on to stride to the next chunk. */
537 static void
538 oacc_xform_loop (gcall *call)
540 gimple_stmt_iterator gsi = gsi_for_stmt (call);
541 enum ifn_goacc_loop_kind code
542 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
543 tree dir = gimple_call_arg (call, 1);
544 tree range = gimple_call_arg (call, 2);
545 tree step = gimple_call_arg (call, 3);
546 tree chunk_size = NULL_TREE;
547 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
548 tree lhs = gimple_call_lhs (call);
549 tree type = NULL_TREE;
550 tree diff_type = TREE_TYPE (range);
551 tree r = NULL_TREE;
552 gimple_seq seq = NULL;
553 bool chunking = false, striding = true;
554 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
555 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
557 /* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
558 if (!lhs)
560 gsi_replace_with_seq (&gsi, seq, true);
561 return;
564 type = TREE_TYPE (lhs);
566 #ifdef ACCEL_COMPILER
567 chunk_size = gimple_call_arg (call, 4);
568 if (integer_minus_onep (chunk_size) /* Force static allocation. */
569 || integer_zerop (chunk_size)) /* Default (also static). */
571 /* If we're at the gang level, we want each to execute a
572 contiguous run of iterations. Otherwise we want each element
573 to stride. */
574 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
575 chunking = false;
577 else
579 /* Chunk of size 1 is striding. */
580 striding = integer_onep (chunk_size);
581 chunking = !striding;
583 #endif
585 /* striding=true, chunking=true
586 -> invalid.
587 striding=true, chunking=false
588 -> chunks=1
589 striding=false,chunking=true
590 -> chunks=ceil (range/(chunksize*threads*step))
591 striding=false,chunking=false
592 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
593 push_gimplify_context (true);
595 switch (code)
597 default: gcc_unreachable ();
599 case IFN_GOACC_LOOP_CHUNKS:
600 if (!chunking)
601 r = build_int_cst (type, 1);
602 else
604 /* chunk_max
605 = (range - dir) / (chunks * step * num_threads) + dir */
606 tree per = oacc_thread_numbers (false, mask, &seq);
607 per = fold_convert (type, per);
608 chunk_size = fold_convert (type, chunk_size);
609 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
610 per = fold_build2 (MULT_EXPR, type, per, step);
611 r = build2 (MINUS_EXPR, type, range, dir);
612 r = build2 (PLUS_EXPR, type, r, per);
613 r = build2 (TRUNC_DIV_EXPR, type, r, per);
615 break;
617 case IFN_GOACC_LOOP_STEP:
619 /* If striding, step by the entire compute volume, otherwise
620 step by the inner volume. */
621 unsigned volume = striding ? mask : inner_mask;
623 r = oacc_thread_numbers (false, volume, &seq);
624 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
626 break;
628 case IFN_GOACC_LOOP_OFFSET:
629 /* Enable vectorization on non-SIMT targets. */
630 if (!targetm.simt.vf
631 && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
632 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
633 the loop. */
634 && (flag_tree_loop_vectorize
635 || !global_options_set.x_flag_tree_loop_vectorize))
637 basic_block bb = gsi_bb (gsi);
638 class loop *parent = bb->loop_father;
639 class loop *body = parent->inner;
641 parent->force_vectorize = true;
642 parent->safelen = INT_MAX;
644 /* "Chunking loops" may have inner loops. */
645 if (parent->inner)
647 body->force_vectorize = true;
648 body->safelen = INT_MAX;
651 cfun->has_force_vectorize_loops = true;
653 if (striding)
655 r = oacc_thread_numbers (true, mask, &seq);
656 r = fold_convert (diff_type, r);
658 else
660 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
661 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
662 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
663 inner_size, outer_size);
665 volume = fold_convert (diff_type, volume);
666 if (chunking)
667 chunk_size = fold_convert (diff_type, chunk_size);
668 else
670 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
672 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
673 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
674 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
677 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
678 fold_convert (diff_type, inner_size));
679 r = oacc_thread_numbers (true, outer_mask, &seq);
680 r = fold_convert (diff_type, r);
681 r = build2 (MULT_EXPR, diff_type, r, span);
683 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
684 inner = fold_convert (diff_type, inner);
685 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
687 if (chunking)
689 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
690 tree per
691 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
692 per = build2 (MULT_EXPR, diff_type, per, chunk);
694 r = build2 (PLUS_EXPR, diff_type, r, per);
697 r = fold_build2 (MULT_EXPR, diff_type, r, step);
698 if (type != diff_type)
699 r = fold_convert (type, r);
700 break;
702 case IFN_GOACC_LOOP_BOUND:
703 if (striding)
704 r = range;
705 else
707 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
708 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
709 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
710 inner_size, outer_size);
712 volume = fold_convert (diff_type, volume);
713 if (chunking)
714 chunk_size = fold_convert (diff_type, chunk_size);
715 else
717 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
719 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
720 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
721 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
724 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
725 fold_convert (diff_type, inner_size));
727 r = fold_build2 (MULT_EXPR, diff_type, span, step);
729 tree offset = gimple_call_arg (call, 6);
730 r = build2 (PLUS_EXPR, diff_type, r,
731 fold_convert (diff_type, offset));
732 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
733 diff_type, r, range);
735 if (diff_type != type)
736 r = fold_convert (type, r);
737 break;
740 gimplify_assign (lhs, r, &seq);
742 pop_gimplify_context (NULL);
744 gsi_replace_with_seq (&gsi, seq, true);
747 /* Transform a GOACC_TILE call. Determines the element loop span for
748 the specified loop of the nest. This is 1 if we're not tiling.
750 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
752 static void
753 oacc_xform_tile (gcall *call)
755 gimple_stmt_iterator gsi = gsi_for_stmt (call);
756 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
757 /* Inner loops have higher loop_nos. */
758 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
759 tree tile_size = gimple_call_arg (call, 2);
760 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
761 tree lhs = gimple_call_lhs (call);
762 tree type = TREE_TYPE (lhs);
763 gimple_seq seq = NULL;
764 tree span = build_int_cst (type, 1);
766 gcc_assert (!(e_mask
767 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
768 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
769 push_gimplify_context (!seen_error ());
771 #ifndef ACCEL_COMPILER
772 /* Partitioning disabled on host compilers. */
773 e_mask = 0;
774 #endif
775 if (!e_mask)
776 /* Not paritioning. */
777 span = integer_one_node;
778 else if (!integer_zerop (tile_size))
779 /* User explicitly specified size. */
780 span = tile_size;
781 else
783 /* Pick a size based on the paritioning of the element loop and
784 the number of loop nests. */
785 tree first_size = NULL_TREE;
786 tree second_size = NULL_TREE;
788 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
789 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
790 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
791 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
793 if (!first_size)
795 first_size = second_size;
796 second_size = NULL_TREE;
799 if (loop_no + 1 == collapse)
801 span = first_size;
802 if (!loop_no && second_size)
803 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
804 span, second_size);
806 else if (loop_no + 2 == collapse)
807 span = second_size;
808 else
809 span = NULL_TREE;
811 if (!span)
812 /* There's no obvious element size for this loop. Options
813 are 1, first_size or some non-unity constant (32 is my
814 favourite). We should gather some statistics. */
815 span = first_size;
818 span = fold_convert (type, span);
819 gimplify_assign (lhs, span, &seq);
821 pop_gimplify_context (NULL);
823 gsi_replace_with_seq (&gsi, seq, true);
826 /* Default partitioned and minimum partitioned dimensions. */
828 static int oacc_default_dims[GOMP_DIM_MAX];
829 static int oacc_min_dims[GOMP_DIM_MAX];
832 oacc_get_default_dim (int dim)
834 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
835 return oacc_default_dims[dim];
839 oacc_get_min_dim (int dim)
841 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
842 return oacc_min_dims[dim];
845 /* Parse the default dimension parameter. This is a set of
846 :-separated optional compute dimensions. Each specified dimension
847 is a positive integer. When device type support is added, it is
848 planned to be a comma separated list of such compute dimensions,
849 with all but the first prefixed by the colon-terminated device
850 type. */
852 static void
853 oacc_parse_default_dims (const char *dims)
855 int ix;
857 for (ix = GOMP_DIM_MAX; ix--;)
859 oacc_default_dims[ix] = -1;
860 oacc_min_dims[ix] = 1;
863 #ifndef ACCEL_COMPILER
864 /* Cannot be overridden on the host. */
865 dims = NULL;
866 #endif
867 if (dims)
869 const char *pos = dims;
871 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
873 if (ix)
875 if (*pos != ':')
876 goto malformed;
877 pos++;
880 if (*pos != ':')
882 long val;
883 const char *eptr;
885 errno = 0;
886 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
887 if (errno || val <= 0 || (int) val != val)
888 goto malformed;
889 pos = eptr;
890 oacc_default_dims[ix] = (int) val;
893 if (*pos)
895 malformed:
896 error_at (UNKNOWN_LOCATION,
897 "%<-fopenacc-dim%> operand is malformed at %qs", pos);
901 /* Allow the backend to validate the dimensions. */
902 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
903 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
906 /* Validate and update the dimensions for offloaded FN. ATTRS is the
907 raw attribute. DIMS is an array of dimensions, which is filled in.
908 LEVEL is the partitioning level of a routine, or -1 for an offload
909 region itself. USED is the mask of partitioned execution in the
910 function. */
912 static void
913 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
915 tree purpose[GOMP_DIM_MAX];
916 unsigned ix;
917 tree pos = TREE_VALUE (attrs);
919 /* Make sure the attribute creator attached the dimension
920 information. */
921 gcc_assert (pos);
923 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
925 purpose[ix] = TREE_PURPOSE (pos);
926 tree val = TREE_VALUE (pos);
927 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
928 pos = TREE_CHAIN (pos);
931 bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
933 /* Default anything left to 1 or a partitioned default. */
934 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
935 if (dims[ix] < 0)
937 /* The OpenACC spec says 'If the [num_gangs] clause is not
938 specified, an implementation-defined default will be used;
939 the default may depend on the code within the construct.'
940 (2.5.6). Thus an implementation is free to choose
941 non-unity default for a parallel region that doesn't have
942 any gang-partitioned loops. However, it appears that there
943 is a sufficient body of user code that expects non-gang
944 partitioned regions to not execute in gang-redundant mode.
945 So we (a) don't warn about the non-portability and (b) pick
946 the minimum permissible dimension size when there is no
947 partitioned execution. Otherwise we pick the global
948 default for the dimension, which the user can control. The
949 same wording and logic applies to num_workers and
950 vector_length, however the worker- or vector- single
951 execution doesn't have the same impact as gang-redundant
952 execution. (If the minimum gang-level partioning is not 1,
953 the target is probably too confusing.) */
954 dims[ix] = (used & GOMP_DIM_MASK (ix)
955 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
956 changed = true;
959 if (changed)
961 /* Replace the attribute with new values. */
962 pos = NULL_TREE;
963 for (ix = GOMP_DIM_MAX; ix--;)
964 pos = tree_cons (purpose[ix],
965 build_int_cst (integer_type_node, dims[ix]), pos);
966 oacc_replace_fn_attrib (fn, pos);
970 /* Create an empty OpenACC loop structure at LOC. */
972 static oacc_loop *
973 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
975 oacc_loop *loop = XCNEW (oacc_loop);
977 loop->parent = parent;
979 if (parent)
981 loop->sibling = parent->child;
982 parent->child = loop;
985 loop->loc = loc;
986 return loop;
989 /* Create an outermost, dummy OpenACC loop for offloaded function
990 DECL. */
992 static oacc_loop *
993 new_oacc_loop_outer (tree decl)
995 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
998 /* Start a new OpenACC loop structure beginning at head marker HEAD.
999 Link into PARENT loop. Return the new loop. */
1001 static oacc_loop *
1002 new_oacc_loop (oacc_loop *parent, gcall *marker)
1004 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
1006 loop->marker = marker;
1008 /* TODO: This is where device_type flattening would occur for the loop
1009 flags. */
1011 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
1013 tree chunk_size = integer_zero_node;
1014 if (loop->flags & OLF_GANG_STATIC)
1015 chunk_size = gimple_call_arg (marker, 4);
1016 loop->chunk_size = chunk_size;
1018 return loop;
1021 /* Create a dummy loop encompassing a call to a openACC routine.
1022 Extract the routine's partitioning requirements. */
1024 static void
1025 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
1027 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
1028 int level = oacc_fn_attrib_level (attrs);
1030 gcc_assert (level >= 0);
1032 loop->marker = call;
1033 loop->routine = decl;
1034 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
1035 ^ (GOMP_DIM_MASK (level) - 1));
1038 /* Finish off the current OpenACC loop ending at tail marker TAIL.
1039 Return the parent loop. */
1041 static oacc_loop *
1042 finish_oacc_loop (oacc_loop *loop)
1044 /* If the loop has been collapsed, don't partition it. */
1045 if (loop->ifns.is_empty ())
1046 loop->mask = loop->flags = 0;
1047 return loop->parent;
1050 /* Free all OpenACC loop structures within LOOP (inclusive). */
1052 static void
1053 free_oacc_loop (oacc_loop *loop)
1055 if (loop->sibling)
1056 free_oacc_loop (loop->sibling);
1057 if (loop->child)
1058 free_oacc_loop (loop->child);
1060 loop->ifns.release ();
1061 free (loop);
1064 /* Dump out the OpenACC loop head or tail beginning at FROM. */
1066 static void
1067 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
1068 const char *title, int level)
1070 enum ifn_unique_kind kind
1071 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1073 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
1074 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1076 gimple *stmt = gsi_stmt (gsi);
1078 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1080 enum ifn_unique_kind k
1081 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
1082 (gimple_call_arg (stmt, 0)));
1084 if (k == kind && stmt != from)
1085 break;
1087 print_gimple_stmt (file, stmt, depth * 2 + 2);
1089 gsi_next (&gsi);
1090 while (gsi_end_p (gsi))
1091 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1095 /* Dump OpenACC loop LOOP, its children, and its siblings. */
1097 static void
1098 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
1100 int ix;
1102 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
1103 loop->flags, loop->mask,
1104 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
1106 if (loop->marker)
1107 print_gimple_stmt (file, loop->marker, depth * 2);
1109 if (loop->routine)
1110 fprintf (file, "%*sRoutine %s:%u:%s\n",
1111 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
1112 DECL_SOURCE_LINE (loop->routine),
1113 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
1115 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
1116 if (loop->heads[ix])
1117 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
1118 for (ix = GOMP_DIM_MAX; ix--;)
1119 if (loop->tails[ix])
1120 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
1122 if (loop->child)
1123 dump_oacc_loop (file, loop->child, depth + 1);
1124 if (loop->sibling)
1125 dump_oacc_loop (file, loop->sibling, depth);
1128 void debug_oacc_loop (oacc_loop *);
1130 /* Dump loops to stderr. */
1132 DEBUG_FUNCTION void
1133 debug_oacc_loop (oacc_loop *loop)
1135 dump_oacc_loop (stderr, loop, 0);
1138 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1139 siblings. */
1141 static void
1142 inform_oacc_loop (const oacc_loop *loop)
1144 const char *gang
1145 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1146 const char *worker
1147 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1148 const char *vector
1149 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1150 const char *seq = loop->mask == 0 ? " seq" : "";
1151 const dump_user_location_t loc
1152 = dump_user_location_t::from_location_t (loop->loc);
1153 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1154 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1155 vector, seq);
1157 if (loop->child)
1158 inform_oacc_loop (loop->child);
1159 if (loop->sibling)
1160 inform_oacc_loop (loop->sibling);
1163 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
1164 structures as we go. By construction these loops are properly
1165 nested. */
1167 static void
1168 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
1170 int marker = 0;
1171 int remaining = 0;
1173 if (bb->flags & BB_VISITED)
1174 return;
1176 follow:
1177 bb->flags |= BB_VISITED;
1179 /* Scan for loop markers. */
1180 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1181 gsi_next (&gsi))
1183 gimple *stmt = gsi_stmt (gsi);
1185 if (!is_gimple_call (stmt))
1186 continue;
1188 gcall *call = as_a <gcall *> (stmt);
1190 /* If this is a routine, make a dummy loop for it. */
1191 if (tree decl = gimple_call_fndecl (call))
1192 if (tree attrs = oacc_get_fn_attrib (decl))
1194 gcc_assert (!marker);
1195 new_oacc_loop_routine (loop, call, decl, attrs);
1198 if (!gimple_call_internal_p (call))
1199 continue;
1201 switch (gimple_call_internal_fn (call))
1203 default:
1204 break;
1206 case IFN_GOACC_LOOP:
1207 case IFN_GOACC_TILE:
1208 /* Record the abstraction function, so we can manipulate it
1209 later. */
1210 loop->ifns.safe_push (call);
1211 break;
1213 case IFN_UNIQUE:
1214 enum ifn_unique_kind kind
1215 = (enum ifn_unique_kind) (TREE_INT_CST_LOW
1216 (gimple_call_arg (call, 0)));
1217 if (kind == IFN_UNIQUE_OACC_HEAD_MARK
1218 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
1220 if (gimple_call_num_args (call) == 2)
1222 gcc_assert (marker && !remaining);
1223 marker = 0;
1224 if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
1225 loop = finish_oacc_loop (loop);
1226 else
1227 loop->head_end = call;
1229 else
1231 int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
1233 if (!marker)
1235 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1236 loop = new_oacc_loop (loop, call);
1237 remaining = count;
1239 gcc_assert (count == remaining);
1240 if (remaining)
1242 remaining--;
1243 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1244 loop->heads[marker] = call;
1245 else
1246 loop->tails[remaining] = call;
1248 marker++;
1253 if (remaining || marker)
1255 bb = single_succ (bb);
1256 gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
1257 goto follow;
1260 /* Walk successor blocks. */
1261 edge e;
1262 edge_iterator ei;
1264 FOR_EACH_EDGE (e, ei, bb->succs)
1265 oacc_loop_discover_walk (loop, e->dest);
1268 /* LOOP is the first sibling. Reverse the order in place and return
1269 the new first sibling. Recurse to child loops. */
1271 static oacc_loop *
1272 oacc_loop_sibling_nreverse (oacc_loop *loop)
1274 oacc_loop *last = NULL;
1277 if (loop->child)
1278 loop->child = oacc_loop_sibling_nreverse (loop->child);
1280 oacc_loop *next = loop->sibling;
1281 loop->sibling = last;
1282 last = loop;
1283 loop = next;
1285 while (loop);
1287 return last;
1290 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1291 the current function. */
1293 static oacc_loop *
1294 oacc_loop_discovery ()
1296 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1297 in the following. */
1298 clear_bb_flags ();
1300 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1301 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1303 /* The siblings were constructed in reverse order, reverse them so
1304 that diagnostics come out in an unsurprising order. */
1305 top = oacc_loop_sibling_nreverse (top);
1307 return top;
1310 /* Transform the abstract internal function markers starting at FROM
1311 to be for partitioning level LEVEL. Stop when we meet another HEAD
1312 or TAIL marker. */
1314 static void
1315 oacc_loop_xform_head_tail (gcall *from, int level)
1317 enum ifn_unique_kind kind
1318 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1319 tree replacement = build_int_cst (unsigned_type_node, level);
1321 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1323 gimple *stmt = gsi_stmt (gsi);
1325 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1327 enum ifn_unique_kind k
1328 = ((enum ifn_unique_kind)
1329 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1331 if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
1332 *gimple_call_arg_ptr (stmt, 2) = replacement;
1333 else if (k == kind && stmt != from)
1334 break;
1336 else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1337 *gimple_call_arg_ptr (stmt, 3) = replacement;
1339 gsi_next (&gsi);
1340 while (gsi_end_p (gsi))
1341 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1345 /* Process the discovered OpenACC loops, setting the correct
1346 partitioning level etc. */
1348 static void
1349 oacc_loop_process (oacc_loop *loop)
1351 if (loop->child)
1352 oacc_loop_process (loop->child);
1354 if (loop->mask && !loop->routine)
1356 int ix;
1357 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1358 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1359 tree chunk_arg = loop->chunk_size;
1360 gcall *call;
1362 for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1363 switch (gimple_call_internal_fn (call))
1365 case IFN_GOACC_LOOP:
1367 bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1368 gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1369 if (!is_e)
1370 gimple_call_set_arg (call, 4, chunk_arg);
1372 break;
1374 case IFN_GOACC_TILE:
1375 gimple_call_set_arg (call, 3, mask_arg);
1376 gimple_call_set_arg (call, 4, e_mask_arg);
1377 break;
1379 default:
1380 gcc_unreachable ();
1383 unsigned dim = GOMP_DIM_GANG;
1384 unsigned mask = loop->mask | loop->e_mask;
1385 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1387 while (!(GOMP_DIM_MASK (dim) & mask))
1388 dim++;
1390 oacc_loop_xform_head_tail (loop->heads[ix], dim);
1391 oacc_loop_xform_head_tail (loop->tails[ix], dim);
1393 mask ^= GOMP_DIM_MASK (dim);
1397 if (loop->sibling)
1398 oacc_loop_process (loop->sibling);
1401 /* Walk the OpenACC loop heirarchy checking and assigning the
1402 programmer-specified partitionings. OUTER_MASK is the partitioning
1403 this loop is contained within. Return mask of partitioning
1404 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1405 bit. */
1407 static unsigned
1408 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1410 unsigned this_mask = loop->mask;
1411 unsigned mask_all = 0;
1412 bool noisy = true;
1414 #ifdef ACCEL_COMPILER
1415 /* When device_type is supported, we want the device compiler to be
1416 noisy, if the loop parameters are device_type-specific. */
1417 noisy = false;
1418 #endif
1420 if (!loop->routine)
1422 bool auto_par = (loop->flags & OLF_AUTO) != 0;
1423 bool seq_par = (loop->flags & OLF_SEQ) != 0;
1424 bool tiling = (loop->flags & OLF_TILE) != 0;
1426 this_mask = ((loop->flags >> OLF_DIM_BASE)
1427 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1429 /* Apply auto partitioning if this is a non-partitioned regular
1430 loop, or (no more than) single axis tiled loop. */
1431 bool maybe_auto
1432 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
1434 if ((this_mask != 0) + auto_par + seq_par > 1)
1436 if (noisy)
1437 error_at (loop->loc,
1438 seq_par
1439 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1440 : G_("%<auto%> conflicts with other OpenACC loop "
1441 "specifiers"));
1442 maybe_auto = false;
1443 loop->flags &= ~OLF_AUTO;
1444 if (seq_par)
1446 loop->flags
1447 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1448 this_mask = 0;
1452 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1454 loop->flags |= OLF_AUTO;
1455 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
1459 if (this_mask & outer_mask)
1461 const oacc_loop *outer;
1462 for (outer = loop->parent; outer; outer = outer->parent)
1463 if ((outer->mask | outer->e_mask) & this_mask)
1464 break;
1466 if (noisy)
1468 if (outer)
1470 error_at (loop->loc,
1471 loop->routine
1472 ? G_("routine call uses same OpenACC parallelism"
1473 " as containing loop")
1474 : G_("inner loop uses same OpenACC parallelism"
1475 " as containing loop"));
1476 inform (outer->loc, "containing loop here");
1478 else
1479 error_at (loop->loc,
1480 loop->routine
1481 ? G_("routine call uses OpenACC parallelism disallowed"
1482 " by containing routine")
1483 : G_("loop uses OpenACC parallelism disallowed"
1484 " by containing routine"));
1486 if (loop->routine)
1487 inform (DECL_SOURCE_LOCATION (loop->routine),
1488 "routine %qD declared here", loop->routine);
1490 this_mask &= ~outer_mask;
1492 else
1494 unsigned outermost = least_bit_hwi (this_mask);
1496 if (outermost && outermost <= outer_mask)
1498 if (noisy)
1500 error_at (loop->loc,
1501 "incorrectly nested OpenACC loop parallelism");
1503 const oacc_loop *outer;
1504 for (outer = loop->parent;
1505 outer->flags && outer->flags < outermost;
1506 outer = outer->parent)
1507 continue;
1508 inform (outer->loc, "containing loop here");
1511 this_mask &= ~outermost;
1515 mask_all |= this_mask;
1517 if (loop->flags & OLF_TILE)
1519 /* When tiling, vector goes to the element loop, and failing
1520 that we put worker there. The std doesn't contemplate
1521 specifying all three. We choose to put worker and vector on
1522 the element loops in that case. */
1523 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1524 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1525 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1527 loop->e_mask = this_e_mask;
1528 this_mask ^= this_e_mask;
1531 loop->mask = this_mask;
1533 if (dump_file)
1534 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1535 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1536 loop->mask, loop->e_mask);
1538 if (loop->child)
1540 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1541 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1542 mask_all |= loop->inner;
1545 if (loop->sibling)
1546 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1548 return mask_all;
1551 /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
1552 OUTER_MASK is the partitioning this loop is contained within.
1553 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1554 Return the cumulative partitioning used by this loop, siblings and
1555 children. */
1557 static unsigned
1558 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1559 bool outer_assign)
1561 bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1562 bool noisy = true;
1563 bool tiling = loop->flags & OLF_TILE;
1565 #ifdef ACCEL_COMPILER
1566 /* When device_type is supported, we want the device compiler to be
1567 noisy, if the loop parameters are device_type-specific. */
1568 noisy = false;
1569 #endif
1571 if (assign && (!outer_assign || loop->inner))
1573 /* Allocate outermost and non-innermost loops at the outermost
1574 non-innermost available level. */
1575 unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1577 /* Find the first outermost available partition. */
1578 while (this_mask <= outer_mask)
1579 this_mask <<= 1;
1581 /* Grab two axes if tiling, and we've not assigned anything */
1582 if (tiling && !(loop->mask | loop->e_mask))
1583 this_mask |= this_mask << 1;
1585 /* Prohibit the innermost partitioning at the moment. */
1586 this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1588 /* Don't use any dimension explicitly claimed by an inner loop. */
1589 this_mask &= ~loop->inner;
1591 if (tiling && !loop->e_mask)
1593 /* If we got two axes, allocate the inner one to the element
1594 loop. */
1595 loop->e_mask = this_mask & (this_mask << 1);
1596 this_mask ^= loop->e_mask;
1599 loop->mask |= this_mask;
1602 if (loop->child)
1604 unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1605 loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1606 outer_assign | assign);
1609 if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1611 /* Allocate the loop at the innermost available level. Note
1612 that we do this even if we already assigned this loop the
1613 outermost available level above. That way we'll partition
1614 this along 2 axes, if they are available. */
1615 unsigned this_mask = 0;
1617 /* Determine the outermost partitioning used within this loop. */
1618 this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1619 this_mask = least_bit_hwi (this_mask);
1621 /* Pick the partitioning just inside that one. */
1622 this_mask >>= 1;
1624 /* And avoid picking one use by an outer loop. */
1625 this_mask &= ~outer_mask;
1627 /* If tiling and we failed completely above, grab the next one
1628 too. Making sure it doesn't hit an outer loop. */
1629 if (tiling)
1631 this_mask &= ~(loop->e_mask | loop->mask);
1632 unsigned tile_mask = ((this_mask >> 1)
1633 & ~(outer_mask | loop->e_mask | loop->mask));
1635 if (tile_mask || loop->mask)
1637 loop->e_mask |= this_mask;
1638 this_mask = tile_mask;
1640 if (!loop->e_mask && noisy)
1641 warning_at (loop->loc, 0,
1642 "insufficient partitioning available"
1643 " to parallelize element loop");
1646 loop->mask |= this_mask;
1647 if (!loop->mask && noisy)
1648 warning_at (loop->loc, 0,
1649 tiling
1650 ? G_("insufficient partitioning available"
1651 " to parallelize tile loop")
1652 : G_("insufficient partitioning available"
1653 " to parallelize loop"));
1656 if (assign && dump_file)
1657 fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1658 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1659 loop->mask, loop->e_mask);
1661 unsigned inner_mask = 0;
1663 if (loop->sibling)
1664 inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1665 outer_mask, outer_assign);
1667 inner_mask |= loop->inner | loop->mask | loop->e_mask;
1669 return inner_mask;
1672 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1673 axes. Return mask of partitioning. */
1675 static unsigned
1676 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1678 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1680 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1682 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1683 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1685 return mask_all;
1688 /* Default fork/join early expander. Delete the function calls if
1689 there is no RTL expander. */
1691 bool
1692 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1693 const int *ARG_UNUSED (dims), bool is_fork)
1695 if (is_fork)
1696 return targetm.have_oacc_fork ();
1697 else
1698 return targetm.have_oacc_join ();
1701 /* Default goacc.reduction early expander.
1703 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1704 If RES_PTR is not integer-zerop:
1705 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1706 TEARDOWN - emit '*RES_PTR = VAR'
1707 If LHS is not NULL
1708 emit 'LHS = VAR' */
1710 void
1711 default_goacc_reduction (gcall *call)
1713 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1714 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1715 tree lhs = gimple_call_lhs (call);
1716 tree var = gimple_call_arg (call, 2);
1717 gimple_seq seq = NULL;
1719 if (code == IFN_GOACC_REDUCTION_SETUP
1720 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1722 /* Setup and Teardown need to copy from/to the receiver object,
1723 if there is one. */
1724 tree ref_to_res = gimple_call_arg (call, 1);
1726 if (!integer_zerop (ref_to_res))
1728 tree dst = build_simple_mem_ref (ref_to_res);
1729 tree src = var;
1731 if (code == IFN_GOACC_REDUCTION_SETUP)
1733 src = dst;
1734 dst = lhs;
1735 lhs = NULL;
1737 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1741 /* Copy VAR to LHS, if there is an LHS. */
1742 if (lhs)
1743 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1745 gsi_replace_with_seq (&gsi, seq, true);
1748 /* Main entry point for oacc transformations which run on the device
1749 compiler after LTO, so we know what the target device is at this
1750 point (including the host fallback). */
1752 static unsigned int
1753 execute_oacc_device_lower ()
1755 tree attrs = oacc_get_fn_attrib (current_function_decl);
1757 if (!attrs)
1758 /* Not an offloaded function. */
1759 return 0;
1761 /* Parse the default dim argument exactly once. */
1762 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1764 oacc_parse_default_dims (flag_openacc_dims);
1765 flag_openacc_dims = (char *)&flag_openacc_dims;
1768 bool is_oacc_parallel
1769 = (lookup_attribute ("oacc parallel",
1770 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1771 bool is_oacc_kernels
1772 = (lookup_attribute ("oacc kernels",
1773 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1774 bool is_oacc_serial
1775 = (lookup_attribute ("oacc serial",
1776 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1777 bool is_oacc_parallel_kernels_parallelized
1778 = (lookup_attribute ("oacc parallel_kernels_parallelized",
1779 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1780 bool is_oacc_parallel_kernels_gang_single
1781 = (lookup_attribute ("oacc parallel_kernels_gang_single",
1782 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1783 int fn_level = oacc_fn_attrib_level (attrs);
1784 bool is_oacc_routine = (fn_level >= 0);
1785 gcc_checking_assert (is_oacc_parallel
1786 + is_oacc_kernels
1787 + is_oacc_serial
1788 + is_oacc_parallel_kernels_parallelized
1789 + is_oacc_parallel_kernels_gang_single
1790 + is_oacc_routine
1791 == 1);
1793 bool is_oacc_kernels_parallelized
1794 = (lookup_attribute ("oacc kernels parallelized",
1795 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1796 if (is_oacc_kernels_parallelized)
1797 gcc_checking_assert (is_oacc_kernels);
1799 if (dump_file)
1801 if (is_oacc_parallel)
1802 fprintf (dump_file, "Function is OpenACC parallel offload\n");
1803 else if (is_oacc_kernels)
1804 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1805 (is_oacc_kernels_parallelized
1806 ? "parallelized" : "unparallelized"));
1807 else if (is_oacc_serial)
1808 fprintf (dump_file, "Function is OpenACC serial offload\n");
1809 else if (is_oacc_parallel_kernels_parallelized)
1810 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1811 "parallel_kernels_parallelized");
1812 else if (is_oacc_parallel_kernels_gang_single)
1813 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1814 "parallel_kernels_gang_single");
1815 else if (is_oacc_routine)
1816 fprintf (dump_file, "Function is OpenACC routine level %d\n",
1817 fn_level);
1818 else
1819 gcc_unreachable ();
1822 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
1823 kernels, so remove the parallelism dimensions function attributes
1824 potentially set earlier on. */
1825 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
1827 oacc_set_fn_attrib (current_function_decl, NULL, NULL);
1828 attrs = oacc_get_fn_attrib (current_function_decl);
1831 /* Discover, partition and process the loops. */
1832 oacc_loop *loops = oacc_loop_discovery ();
1834 unsigned outer_mask = 0;
1835 if (is_oacc_routine)
1836 outer_mask = GOMP_DIM_MASK (fn_level) - 1;
1837 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
1838 /* OpenACC kernels constructs are special: they currently don't use the
1839 generic oacc_loop infrastructure and attribute/dimension processing. */
1840 if (is_oacc_kernels && is_oacc_kernels_parallelized)
1842 /* Parallelized OpenACC kernels constructs use gang parallelism. See
1843 also tree-parloops.c:create_parallel_loop. */
1844 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
1847 int dims[GOMP_DIM_MAX];
1848 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
1850 if (dump_file)
1852 const char *comma = "Compute dimensions [";
1853 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
1854 fprintf (dump_file, "%s%d", comma, dims[ix]);
1855 fprintf (dump_file, "]\n");
1858 /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
1859 a single gang only. */
1860 if (is_oacc_parallel_kernels_gang_single)
1861 gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);
1863 oacc_loop_process (loops);
1864 if (dump_file)
1866 fprintf (dump_file, "OpenACC loops\n");
1867 dump_oacc_loop (dump_file, loops, 0);
1868 fprintf (dump_file, "\n");
1870 if (dump_enabled_p ())
1872 oacc_loop *l = loops;
1873 /* OpenACC kernels constructs are special: they currently don't use the
1874 generic oacc_loop infrastructure. */
1875 if (is_oacc_kernels)
1877 /* Create a fake oacc_loop for diagnostic purposes. */
1878 l = new_oacc_loop_raw (NULL,
1879 DECL_SOURCE_LOCATION (current_function_decl));
1880 l->mask = used_mask;
1882 else
1884 /* Skip the outermost, dummy OpenACC loop */
1885 l = l->child;
1887 if (l)
1888 inform_oacc_loop (l);
1889 if (is_oacc_kernels)
1890 free_oacc_loop (l);
1893 /* Offloaded targets may introduce new basic blocks, which require
1894 dominance information to update SSA. */
1895 calculate_dominance_info (CDI_DOMINATORS);
1897 /* Now lower internal loop functions to target-specific code
1898 sequences. */
1899 basic_block bb;
1900 FOR_ALL_BB_FN (bb, cfun)
1901 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1903 gimple *stmt = gsi_stmt (gsi);
1904 if (!is_gimple_call (stmt))
1906 gsi_next (&gsi);
1907 continue;
1910 gcall *call = as_a <gcall *> (stmt);
1911 if (!gimple_call_internal_p (call))
1913 gsi_next (&gsi);
1914 continue;
1917 /* Rewind to allow rescan. */
1918 gsi_prev (&gsi);
1919 bool rescan = false, remove = false;
1920 enum internal_fn ifn_code = gimple_call_internal_fn (call);
1922 switch (ifn_code)
1924 default: break;
1926 case IFN_GOACC_TILE:
1927 oacc_xform_tile (call);
1928 rescan = true;
1929 break;
1931 case IFN_GOACC_LOOP:
1932 oacc_xform_loop (call);
1933 rescan = true;
1934 break;
1936 case IFN_GOACC_REDUCTION:
1937 /* Mark the function for SSA renaming. */
1938 mark_virtual_operands_for_renaming (cfun);
1940 /* If the level is -1, this ended up being an unused
1941 axis. Handle as a default. */
1942 if (integer_minus_onep (gimple_call_arg (call, 3)))
1943 default_goacc_reduction (call);
1944 else
1945 targetm.goacc.reduction (call);
1946 rescan = true;
1947 break;
1949 case IFN_UNIQUE:
1951 enum ifn_unique_kind kind
1952 = ((enum ifn_unique_kind)
1953 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1955 switch (kind)
1957 default:
1958 break;
1960 case IFN_UNIQUE_OACC_FORK:
1961 case IFN_UNIQUE_OACC_JOIN:
1962 if (integer_minus_onep (gimple_call_arg (call, 2)))
1963 remove = true;
1964 else if (!targetm.goacc.fork_join
1965 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
1966 remove = true;
1967 break;
1969 case IFN_UNIQUE_OACC_HEAD_MARK:
1970 case IFN_UNIQUE_OACC_TAIL_MARK:
1971 remove = true;
1972 break;
1974 break;
1978 if (gsi_end_p (gsi))
1979 /* We rewound past the beginning of the BB. */
1980 gsi = gsi_start_bb (bb);
1981 else
1982 /* Undo the rewind. */
1983 gsi_next (&gsi);
1985 if (remove)
1987 if (gimple_vdef (call))
1988 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
1989 if (gimple_call_lhs (call))
1991 /* Propagate the data dependency var. */
1992 gimple *ass = gimple_build_assign (gimple_call_lhs (call),
1993 gimple_call_arg (call, 1));
1994 gsi_replace (&gsi, ass, false);
1996 else
1997 gsi_remove (&gsi, true);
1999 else if (!rescan)
2000 /* If not rescanning, advance over the call. */
2001 gsi_next (&gsi);
2004 free_oacc_loop (loops);
2006 return 0;
2009 /* Default launch dimension validator. Force everything to 1. A
2010 backend that wants to provide larger dimensions must override this
2011 hook. */
2013 bool
2014 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
2015 int ARG_UNUSED (fn_level),
2016 unsigned ARG_UNUSED (used))
2018 bool changed = false;
2020 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
2022 if (dims[ix] != 1)
2024 dims[ix] = 1;
2025 changed = true;
2029 return changed;
2032 /* Default dimension bound is unknown on accelerator and 1 on host. */
2035 default_goacc_dim_limit (int ARG_UNUSED (axis))
2037 #ifdef ACCEL_COMPILER
2038 return 0;
2039 #else
2040 return 1;
2041 #endif
2044 namespace {
2046 const pass_data pass_data_oacc_device_lower =
2048 GIMPLE_PASS, /* type */
2049 "oaccdevlow", /* name */
2050 OPTGROUP_OMP, /* optinfo_flags */
2051 TV_NONE, /* tv_id */
2052 PROP_cfg, /* properties_required */
2053 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
2054 0, /* properties_destroyed */
2055 0, /* todo_flags_start */
2056 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
2059 class pass_oacc_device_lower : public gimple_opt_pass
2061 public:
2062 pass_oacc_device_lower (gcc::context *ctxt)
2063 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
2066 /* opt_pass methods: */
2067 virtual bool gate (function *) { return flag_openacc; };
2069 virtual unsigned int execute (function *)
2071 return execute_oacc_device_lower ();
2074 }; // class pass_oacc_device_lower
2076 } // anon namespace
2078 gimple_opt_pass *
2079 make_pass_oacc_device_lower (gcc::context *ctxt)
2081 return new pass_oacc_device_lower (ctxt);
2085 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
2086 GOMP_SIMT_ENTER call identifying the privatized variables, which are
2087 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
2088 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
2090 static void
2091 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
2093 gimple *alloc_stmt = gsi_stmt (*gsi);
2094 tree simtrec = gimple_call_lhs (alloc_stmt);
2095 tree simduid = gimple_call_arg (alloc_stmt, 0);
2096 gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
2097 gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
2098 tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
2099 TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
2100 TREE_ADDRESSABLE (rectype) = 1;
2101 TREE_TYPE (simtrec) = build_pointer_type (rectype);
2102 for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
2104 tree *argp = gimple_call_arg_ptr (enter_stmt, i);
2105 if (*argp == null_pointer_node)
2106 continue;
2107 gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
2108 && VAR_P (TREE_OPERAND (*argp, 0)));
2109 tree var = TREE_OPERAND (*argp, 0);
2111 tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
2112 DECL_NAME (var), TREE_TYPE (var));
2113 SET_DECL_ALIGN (field, DECL_ALIGN (var));
2114 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
2115 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
2117 insert_field_into_struct (rectype, field);
2119 tree t = build_simple_mem_ref (simtrec);
2120 t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
2121 TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
2122 SET_DECL_VALUE_EXPR (var, t);
2123 DECL_HAS_VALUE_EXPR_P (var) = 1;
2124 *regimplify = true;
2126 layout_type (rectype);
2127 tree size = TYPE_SIZE_UNIT (rectype);
2128 tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
2130 alloc_stmt
2131 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
2132 gimple_call_set_lhs (alloc_stmt, simtrec);
2133 gsi_replace (gsi, alloc_stmt, false);
2134 gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
2135 enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
2136 gsi_replace (&enter_gsi, enter_stmt, false);
2138 use_operand_p use;
2139 gimple *exit_stmt;
2140 if (single_imm_use (simtrec, &use, &exit_stmt))
2142 gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
2143 gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
2144 tree clobber = build_clobber (rectype);
2145 exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
2146 gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
2148 else
2149 gcc_checking_assert (has_zero_uses (simtrec));
2152 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
2154 static tree
2155 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2157 tree t = *tp;
2159 if (VAR_P (t)
2160 && DECL_HAS_VALUE_EXPR_P (t)
2161 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
2163 *walk_subtrees = 0;
2164 return t;
2166 return NULL_TREE;
/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
   LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
   internal functions on non-SIMT targets, and likewise some SIMD internal
   functions on SIMT targets.  */

static unsigned int
execute_omp_device_lower ()
{
  /* SIMT vectorization factor of the target; 1 when the target hook is
     not provided, i.e. on non-SIMT targets.  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  /* Set by ompdevlow_adjust_simt_enter when statements referencing
     SIMT-privatized variables may need regimplification below.  */
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  bool calls_declare_variant_alt
    = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  continue;
	if (!gimple_call_internal_p (stmt))
	  {
	    /* A non-internal call: the only work here is redirecting calls
	       to "declare variant" alternatives to the resolved variant,
	       and only when this function is known to make such calls.  */
	    if (calls_declare_variant_alt)
	      if (tree fndecl = gimple_call_fndecl (stmt))
		{
		  tree new_fndecl = omp_resolve_declare_variant (fndecl);
		  if (new_fndecl != fndecl)
		    {
		      gimple_call_set_fndecl (stmt, new_fndecl);
		      update_stmt (stmt);
		    }
		}
	    continue;
	  }
	/* RHS left as NULL_TREE means "keep the call"; otherwise the call
	   is replaced by LHS = RHS (or a nop when there is no LHS).  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    /* Folds to 0 on non-SIMT targets, 1 on SIMT targets.  */
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    /* On non-SIMT targets the call reduces to its first argument.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    if (vf != 1)
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    /* On non-SIMT targets no SIMT register file storage is needed,
	       so the allocation yields a null pointer.  */
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    /* On SIMT targets, keep enter/exit calls for RTL expansion.  */
	    if (vf != 1)
	      continue;
	    /* Drop the virtual definition before replacing the call with
	       a non-memory statement below.  */
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    /* Lane number is 0 when there is a single lane; otherwise the
	       call stays for RTL expansion.  */
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    /* VF is a compile-time constant either way.  */
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    /* With a single lane these reduce to their first argument.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    /* Conversely, SIMD placeholders are folded on SIMT targets.  */
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	/* A call whose result is still needed but has no replacement value
	   must be kept as-is.  */
	if (lhs && !rhs)
	  continue;
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  /* Second sweep: remove clobbers of SIMT-privatized variables and
     regimplify other statements mentioning them, so their value
     expressions are expanded.  */
  if (regimplify)
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
namespace {

/* Pass descriptor for the SIMT/SIMD device lowering pass below.  */

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run when the function has not yet been device-lowered, or — with
     OpenMP enabled — when it still calls "declare variant" alternatives
     that need resolving.  */
  virtual bool gate (function *fun)
    {
      return (!(fun->curr_properties & PROP_gimple_lomp_dev)
	      || (flag_openmp
		  && cgraph_node::get (fun->decl)->calls_declare_variant_alt));
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }
}; // class pass_omp_device_lower
2308 } // anon namespace
2310 gimple_opt_pass *
2311 make_pass_omp_device_lower (gcc::context *ctxt)
2313 return new pass_omp_device_lower (ctxt);
/* "omp declare target link" handling pass.  */

namespace {

/* Pass descriptor for the "omp declare target link" pass below.  */

const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS, /* type */
  "omptargetlink", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  /* Only relevant when compiling for an offload device, and there only
     for functions that are actually offloaded.  */
  virtual bool gate (function *fun)
    {
#ifdef ACCEL_COMPILER
      return offloading_function_p (fun->decl);
#else
      (void) fun;
      return false;
#endif
    }

  virtual unsigned execute (function *);
};
2354 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2356 static tree
2357 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2359 tree t = *tp;
2361 if (VAR_P (t)
2362 && DECL_HAS_VALUE_EXPR_P (t)
2363 && is_global_var (t)
2364 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2366 *walk_subtrees = 0;
2367 return t;
2370 return NULL_TREE;
2373 unsigned
2374 pass_omp_target_link::execute (function *fun)
2376 basic_block bb;
2377 FOR_EACH_BB_FN (bb, fun)
2379 gimple_stmt_iterator gsi;
2380 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2381 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2382 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2385 return 0;
2388 } // anon namespace
2390 gimple_opt_pass *
2391 make_pass_omp_target_link (gcc::context *ctxt)
2393 return new pass_omp_target_link (ctxt);