[Ada] Fix internal error on bit-aligned component of function call
[official-gcc.git] / gcc / omp-offload.c
blob590007b943c090b7606a4e23889035c08f213a08
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
4 Copyright (C) 2005-2020 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 #include "context.h"
57 /* Describe the OpenACC looping structure of a function. The entire
58 function is held in a 'NULL' loop. */
/* Loops form a tree: PARENT/CHILD link the nesting, SIBLING chains loops
   that share a parent (see new_oacc_loop_raw, which pushes onto the head
   of the parent's child list).  */
60 struct oacc_loop
62 oacc_loop *parent; /* Containing loop. */
64 oacc_loop *child; /* First inner loop. */
66 oacc_loop *sibling; /* Next loop within same parent. */
68 location_t loc; /* Location of the loop start. */
70 gcall *marker; /* Initial head marker. */
72 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
73 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
75 tree routine; /* Pseudo-loop enclosing a routine. */
77 unsigned mask; /* Partitioning mask. */
78 unsigned e_mask; /* Partitioning of element loops (when tiling). */
79 unsigned inner; /* Partitioning of inner loops. */
80 unsigned flags; /* Partitioning flags (OLF_* values; see new_oacc_loop). */
81 vec<gcall *> ifns; /* Contained loop abstraction functions. */
82 tree chunk_size; /* Chunk size. */
83 gcall *head_end; /* Final marker of head sequence. */
86 /* Holds offload tables with decls. */
/* Populated during declare-target discovery/lowering and consumed by
   omp_finish_file, which emits them into the offload function/variable
   table sections (or hands them to targetm.record_offload_symbol).  */
87 vec<tree, va_gc> *offload_funcs, *offload_vars;
89 /* Return level at which oacc routine may spawn a partitioned loop, or
90 -1 if it is not a routine (i.e. is an offload fn). */
92 int
93 oacc_fn_attrib_level (tree attr)
95 tree pos = TREE_VALUE (attr);
97 if (!TREE_PURPOSE (pos))
98 return -1;
100 int ix = 0;
101 for (ix = 0; ix != GOMP_DIM_MAX;
102 ix++, pos = TREE_CHAIN (pos))
103 if (!integer_zerop (TREE_PURPOSE (pos)))
104 break;
106 return ix;
109 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
110 adds their addresses and sizes to constructor-vector V_CTOR. */
112 static void
113 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
114 vec<constructor_elt, va_gc> *v_ctor)
116 unsigned len = vec_safe_length (v_decls);
117 for (unsigned i = 0; i < len; i++)
119 tree it = (*v_decls)[i];
120 bool is_var = VAR_P (it);
/* On the accelerator compiler a "link" var must additionally have been
   given a VALUE_EXPR (redirecting it through a pointer).  */
121 bool is_link_var
122 = is_var
123 #ifdef ACCEL_COMPILER
124 && DECL_HAS_VALUE_EXPR_P (it)
125 #endif
126 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
128 /* See also omp_finish_file and output_offload_tables in lto-cgraph.c. */
/* Skip decls that never made it into the symbol table (e.g. optimized
   away) in a non-LTO compile.  */
129 if (!in_lto_p && !symtab_node::get (it))
130 continue;
/* Functions contribute only an address; variables an (address, size)
   pair.  */
132 tree size = NULL_TREE;
133 if (is_var)
134 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
136 tree addr;
137 if (!is_link_var)
138 addr = build_fold_addr_expr (it);
139 else
141 #ifdef ACCEL_COMPILER
142 /* For "omp declare target link" vars add address of the pointer to
143 the target table, instead of address of the var. */
144 tree value_expr = DECL_VALUE_EXPR (it);
145 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
146 varpool_node::finalize_decl (link_ptr_decl);
147 addr = build_fold_addr_expr (link_ptr_decl);
148 #else
149 addr = build_fold_addr_expr (it);
150 #endif
/* (Still inside the "omp declare target link" branch.)  */
152 /* Most significant bit of the size marks "omp declare target link"
153 vars in host and target tables. */
154 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
155 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
156 * BITS_PER_UNIT - 1);
157 size = wide_int_to_tree (const_ptr_type_node, isize);
160 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
161 if (is_var)
162 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
166 /* Return true if DECL is a function for which its references should be
167 analyzed. */
169 static bool
170 omp_declare_target_fn_p (tree decl)
172 return (TREE_CODE (decl) == FUNCTION_DECL
173 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
174 && !lookup_attribute ("omp declare target host",
175 DECL_ATTRIBUTES (decl))
176 && (!flag_openacc
177 || oacc_get_fn_attrib (decl) == NULL_TREE));
180 /* Return true if DECL Is a variable for which its initializer references
181 should be analyzed. */
183 static bool
184 omp_declare_target_var_p (tree decl)
186 return (VAR_P (decl)
187 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
188 && !lookup_attribute ("omp declare target link",
189 DECL_ATTRIBUTES (decl)));
192 /* Helper function for omp_discover_implicit_declare_target, called through
193 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
194 declare target to. */
/* DATA is the discovery worklist (a vec<tree> *); newly marked defined
   functions are pushed onto it so their bodies get scanned too.  */
196 static tree
197 omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
199 if (TREE_CODE (*tp) == FUNCTION_DECL)
201 tree decl = *tp;
202 tree id = get_identifier ("omp declare target");
203 symtab_node *node = symtab_node::get (*tp);
204 if (node != NULL)
/* First follow any chain of FUNCTION_DECL alias targets, marking each
   alias on the way.  */
206 while (node->alias_target
207 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
209 if (!omp_declare_target_fn_p (node->decl)
210 && !lookup_attribute ("omp declare target host",
211 DECL_ATTRIBUTES (node->decl)))
213 node->offloadable = 1;
214 DECL_ATTRIBUTES (node->decl)
215 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
217 node = symtab_node::get (node->alias_target);
/* Then walk to the ultimate alias target, marking every node on the
   chain; DECL becomes the ultimate target's decl.  */
219 symtab_node *new_node = node->ultimate_alias_target ();
220 decl = new_node->decl;
221 while (node != new_node)
223 if (!omp_declare_target_fn_p (node->decl)
224 && !lookup_attribute ("omp declare target host",
225 DECL_ATTRIBUTES (node->decl)))
227 node->offloadable = 1;
228 DECL_ATTRIBUTES (node->decl)
229 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
231 gcc_assert (node->alias && node->analyzed);
232 node = node->get_alias_target ();
234 node->offloadable = 1;
235 if (ENABLE_OFFLOADING)
236 g->have_offload = true;
/* Nothing further to do if DECL is already declare target (or is
   host-only).  */
238 if (omp_declare_target_fn_p (decl)
239 || lookup_attribute ("omp declare target host",
240 DECL_ATTRIBUTES (decl)))
241 return NULL_TREE;
/* Queue defined functions so their bodies are discovered as well.  */
243 if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
244 ((vec<tree> *) data)->safe_push (decl);
245 DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
246 DECL_ATTRIBUTES (decl));
248 else if (TYPE_P (*tp))
249 *walk_subtrees = 0;
250 /* else if (TREE_CODE (*tp) == OMP_TARGET)
252 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
253 if (OMP_DEVICE_ANCESTOR (dev))
254 *walk_subtrees = 0;
255 } */
256 return NULL_TREE;
259 /* Similarly, but ignore references outside of OMP_TARGET regions. */
261 static tree
262 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
264 if (TREE_CODE (*tp) == OMP_TARGET)
266 /* And not OMP_DEVICE_ANCESTOR. */
267 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
268 omp_discover_declare_target_tgt_fn_r,
269 data);
270 *walk_subtrees = 0;
272 else if (TYPE_P (*tp))
273 *walk_subtrees = 0;
274 return NULL_TREE;
277 /* Helper function for omp_discover_implicit_declare_target, called through
278 walk_tree. Mark referenced global VAR_DECLs (and, via the fn walker,
279 FUNCTION_DECLs) implicitly as declare target to. */
281 static tree
282 omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
284 if (TREE_CODE (*tp) == FUNCTION_DECL)
285 return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
286 else if (VAR_P (*tp)
287 && is_global_var (*tp)
288 && !omp_declare_target_var_p (*tp))
290 tree id = get_identifier ("omp declare target")
/* A variable cannot be both "link" and implicitly "to"; diagnose and
   drop the link attribute.  */
291 if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
293 error_at (DECL_SOURCE_LOCATION (*tp),
294 "%qD specified both in declare target %<link%> and "
295 "implicitly in %<to%> clauses", *tp);
296 DECL_ATTRIBUTES (*tp)
297 = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
/* Queue initialized statics so their initializers are scanned too.  */
299 if (TREE_STATIC (*tp) && DECL_INITIAL (*tp))
300 ((vec<tree> *) data)->safe_push (*tp);
301 DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
302 symtab_node *node = symtab_node::get (*tp);
303 if (node != NULL && !node->offloadable)
305 node->offloadable = 1;
306 if (ENABLE_OFFLOADING)
308 g->have_offload = true;
/* Record the variable for the offload table emitted by
   omp_finish_file.  */
309 if (is_a <varpool_node *> (node))
310 vec_safe_push (offload_vars, node->decl);
314 else if (TYPE_P (*tp))
315 *walk_subtrees = 0;
316 return NULL_TREE;
319 /* Perform the OpenMP implicit declare target to discovery. */
321 void
322 omp_discover_implicit_declare_target (void)
324 cgraph_node *node;
325 varpool_node *vnode;
326 auto_vec<tree> worklist;
/* Seed the worklist with: explicit declare-target functions, functions
   containing "omp target" regions (including nested functions), and
   declare-target static initializers.  */
328 FOR_EACH_DEFINED_FUNCTION (node)
329 if (DECL_SAVED_TREE (node->decl))
331 struct cgraph_node *cgn;
332 if (omp_declare_target_fn_p (node->decl))
333 worklist.safe_push (node->decl);
334 else if (DECL_STRUCT_FUNCTION (node->decl)
335 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
336 worklist.safe_push (node->decl);
337 for (cgn = node->nested; cgn; cgn = cgn->next_nested)
338 if (omp_declare_target_fn_p (cgn->decl))
339 worklist.safe_push (cgn->decl)
340 else if (DECL_STRUCT_FUNCTION (cgn->decl)
341 && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
342 worklist.safe_push (cgn->decl);
344 FOR_EACH_STATIC_INITIALIZER (vnode)
345 if (omp_declare_target_var_p (vnode->decl))
346 worklist.safe_push (vnode->decl);
/* Drain the worklist; the walkers push newly discovered decls back on.
   Declare-target functions are scanned in full; other functions (those
   that merely contain target regions) are scanned only inside their
   OMP_TARGET bodies via the _fn_r walker.  */
347 while (!worklist.is_empty ())
349 tree decl = worklist.pop ();
350 if (VAR_P (decl))
351 walk_tree_without_duplicates (&DECL_INITIAL (decl),
352 omp_discover_declare_target_var_r,
353 &worklist);
354 else if (omp_declare_target_fn_p (decl))
355 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
356 omp_discover_declare_target_tgt_fn_r,
357 &worklist);
358 else
359 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
360 omp_discover_declare_target_fn_r,
361 &worklist);
366 /* Create new symbols containing (address, size) pairs for global variables,
367 marked with "omp declare target" attribute, as well as addresses for the
368 functions, which are outlined offloading regions. */
369 void
370 omp_finish_file (void)
372 unsigned num_funcs = vec_safe_length (offload_funcs);
373 unsigned num_vars = vec_safe_length (offload_vars);
375 if (num_funcs == 0 && num_vars == 0)
376 return;
/* With named-section support, emit two constant arrays into dedicated
   sections: one address entry per function, and an (address, size)
   pair per variable.  */
378 if (targetm_common.have_named_sections)
380 vec<constructor_elt, va_gc> *v_f, *v_v;
381 vec_alloc (v_f, num_funcs);
382 vec_alloc (v_v, num_vars * 2);
384 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
385 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
387 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
388 vec_safe_length (v_v));
389 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
390 num_funcs);
391 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
392 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
393 tree ctor_v = build_constructor (vars_decl_type, v_v);
394 tree ctor_f = build_constructor (funcs_decl_type, v_f);
395 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
396 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
397 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
398 get_identifier (".offload_func_table"),
399 funcs_decl_type);
400 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
401 get_identifier (".offload_var_table"),
402 vars_decl_type);
403 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
404 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
405 otherwise a joint table in a binary will contain padding between
406 tables from multiple object files. */
407 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
408 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
409 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
410 DECL_INITIAL (funcs_decl) = ctor_f;
411 DECL_INITIAL (vars_decl) = ctor_v;
412 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
413 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
415 varpool_node::finalize_decl (vars_decl);
416 varpool_node::finalize_decl (funcs_decl);
/* Without named sections, hand each surviving symbol to the target
   hook individually.  */
418 else
420 for (unsigned i = 0; i < num_funcs; i++)
422 tree it = (*offload_funcs)[i];
423 /* See also add_decls_addresses_to_decl_constructor
424 and output_offload_tables in lto-cgraph.c. */
425 if (!in_lto_p && !symtab_node::get (it))
426 continue;
427 targetm.record_offload_symbol (it);
429 for (unsigned i = 0; i < num_vars; i++)
431 tree it = (*offload_vars)[i];
432 if (!in_lto_p && !symtab_node::get (it))
433 continue;
434 #ifdef ACCEL_COMPILER
/* "link" vars are recorded via their redirection pointer, mirroring
   add_decls_addresses_to_decl_constructor.  */
435 if (DECL_HAS_VALUE_EXPR_P (it)
436 && lookup_attribute ("omp declare target link",
437 DECL_ATTRIBUTES (it)))
439 tree value_expr = DECL_VALUE_EXPR (it);
440 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
441 targetm.record_offload_symbol (link_ptr_decl);
442 varpool_node::finalize_decl (link_ptr_decl);
444 else
445 #endif
446 targetm.record_offload_symbol (it);
451 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
452 axis DIM. Return a tmp var holding the result. */
454 static tree
455 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
457 tree arg = build_int_cst (unsigned_type_node, dim);
458 tree size = create_tmp_var (integer_type_node);
459 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
460 gimple *call = gimple_build_call_internal (fn, 1, arg);
462 gimple_call_set_lhs (call, size);
463 gimple_seq_add_stmt (seq, call);
465 return size;
468 /* Find the number of threads (POS = false), or thread number (POS =
469 true) for an OpenACC region partitioned as MASK. Setup code
470 required for the calculation is added to SEQ. */
472 static tree
473 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
475 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
476 unsigned ix;
478 /* Start at gang level, and examine relevant dimension indices. */
479 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
480 if (GOMP_DIM_MASK (ix) & mask)
482 if (res)
484 /* We had an outer index, so scale that by the size of
485 this dimension. */
486 tree n = oacc_dim_call (false, ix, seq);
487 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
489 if (pos)
491 /* Determine index in this dimension. */
492 tree id = oacc_dim_call (true, ix, seq);
493 if (res)
494 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
495 else
496 res = id;
500 if (res == NULL_TREE)
501 res = integer_zero_node;
503 return res;
506 /* Transform IFN_GOACC_LOOP calls to actual code. See
507 expand_oacc_for for where these are generated. At the vector
508 level, we stride loops, such that each member of a warp will
509 operate on adjacent iterations. At the worker and gang level,
510 each gang/warp executes a set of contiguous iterations. Chunking
511 can override this such that each iteration engine executes a
512 contiguous chunk, and then moves on to stride to the next chunk. */
514 static void
515 oacc_xform_loop (gcall *call)
517 gimple_stmt_iterator gsi = gsi_for_stmt (call);
/* Arg 0 selects which GOACC_LOOP computation to expand (CHUNKS, STEP,
   OFFSET or BOUND below).  */
518 enum ifn_goacc_loop_kind code
519 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
/* Args 1-3: loop direction, iteration range and step; arg 5 is the
   partitioning mask.  DIR appears to be +/-1 (see the MIN/MAX choice
   on integer_onep (dir) below) — TODO confirm against expand_oacc_for.  */
520 tree dir = gimple_call_arg (call, 1);
521 tree range = gimple_call_arg (call, 2);
522 tree step = gimple_call_arg (call, 3);
523 tree chunk_size = NULL_TREE;
524 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
525 tree lhs = gimple_call_lhs (call);
526 tree type = NULL_TREE;
527 tree diff_type = TREE_TYPE (range);
528 tree r = NULL_TREE;
529 gimple_seq seq = NULL;
530 bool chunking = false, striding = true;
531 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
532 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
534 /* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
535 if (!lhs)
537 gsi_replace_with_seq (&gsi, seq, true);
538 return;
541 type = TREE_TYPE (lhs);
/* Only the device (accelerator) compiler honours the chunk-size
   argument; the host copy keeps the defaults set above
   (striding, no chunking).  */
543 #ifdef ACCEL_COMPILER
544 chunk_size = gimple_call_arg (call, 4);
545 if (integer_minus_onep (chunk_size) /* Force static allocation. */
546 || integer_zerop (chunk_size)) /* Default (also static). */
548 /* If we're at the gang level, we want each to execute a
549 contiguous run of iterations. Otherwise we want each element
550 to stride. */
551 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
552 chunking = false;
554 else
556 /* Chunk of size 1 is striding. */
557 striding = integer_onep (chunk_size);
558 chunking = !striding;
560 #endif
562 /* striding=true, chunking=true
563 -> invalid.
564 striding=true, chunking=false
565 -> chunks=1
566 striding=false,chunking=true
567 -> chunks=ceil (range/(chunksize*threads*step))
568 striding=false,chunking=false
569 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
570 push_gimplify_context (true);
572 switch (code)
574 default: gcc_unreachable ();
576 case IFN_GOACC_LOOP_CHUNKS:
577 if (!chunking)
578 r = build_int_cst (type, 1);
579 else
581 /* chunk_max
582 = (range - dir) / (chunks * step * num_threads) + dir */
583 tree per = oacc_thread_numbers (false, mask, &seq);
584 per = fold_convert (type, per);
585 chunk_size = fold_convert (type, chunk_size);
586 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
587 per = fold_build2 (MULT_EXPR, type, per, step);
588 r = build2 (MINUS_EXPR, type, range, dir);
589 r = build2 (PLUS_EXPR, type, r, per);
590 r = build2 (TRUNC_DIV_EXPR, type, r, per);
592 break;
594 case IFN_GOACC_LOOP_STEP:
596 /* If striding, step by the entire compute volume, otherwise
597 step by the inner volume. */
598 unsigned volume = striding ? mask : inner_mask;
600 r = oacc_thread_numbers (false, volume, &seq);
601 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
603 break;
605 case IFN_GOACC_LOOP_OFFSET:
606 /* Enable vectorization on non-SIMT targets. */
607 if (!targetm.simt.vf
608 && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
609 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
610 the loop. */
611 && (flag_tree_loop_vectorize
612 || !global_options_set.x_flag_tree_loop_vectorize))
614 basic_block bb = gsi_bb (gsi);
615 class loop *parent = bb->loop_father;
616 class loop *body = parent->inner;
618 parent->force_vectorize = true;
619 parent->safelen = INT_MAX;
621 /* "Chunking loops" may have inner loops. */
622 if (parent->inner)
624 body->force_vectorize = true;
625 body->safelen = INT_MAX;
628 cfun->has_force_vectorize_loops = true;
630 if (striding)
632 r = oacc_thread_numbers (true, mask, &seq);
633 r = fold_convert (diff_type, r);
635 else
/* Non-striding: offset = (outer_pos * span + inner_pos
   [+ chunk_no * volume * chunk_size]) * step, where span is the
   per-outer-thread extent in inner-size units.  */
637 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
638 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
639 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
640 inner_size, outer_size);
642 volume = fold_convert (diff_type, volume);
643 if (chunking)
644 chunk_size = fold_convert (diff_type, chunk_size);
645 else
647 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
649 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
650 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
651 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
654 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
655 fold_convert (diff_type, inner_size));
656 r = oacc_thread_numbers (true, outer_mask, &seq);
657 r = fold_convert (diff_type, r);
658 r = build2 (MULT_EXPR, diff_type, r, span);
660 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
661 inner = fold_convert (diff_type, inner);
662 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
664 if (chunking)
666 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
667 tree per
668 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
669 per = build2 (MULT_EXPR, diff_type, per, chunk);
671 r = build2 (PLUS_EXPR, diff_type, r, per);
674 r = fold_build2 (MULT_EXPR, diff_type, r, step);
675 if (type != diff_type)
676 r = fold_convert (type, r);
677 break;
679 case IFN_GOACC_LOOP_BOUND:
680 if (striding)
681 r = range;
682 else
684 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
685 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
686 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
687 inner_size, outer_size);
689 volume = fold_convert (diff_type, volume);
690 if (chunking)
691 chunk_size = fold_convert (diff_type, chunk_size);
692 else
694 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
696 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
697 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
698 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
701 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
702 fold_convert (diff_type, inner_size));
704 r = fold_build2 (MULT_EXPR, diff_type, span, step);
706 tree offset = gimple_call_arg (call, 6);
707 r = build2 (PLUS_EXPR, diff_type, r,
708 fold_convert (diff_type, offset));
/* Clamp to the loop end; direction +1 clamps with MIN, otherwise
   MAX.  */
709 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
710 diff_type, r, range);
712 if (diff_type != type)
713 r = fold_convert (type, r);
714 break;
/* Assign the computed value to the call's LHS and replace the
   internal-fn call with the gimplified sequence.  */
717 gimplify_assign (lhs, r, &seq);
719 pop_gimplify_context (NULL);
721 gsi_replace_with_seq (&gsi, seq, true);
724 /* Transform a GOACC_TILE call. Determines the element loop span for
725 the specified loop of the nest. This is 1 if we're not tiling.
727 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
729 static void
730 oacc_xform_tile (gcall *call)
732 gimple_stmt_iterator gsi = gsi_for_stmt (call);
733 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
734 /* Inner loops have higher loop_nos. */
735 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
736 tree tile_size = gimple_call_arg (call, 2);
737 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
738 tree lhs = gimple_call_lhs (call);
739 tree type = TREE_TYPE (lhs);
740 gimple_seq seq = NULL;
741 tree span = build_int_cst (type, 1);
/* Element loops may only be worker and/or vector partitioned.  */
743 gcc_assert (!(e_mask
744 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
745 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
746 push_gimplify_context (!seen_error ());
748 #ifndef ACCEL_COMPILER
749 /* Partitioning disabled on host compilers. */
750 e_mask = 0;
751 #endif
752 if (!e_mask)
753 /* Not partitioning. */
754 span = integer_one_node;
755 else if (!integer_zerop (tile_size))
756 /* User explicitly specified size. */
757 span = tile_size;
758 else
760 /* Pick a size based on the partitioning of the element loop and
761 the number of loop nests. */
762 tree first_size = NULL_TREE;
763 tree second_size = NULL_TREE;
765 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
766 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
767 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
768 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
/* If only worker-partitioned, promote it to the first slot.  */
770 if (!first_size)
772 first_size = second_size;
773 second_size = NULL_TREE;
/* Innermost loop gets the first (vector) size — and, for a
   two-deep nest, the outermost also absorbs the second size.  */
776 if (loop_no + 1 == collapse)
778 span = first_size;
779 if (!loop_no && second_size)
780 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
781 span, second_size);
783 else if (loop_no + 2 == collapse)
784 span = second_size;
785 else
786 span = NULL_TREE;
788 if (!span)
789 /* There's no obvious element size for this loop. Options
790 are 1, first_size or some non-unity constant (32 is my
791 favourite). We should gather some statistics. */
792 span = first_size;
795 span = fold_convert (type, span);
796 gimplify_assign (lhs, span, &seq);
798 pop_gimplify_context (NULL);
800 gsi_replace_with_seq (&gsi, seq, true);
803 /* Default partitioned and minimum partitioned dimensions. */
/* -1 in oacc_default_dims means "unset"; both arrays are filled in by
   oacc_parse_default_dims and read by oacc_validate_dims and the
   oacc_get_*_dim accessors.  */
805 static int oacc_default_dims[GOMP_DIM_MAX];
806 static int oacc_min_dims[GOMP_DIM_MAX];
/* Return the default partitioned size for axis DIM (asserts DIM is a
   valid GOMP_DIM_* index).  */
809 oacc_get_default_dim (int dim)
811 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
812 return oacc_default_dims[dim];
/* Return the minimum partitioned size for axis DIM (asserts DIM is a
   valid GOMP_DIM_* index).  */
816 oacc_get_min_dim (int dim)
818 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
819 return oacc_min_dims[dim];
822 /* Parse the default dimension parameter. This is a set of
823 :-separated optional compute dimensions. Each specified dimension
824 is a positive integer. When device type support is added, it is
825 planned to be a comma separated list of such compute dimensions,
826 with all but the first prefixed by the colon-terminated device
827 type. */
829 static void
830 oacc_parse_default_dims (const char *dims)
832 int ix;
/* Reset: defaults unset (-1), minimums 1.  */
834 for (ix = GOMP_DIM_MAX; ix--;)
836 oacc_default_dims[ix] = -1;
837 oacc_min_dims[ix] = 1;
840 #ifndef ACCEL_COMPILER
841 /* Cannot be overridden on the host. */
842 dims = NULL;
843 #endif
844 if (dims)
846 const char *pos = dims;
848 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
850 if (ix)
852 if (*pos != ':')
853 goto malformed;
854 pos++;
/* An immediately following ':' leaves this dimension unset (-1).  */
857 if (*pos != ':')
859 long val;
860 const char *eptr;
862 errno = 0;
863 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
/* Reject parse errors, non-positive values, and values that do not
   fit in an int.  */
864 if (errno || val <= 0 || (int) val != val)
865 goto malformed;
866 pos = eptr;
867 oacc_default_dims[ix] = (int) val;
870 if (*pos)
872 malformed:
873 error_at (UNKNOWN_LOCATION,
874 "%<-fopenacc-dim%> operand is malformed at %qs", pos);
878 /* Allow the backend to validate the dimensions. */
/* Level arguments -1 and -2 distinguish the default and minimum
   arrays for the backend hook — TODO confirm the hook's contract.  */
879 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
880 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
883 /* Validate and update the dimensions for offloaded FN. ATTRS is the
884 raw attribute. DIMS is an array of dimensions, which is filled in.
885 LEVEL is the partitioning level of a routine, or -1 for an offload
886 region itself. USED is the mask of partitioned execution in the
887 function. */
889 static void
890 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
892 tree purpose[GOMP_DIM_MAX];
893 unsigned ix;
894 tree pos = TREE_VALUE (attrs);
896 /* Make sure the attribute creator attached the dimension
897 information. */
898 gcc_assert (pos);
/* Extract the attribute's per-axis values into DIMS (-1 = unset),
   remembering the purposes so the attribute can be rebuilt below.  */
900 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
902 purpose[ix] = TREE_PURPOSE (pos);
903 tree val = TREE_VALUE (pos);
904 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
905 pos = TREE_CHAIN (pos);
908 bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
910 /* Default anything left to 1 or a partitioned default. */
911 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
912 if (dims[ix] < 0)
914 /* The OpenACC spec says 'If the [num_gangs] clause is not
915 specified, an implementation-defined default will be used;
916 the default may depend on the code within the construct.'
917 (2.5.6). Thus an implementation is free to choose
918 non-unity default for a parallel region that doesn't have
919 any gang-partitioned loops. However, it appears that there
920 is a sufficient body of user code that expects non-gang
921 partitioned regions to not execute in gang-redundant mode.
922 So we (a) don't warn about the non-portability and (b) pick
923 the minimum permissible dimension size when there is no
924 partitioned execution. Otherwise we pick the global
925 default for the dimension, which the user can control. The
926 same wording and logic applies to num_workers and
927 vector_length, however the worker- or vector- single
928 execution doesn't have the same impact as gang-redundant
929 execution. (If the minimum gang-level partioning is not 1,
930 the target is probably too confusing.) */
931 dims[ix] = (used & GOMP_DIM_MASK (ix)
932 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
933 changed = true;
936 if (changed)
938 /* Replace the attribute with new values. */
939 pos = NULL_TREE;
/* Rebuild the list back-to-front so it ends up in axis order.  */
940 for (ix = GOMP_DIM_MAX; ix--;)
941 pos = tree_cons (purpose[ix],
942 build_int_cst (integer_type_node, dims[ix]), pos);
943 oacc_replace_fn_attrib (fn, pos);
947 /* Create an empty OpenACC loop structure at LOC. */
949 static oacc_loop *
950 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
952 oacc_loop *loop = XCNEW (oacc_loop);
954 loop->parent = parent;
956 if (parent)
958 loop->sibling = parent->child;
959 parent->child = loop;
962 loop->loc = loc;
963 return loop;
966 /* Create an outermost, dummy OpenACC loop for offloaded function
967 DECL. */
969 static oacc_loop *
970 new_oacc_loop_outer (tree decl)
972 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
975 /* Start a new OpenACC loop structure beginning at head marker HEAD.
976 Link into PARENT loop. Return the new loop. */
978 static oacc_loop *
979 new_oacc_loop (oacc_loop *parent, gcall *marker)
981 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
983 loop->marker = marker;
985 /* TODO: This is where device_type flattening would occur for the loop
986 flags. */
988 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
990 tree chunk_size = integer_zero_node;
991 if (loop->flags & OLF_GANG_STATIC)
992 chunk_size = gimple_call_arg (marker, 4);
993 loop->chunk_size = chunk_size;
995 return loop;
998 /* Create a dummy loop encompassing a call to a openACC routine.
999 Extract the routine's partitioning requirements. */
1001 static void
1002 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
1004 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
1005 int level = oacc_fn_attrib_level (attrs);
1007 gcc_assert (level >= 0);
1009 loop->marker = call;
1010 loop->routine = decl;
1011 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
1012 ^ (GOMP_DIM_MASK (level) - 1));
1015 /* Finish off the current OpenACC loop ending at tail marker TAIL.
1016 Return the parent loop. */
1018 static oacc_loop *
1019 finish_oacc_loop (oacc_loop *loop)
1021 /* If the loop has been collapsed, don't partition it. */
1022 if (loop->ifns.is_empty ())
1023 loop->mask = loop->flags = 0;
1024 return loop->parent;
1027 /* Free all OpenACC loop structures within LOOP (inclusive). */
1029 static void
1030 free_oacc_loop (oacc_loop *loop)
1032 if (loop->sibling)
1033 free_oacc_loop (loop->sibling);
1034 if (loop->child)
1035 free_oacc_loop (loop->child);
1037 loop->ifns.release ();
1038 free (loop);
1041 /* Dump out the OpenACC loop head or tail beginning at FROM. */
/* TITLE/LEVEL label the output; the sequence ends at the next
   IFN_UNIQUE marker of the same kind as FROM.  */
1043 static void
1044 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
1045 const char *title, int level)
1047 enum ifn_unique_kind kind
1048 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1050 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
1051 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1053 gimple *stmt = gsi_stmt (gsi);
1055 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1057 enum ifn_unique_kind k
1058 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
1059 (gimple_call_arg (stmt, 0)));
/* A second marker of the same kind terminates the sequence.  */
1061 if (k == kind && stmt != from)
1062 break;
1064 print_gimple_stmt (file, stmt, depth * 2 + 2);
1066 gsi_next (&gsi);
/* Fall through into the single successor block when the current
   block runs out of statements.  */
1067 while (gsi_end_p (gsi))
1068 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1072 /* Dump OpenACC loop LOOP, its children, and its siblings. */
1074 static void
1075 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
1077 int ix;
1079 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
1080 loop->flags, loop->mask,
1081 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
1083 if (loop->marker)
1084 print_gimple_stmt (file, loop->marker, depth * 2);
1086 if (loop->routine)
1087 fprintf (file, "%*sRoutine %s:%u:%s\n",
1088 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
1089 DECL_SOURCE_LINE (loop->routine),
1090 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
/* Heads are printed outermost-first, tails in the reverse order
   (innermost-first).  */
1092 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
1093 if (loop->heads[ix])
1094 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
1095 for (ix = GOMP_DIM_MAX; ix--;)
1096 if (loop->tails[ix])
1097 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
1099 if (loop->child)
1100 dump_oacc_loop (file, loop->child, depth + 1);
1101 if (loop->sibling)
1102 dump_oacc_loop (file, loop->sibling, depth);
1105 void debug_oacc_loop (oacc_loop *);
1107 /* Dump loops to stderr. */
1109 DEBUG_FUNCTION void
1110 debug_oacc_loop (oacc_loop *loop)
1112 dump_oacc_loop (stderr, loop, 0);
1115 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1116 siblings. */
1118 static void
1119 inform_oacc_loop (const oacc_loop *loop)
1121 const char *gang
1122 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1123 const char *worker
1124 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1125 const char *vector
1126 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1127 const char *seq = loop->mask == 0 ? " seq" : "";
1128 const dump_user_location_t loc
1129 = dump_user_location_t::from_location_t (loop->loc);
1130 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1131 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1132 vector, seq);
1134 if (loop->child)
1135 inform_oacc_loop (loop->child);
1136 if (loop->sibling)
1137 inform_oacc_loop (loop->sibling);
1140 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
1141 structures as we go. By construction these loops are properly
1142 nested. */
1144 static void
1145 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
1147 int marker = 0;
1148 int remaining = 0;
1150 if (bb->flags & BB_VISITED)
1151 return;
1153 follow:
1154 bb->flags |= BB_VISITED;
1156 /* Scan for loop markers. */
1157 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1158 gsi_next (&gsi))
1160 gimple *stmt = gsi_stmt (gsi);
1162 if (!is_gimple_call (stmt))
1163 continue;
1165 gcall *call = as_a <gcall *> (stmt);
1167 /* If this is a routine, make a dummy loop for it. */
1168 if (tree decl = gimple_call_fndecl (call))
1169 if (tree attrs = oacc_get_fn_attrib (decl))
1171 gcc_assert (!marker);
1172 new_oacc_loop_routine (loop, call, decl, attrs);
1175 if (!gimple_call_internal_p (call))
1176 continue;
1178 switch (gimple_call_internal_fn (call))
1180 default:
1181 break;
1183 case IFN_GOACC_LOOP:
1184 case IFN_GOACC_TILE:
1185 /* Record the abstraction function, so we can manipulate it
1186 later. */
1187 loop->ifns.safe_push (call);
1188 break;
1190 case IFN_UNIQUE:
1191 enum ifn_unique_kind kind
1192 = (enum ifn_unique_kind) (TREE_INT_CST_LOW
1193 (gimple_call_arg (call, 0)));
1194 if (kind == IFN_UNIQUE_OACC_HEAD_MARK
1195 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
1197 if (gimple_call_num_args (call) == 2)
1199 gcc_assert (marker && !remaining);
1200 marker = 0;
1201 if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
1202 loop = finish_oacc_loop (loop);
1203 else
1204 loop->head_end = call;
1206 else
1208 int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
1210 if (!marker)
1212 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1213 loop = new_oacc_loop (loop, call);
1214 remaining = count;
1216 gcc_assert (count == remaining);
1217 if (remaining)
1219 remaining--;
1220 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1221 loop->heads[marker] = call;
1222 else
1223 loop->tails[remaining] = call;
1225 marker++;
1230 if (remaining || marker)
1232 bb = single_succ (bb);
1233 gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
1234 goto follow;
1237 /* Walk successor blocks. */
1238 edge e;
1239 edge_iterator ei;
1241 FOR_EACH_EDGE (e, ei, bb->succs)
1242 oacc_loop_discover_walk (loop, e->dest);
1245 /* LOOP is the first sibling. Reverse the order in place and return
1246 the new first sibling. Recurse to child loops. */
1248 static oacc_loop *
1249 oacc_loop_sibling_nreverse (oacc_loop *loop)
1251 oacc_loop *last = NULL;
1254 if (loop->child)
1255 loop->child = oacc_loop_sibling_nreverse (loop->child);
1257 oacc_loop *next = loop->sibling;
1258 loop->sibling = last;
1259 last = loop;
1260 loop = next;
1262 while (loop);
1264 return last;
1267 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1268 the current function. */
1270 static oacc_loop *
1271 oacc_loop_discovery ()
1273 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1274 in the following. */
1275 clear_bb_flags ();
1277 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1278 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1280 /* The siblings were constructed in reverse order, reverse them so
1281 that diagnostics come out in an unsurprising order. */
1282 top = oacc_loop_sibling_nreverse (top);
1284 return top;
1287 /* Transform the abstract internal function markers starting at FROM
1288 to be for partitioning level LEVEL. Stop when we meet another HEAD
1289 or TAIL marker. */
1291 static void
1292 oacc_loop_xform_head_tail (gcall *from, int level)
1294 enum ifn_unique_kind kind
1295 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1296 tree replacement = build_int_cst (unsigned_type_node, level);
1298 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1300 gimple *stmt = gsi_stmt (gsi);
1302 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1304 enum ifn_unique_kind k
1305 = ((enum ifn_unique_kind)
1306 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1308 if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
1309 *gimple_call_arg_ptr (stmt, 2) = replacement;
1310 else if (k == kind && stmt != from)
1311 break;
1313 else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1314 *gimple_call_arg_ptr (stmt, 3) = replacement;
1316 gsi_next (&gsi);
1317 while (gsi_end_p (gsi))
1318 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1322 /* Process the discovered OpenACC loops, setting the correct
1323 partitioning level etc. */
1325 static void
1326 oacc_loop_process (oacc_loop *loop)
1328 if (loop->child)
1329 oacc_loop_process (loop->child);
1331 if (loop->mask && !loop->routine)
1333 int ix;
1334 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1335 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1336 tree chunk_arg = loop->chunk_size;
1337 gcall *call;
1339 for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1340 switch (gimple_call_internal_fn (call))
1342 case IFN_GOACC_LOOP:
1344 bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1345 gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1346 if (!is_e)
1347 gimple_call_set_arg (call, 4, chunk_arg);
1349 break;
1351 case IFN_GOACC_TILE:
1352 gimple_call_set_arg (call, 3, mask_arg);
1353 gimple_call_set_arg (call, 4, e_mask_arg);
1354 break;
1356 default:
1357 gcc_unreachable ();
1360 unsigned dim = GOMP_DIM_GANG;
1361 unsigned mask = loop->mask | loop->e_mask;
1362 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1364 while (!(GOMP_DIM_MASK (dim) & mask))
1365 dim++;
1367 oacc_loop_xform_head_tail (loop->heads[ix], dim);
1368 oacc_loop_xform_head_tail (loop->tails[ix], dim);
1370 mask ^= GOMP_DIM_MASK (dim);
1374 if (loop->sibling)
1375 oacc_loop_process (loop->sibling);
1378 /* Walk the OpenACC loop heirarchy checking and assigning the
1379 programmer-specified partitionings. OUTER_MASK is the partitioning
1380 this loop is contained within. Return mask of partitioning
1381 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1382 bit. */
1384 static unsigned
1385 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1387 unsigned this_mask = loop->mask;
1388 unsigned mask_all = 0;
1389 bool noisy = true;
1391 #ifdef ACCEL_COMPILER
1392 /* When device_type is supported, we want the device compiler to be
1393 noisy, if the loop parameters are device_type-specific. */
1394 noisy = false;
1395 #endif
1397 if (!loop->routine)
1399 bool auto_par = (loop->flags & OLF_AUTO) != 0;
1400 bool seq_par = (loop->flags & OLF_SEQ) != 0;
1401 bool tiling = (loop->flags & OLF_TILE) != 0;
1403 this_mask = ((loop->flags >> OLF_DIM_BASE)
1404 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1406 /* Apply auto partitioning if this is a non-partitioned regular
1407 loop, or (no more than) single axis tiled loop. */
1408 bool maybe_auto
1409 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
1411 if ((this_mask != 0) + auto_par + seq_par > 1)
1413 if (noisy)
1414 error_at (loop->loc,
1415 seq_par
1416 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1417 : G_("%<auto%> conflicts with other OpenACC loop "
1418 "specifiers"));
1419 maybe_auto = false;
1420 loop->flags &= ~OLF_AUTO;
1421 if (seq_par)
1423 loop->flags
1424 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1425 this_mask = 0;
1429 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1431 loop->flags |= OLF_AUTO;
1432 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
1436 if (this_mask & outer_mask)
1438 const oacc_loop *outer;
1439 for (outer = loop->parent; outer; outer = outer->parent)
1440 if ((outer->mask | outer->e_mask) & this_mask)
1441 break;
1443 if (noisy)
1445 if (outer)
1447 error_at (loop->loc,
1448 loop->routine
1449 ? G_("routine call uses same OpenACC parallelism"
1450 " as containing loop")
1451 : G_("inner loop uses same OpenACC parallelism"
1452 " as containing loop"));
1453 inform (outer->loc, "containing loop here");
1455 else
1456 error_at (loop->loc,
1457 loop->routine
1458 ? G_("routine call uses OpenACC parallelism disallowed"
1459 " by containing routine")
1460 : G_("loop uses OpenACC parallelism disallowed"
1461 " by containing routine"));
1463 if (loop->routine)
1464 inform (DECL_SOURCE_LOCATION (loop->routine),
1465 "routine %qD declared here", loop->routine);
1467 this_mask &= ~outer_mask;
1469 else
1471 unsigned outermost = least_bit_hwi (this_mask);
1473 if (outermost && outermost <= outer_mask)
1475 if (noisy)
1477 error_at (loop->loc,
1478 "incorrectly nested OpenACC loop parallelism");
1480 const oacc_loop *outer;
1481 for (outer = loop->parent;
1482 outer->flags && outer->flags < outermost;
1483 outer = outer->parent)
1484 continue;
1485 inform (outer->loc, "containing loop here");
1488 this_mask &= ~outermost;
1492 mask_all |= this_mask;
1494 if (loop->flags & OLF_TILE)
1496 /* When tiling, vector goes to the element loop, and failing
1497 that we put worker there. The std doesn't contemplate
1498 specifying all three. We choose to put worker and vector on
1499 the element loops in that case. */
1500 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1501 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1502 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1504 loop->e_mask = this_e_mask;
1505 this_mask ^= this_e_mask;
1508 loop->mask = this_mask;
1510 if (dump_file)
1511 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1512 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1513 loop->mask, loop->e_mask);
1515 if (loop->child)
1517 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1518 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1519 mask_all |= loop->inner;
1522 if (loop->sibling)
1523 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1525 return mask_all;
1528 /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
1529 OUTER_MASK is the partitioning this loop is contained within.
1530 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1531 Return the cumulative partitioning used by this loop, siblings and
1532 children. */
1534 static unsigned
1535 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1536 bool outer_assign)
1538 bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1539 bool noisy = true;
1540 bool tiling = loop->flags & OLF_TILE;
1542 #ifdef ACCEL_COMPILER
1543 /* When device_type is supported, we want the device compiler to be
1544 noisy, if the loop parameters are device_type-specific. */
1545 noisy = false;
1546 #endif
1548 if (assign && (!outer_assign || loop->inner))
1550 /* Allocate outermost and non-innermost loops at the outermost
1551 non-innermost available level. */
1552 unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1554 /* Find the first outermost available partition. */
1555 while (this_mask <= outer_mask)
1556 this_mask <<= 1;
1558 /* Grab two axes if tiling, and we've not assigned anything */
1559 if (tiling && !(loop->mask | loop->e_mask))
1560 this_mask |= this_mask << 1;
1562 /* Prohibit the innermost partitioning at the moment. */
1563 this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1565 /* Don't use any dimension explicitly claimed by an inner loop. */
1566 this_mask &= ~loop->inner;
1568 if (tiling && !loop->e_mask)
1570 /* If we got two axes, allocate the inner one to the element
1571 loop. */
1572 loop->e_mask = this_mask & (this_mask << 1);
1573 this_mask ^= loop->e_mask;
1576 loop->mask |= this_mask;
1579 if (loop->child)
1581 unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1582 loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1583 outer_assign | assign);
1586 if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1588 /* Allocate the loop at the innermost available level. Note
1589 that we do this even if we already assigned this loop the
1590 outermost available level above. That way we'll partition
1591 this along 2 axes, if they are available. */
1592 unsigned this_mask = 0;
1594 /* Determine the outermost partitioning used within this loop. */
1595 this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1596 this_mask = least_bit_hwi (this_mask);
1598 /* Pick the partitioning just inside that one. */
1599 this_mask >>= 1;
1601 /* And avoid picking one use by an outer loop. */
1602 this_mask &= ~outer_mask;
1604 /* If tiling and we failed completely above, grab the next one
1605 too. Making sure it doesn't hit an outer loop. */
1606 if (tiling)
1608 this_mask &= ~(loop->e_mask | loop->mask);
1609 unsigned tile_mask = ((this_mask >> 1)
1610 & ~(outer_mask | loop->e_mask | loop->mask));
1612 if (tile_mask || loop->mask)
1614 loop->e_mask |= this_mask;
1615 this_mask = tile_mask;
1617 if (!loop->e_mask && noisy)
1618 warning_at (loop->loc, 0,
1619 "insufficient partitioning available"
1620 " to parallelize element loop");
1623 loop->mask |= this_mask;
1624 if (!loop->mask && noisy)
1625 warning_at (loop->loc, 0,
1626 tiling
1627 ? G_("insufficient partitioning available"
1628 " to parallelize tile loop")
1629 : G_("insufficient partitioning available"
1630 " to parallelize loop"));
1633 if (assign && dump_file)
1634 fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1635 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1636 loop->mask, loop->e_mask);
1638 unsigned inner_mask = 0;
1640 if (loop->sibling)
1641 inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1642 outer_mask, outer_assign);
1644 inner_mask |= loop->inner | loop->mask | loop->e_mask;
1646 return inner_mask;
1649 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1650 axes. Return mask of partitioning. */
1652 static unsigned
1653 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1655 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1657 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1659 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1660 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1662 return mask_all;
1665 /* Default fork/join early expander. Delete the function calls if
1666 there is no RTL expander. */
1668 bool
1669 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1670 const int *ARG_UNUSED (dims), bool is_fork)
1672 if (is_fork)
1673 return targetm.have_oacc_fork ();
1674 else
1675 return targetm.have_oacc_join ();
1678 /* Default goacc.reduction early expander.
1680 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1681 If RES_PTR is not integer-zerop:
1682 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1683 TEARDOWN - emit '*RES_PTR = VAR'
1684 If LHS is not NULL
1685 emit 'LHS = VAR' */
1687 void
1688 default_goacc_reduction (gcall *call)
1690 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1691 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1692 tree lhs = gimple_call_lhs (call);
1693 tree var = gimple_call_arg (call, 2);
1694 gimple_seq seq = NULL;
1696 if (code == IFN_GOACC_REDUCTION_SETUP
1697 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1699 /* Setup and Teardown need to copy from/to the receiver object,
1700 if there is one. */
1701 tree ref_to_res = gimple_call_arg (call, 1);
1703 if (!integer_zerop (ref_to_res))
1705 tree dst = build_simple_mem_ref (ref_to_res);
1706 tree src = var;
1708 if (code == IFN_GOACC_REDUCTION_SETUP)
1710 src = dst;
1711 dst = lhs;
1712 lhs = NULL;
1714 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1718 /* Copy VAR to LHS, if there is an LHS. */
1719 if (lhs)
1720 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1722 gsi_replace_with_seq (&gsi, seq, true);
1725 /* Main entry point for oacc transformations which run on the device
1726 compiler after LTO, so we know what the target device is at this
1727 point (including the host fallback). */
1729 static unsigned int
1730 execute_oacc_device_lower ()
1732 tree attrs = oacc_get_fn_attrib (current_function_decl);
1734 if (!attrs)
1735 /* Not an offloaded function. */
1736 return 0;
1738 /* Parse the default dim argument exactly once. */
1739 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1741 oacc_parse_default_dims (flag_openacc_dims);
1742 flag_openacc_dims = (char *)&flag_openacc_dims;
1745 bool is_oacc_kernels
1746 = (lookup_attribute ("oacc kernels",
1747 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1748 bool is_oacc_kernels_parallelized
1749 = (lookup_attribute ("oacc kernels parallelized",
1750 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1752 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
1753 kernels, so remove the parallelism dimensions function attributes
1754 potentially set earlier on. */
1755 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
1757 oacc_set_fn_attrib (current_function_decl, NULL, NULL);
1758 attrs = oacc_get_fn_attrib (current_function_decl);
1761 /* Discover, partition and process the loops. */
1762 oacc_loop *loops = oacc_loop_discovery ();
1763 int fn_level = oacc_fn_attrib_level (attrs);
1765 if (dump_file)
1767 if (fn_level >= 0)
1768 fprintf (dump_file, "Function is OpenACC routine level %d\n",
1769 fn_level);
1770 else if (is_oacc_kernels)
1771 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1772 (is_oacc_kernels_parallelized
1773 ? "parallelized" : "unparallelized"));
1774 else
1775 fprintf (dump_file, "Function is OpenACC parallel offload\n");
1778 unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
1779 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
1780 /* OpenACC kernels constructs are special: they currently don't use the
1781 generic oacc_loop infrastructure and attribute/dimension processing. */
1782 if (is_oacc_kernels && is_oacc_kernels_parallelized)
1784 /* Parallelized OpenACC kernels constructs use gang parallelism. See
1785 also tree-parloops.c:create_parallel_loop. */
1786 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
1789 int dims[GOMP_DIM_MAX];
1790 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
1792 if (dump_file)
1794 const char *comma = "Compute dimensions [";
1795 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
1796 fprintf (dump_file, "%s%d", comma, dims[ix]);
1797 fprintf (dump_file, "]\n");
1800 oacc_loop_process (loops);
1801 if (dump_file)
1803 fprintf (dump_file, "OpenACC loops\n");
1804 dump_oacc_loop (dump_file, loops, 0);
1805 fprintf (dump_file, "\n");
1807 if (dump_enabled_p ())
1809 oacc_loop *l = loops;
1810 /* OpenACC kernels constructs are special: they currently don't use the
1811 generic oacc_loop infrastructure. */
1812 if (is_oacc_kernels)
1814 /* Create a fake oacc_loop for diagnostic purposes. */
1815 l = new_oacc_loop_raw (NULL,
1816 DECL_SOURCE_LOCATION (current_function_decl));
1817 l->mask = used_mask;
1819 else
1821 /* Skip the outermost, dummy OpenACC loop */
1822 l = l->child;
1824 if (l)
1825 inform_oacc_loop (l);
1826 if (is_oacc_kernels)
1827 free_oacc_loop (l);
1830 /* Offloaded targets may introduce new basic blocks, which require
1831 dominance information to update SSA. */
1832 calculate_dominance_info (CDI_DOMINATORS);
1834 /* Now lower internal loop functions to target-specific code
1835 sequences. */
1836 basic_block bb;
1837 FOR_ALL_BB_FN (bb, cfun)
1838 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1840 gimple *stmt = gsi_stmt (gsi);
1841 if (!is_gimple_call (stmt))
1843 gsi_next (&gsi);
1844 continue;
1847 gcall *call = as_a <gcall *> (stmt);
1848 if (!gimple_call_internal_p (call))
1850 gsi_next (&gsi);
1851 continue;
1854 /* Rewind to allow rescan. */
1855 gsi_prev (&gsi);
1856 bool rescan = false, remove = false;
1857 enum internal_fn ifn_code = gimple_call_internal_fn (call);
1859 switch (ifn_code)
1861 default: break;
1863 case IFN_GOACC_TILE:
1864 oacc_xform_tile (call);
1865 rescan = true;
1866 break;
1868 case IFN_GOACC_LOOP:
1869 oacc_xform_loop (call);
1870 rescan = true;
1871 break;
1873 case IFN_GOACC_REDUCTION:
1874 /* Mark the function for SSA renaming. */
1875 mark_virtual_operands_for_renaming (cfun);
1877 /* If the level is -1, this ended up being an unused
1878 axis. Handle as a default. */
1879 if (integer_minus_onep (gimple_call_arg (call, 3)))
1880 default_goacc_reduction (call);
1881 else
1882 targetm.goacc.reduction (call);
1883 rescan = true;
1884 break;
1886 case IFN_UNIQUE:
1888 enum ifn_unique_kind kind
1889 = ((enum ifn_unique_kind)
1890 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1892 switch (kind)
1894 default:
1895 break;
1897 case IFN_UNIQUE_OACC_FORK:
1898 case IFN_UNIQUE_OACC_JOIN:
1899 if (integer_minus_onep (gimple_call_arg (call, 2)))
1900 remove = true;
1901 else if (!targetm.goacc.fork_join
1902 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
1903 remove = true;
1904 break;
1906 case IFN_UNIQUE_OACC_HEAD_MARK:
1907 case IFN_UNIQUE_OACC_TAIL_MARK:
1908 remove = true;
1909 break;
1911 break;
1915 if (gsi_end_p (gsi))
1916 /* We rewound past the beginning of the BB. */
1917 gsi = gsi_start_bb (bb);
1918 else
1919 /* Undo the rewind. */
1920 gsi_next (&gsi);
1922 if (remove)
1924 if (gimple_vdef (call))
1925 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
1926 if (gimple_call_lhs (call))
1928 /* Propagate the data dependency var. */
1929 gimple *ass = gimple_build_assign (gimple_call_lhs (call),
1930 gimple_call_arg (call, 1));
1931 gsi_replace (&gsi, ass, false);
1933 else
1934 gsi_remove (&gsi, true);
1936 else if (!rescan)
1937 /* If not rescanning, advance over the call. */
1938 gsi_next (&gsi);
1941 free_oacc_loop (loops);
1943 return 0;
1946 /* Default launch dimension validator. Force everything to 1. A
1947 backend that wants to provide larger dimensions must override this
1948 hook. */
1950 bool
1951 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1952 int ARG_UNUSED (fn_level),
1953 unsigned ARG_UNUSED (used))
1955 bool changed = false;
1957 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1959 if (dims[ix] != 1)
1961 dims[ix] = 1;
1962 changed = true;
1966 return changed;
1969 /* Default dimension bound is unknown on accelerator and 1 on host. */
1972 default_goacc_dim_limit (int ARG_UNUSED (axis))
1974 #ifdef ACCEL_COMPILER
1975 return 0;
1976 #else
1977 return 1;
1978 #endif
1981 namespace {
1983 const pass_data pass_data_oacc_device_lower =
1985 GIMPLE_PASS, /* type */
1986 "oaccdevlow", /* name */
1987 OPTGROUP_OMP, /* optinfo_flags */
1988 TV_NONE, /* tv_id */
1989 PROP_cfg, /* properties_required */
1990 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
1991 0, /* properties_destroyed */
1992 0, /* todo_flags_start */
1993 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
1996 class pass_oacc_device_lower : public gimple_opt_pass
1998 public:
1999 pass_oacc_device_lower (gcc::context *ctxt)
2000 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
2003 /* opt_pass methods: */
2004 virtual bool gate (function *) { return flag_openacc; };
2006 virtual unsigned int execute (function *)
2008 return execute_oacc_device_lower ();
2011 }; // class pass_oacc_device_lower
2013 } // anon namespace
2015 gimple_opt_pass *
2016 make_pass_oacc_device_lower (gcc::context *ctxt)
2018 return new pass_oacc_device_lower (ctxt);
2022 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
2023 GOMP_SIMT_ENTER call identifying the privatized variables, which are
2024 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
2025 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
2027 static void
2028 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
2030 gimple *alloc_stmt = gsi_stmt (*gsi);
2031 tree simtrec = gimple_call_lhs (alloc_stmt);
2032 tree simduid = gimple_call_arg (alloc_stmt, 0);
2033 gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
2034 gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
2035 tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
2036 TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
2037 TREE_ADDRESSABLE (rectype) = 1;
2038 TREE_TYPE (simtrec) = build_pointer_type (rectype);
2039 for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
2041 tree *argp = gimple_call_arg_ptr (enter_stmt, i);
2042 if (*argp == null_pointer_node)
2043 continue;
2044 gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
2045 && VAR_P (TREE_OPERAND (*argp, 0)));
2046 tree var = TREE_OPERAND (*argp, 0);
2048 tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
2049 DECL_NAME (var), TREE_TYPE (var));
2050 SET_DECL_ALIGN (field, DECL_ALIGN (var));
2051 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
2052 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
2054 insert_field_into_struct (rectype, field);
2056 tree t = build_simple_mem_ref (simtrec);
2057 t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
2058 TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
2059 SET_DECL_VALUE_EXPR (var, t);
2060 DECL_HAS_VALUE_EXPR_P (var) = 1;
2061 *regimplify = true;
2063 layout_type (rectype);
2064 tree size = TYPE_SIZE_UNIT (rectype);
2065 tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
2067 alloc_stmt
2068 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
2069 gimple_call_set_lhs (alloc_stmt, simtrec);
2070 gsi_replace (gsi, alloc_stmt, false);
2071 gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
2072 enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
2073 gsi_replace (&enter_gsi, enter_stmt, false);
2075 use_operand_p use;
2076 gimple *exit_stmt;
2077 if (single_imm_use (simtrec, &use, &exit_stmt))
2079 gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
2080 gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
2081 tree clobber = build_clobber (rectype);
2082 exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
2083 gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
2085 else
2086 gcc_checking_assert (has_zero_uses (simtrec));
2089 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
2091 static tree
2092 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2094 tree t = *tp;
2096 if (VAR_P (t)
2097 && DECL_HAS_VALUE_EXPR_P (t)
2098 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
2100 *walk_subtrees = 0;
2101 return t;
2103 return NULL_TREE;
2106 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
2107 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
2108 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
2109 internal functions on non-SIMT targets, and likewise some SIMD internal
2110 functions on SIMT targets. */
2112 static unsigned int
2113 execute_omp_device_lower ()
2115 int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
2116 bool regimplify = false;
2117 basic_block bb;
2118 gimple_stmt_iterator gsi;
2119 bool calls_declare_variant_alt
2120 = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
2121 FOR_EACH_BB_FN (bb, cfun)
2122 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2124 gimple *stmt = gsi_stmt (gsi);
2125 if (!is_gimple_call (stmt))
2126 continue;
2127 if (!gimple_call_internal_p (stmt))
2129 if (calls_declare_variant_alt)
2130 if (tree fndecl = gimple_call_fndecl (stmt))
2132 tree new_fndecl = omp_resolve_declare_variant (fndecl);
2133 if (new_fndecl != fndecl)
2135 gimple_call_set_fndecl (stmt, new_fndecl);
2136 update_stmt (stmt);
2139 continue;
2141 tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
2142 tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
2143 switch (gimple_call_internal_fn (stmt))
2145 case IFN_GOMP_USE_SIMT:
2146 rhs = vf == 1 ? integer_zero_node : integer_one_node;
2147 break;
2148 case IFN_GOMP_SIMT_ENTER:
2149 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
2150 goto simtreg_enter_exit;
2151 case IFN_GOMP_SIMT_ENTER_ALLOC:
2152 if (vf != 1)
2153 ompdevlow_adjust_simt_enter (&gsi, &regimplify);
2154 rhs = vf == 1 ? null_pointer_node : NULL_TREE;
2155 goto simtreg_enter_exit;
2156 case IFN_GOMP_SIMT_EXIT:
2157 simtreg_enter_exit:
2158 if (vf != 1)
2159 continue;
2160 unlink_stmt_vdef (stmt);
2161 break;
2162 case IFN_GOMP_SIMT_LANE:
2163 case IFN_GOMP_SIMT_LAST_LANE:
2164 rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
2165 break;
2166 case IFN_GOMP_SIMT_VF:
2167 rhs = build_int_cst (type, vf);
2168 break;
2169 case IFN_GOMP_SIMT_ORDERED_PRED:
2170 rhs = vf == 1 ? integer_zero_node : NULL_TREE;
2171 if (rhs || !lhs)
2172 unlink_stmt_vdef (stmt);
2173 break;
2174 case IFN_GOMP_SIMT_VOTE_ANY:
2175 case IFN_GOMP_SIMT_XCHG_BFLY:
2176 case IFN_GOMP_SIMT_XCHG_IDX:
2177 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
2178 break;
2179 case IFN_GOMP_SIMD_LANE:
2180 case IFN_GOMP_SIMD_LAST_LANE:
2181 rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
2182 break;
2183 case IFN_GOMP_SIMD_VF:
2184 rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
2185 break;
2186 default:
2187 continue;
2189 if (lhs && !rhs)
2190 continue;
2191 stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
2192 gsi_replace (&gsi, stmt, false);
2194 if (regimplify)
2195 FOR_EACH_BB_REVERSE_FN (bb, cfun)
2196 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
2197 if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
2199 if (gimple_clobber_p (gsi_stmt (gsi)))
2200 gsi_remove (&gsi, true);
2201 else
2202 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2204 if (vf != 1)
2205 cfun->has_force_vectorize_loops = false;
2206 return 0;
2209 namespace {
2211 const pass_data pass_data_omp_device_lower =
2213 GIMPLE_PASS, /* type */
2214 "ompdevlow", /* name */
2215 OPTGROUP_OMP, /* optinfo_flags */
2216 TV_NONE, /* tv_id */
2217 PROP_cfg, /* properties_required */
2218 PROP_gimple_lomp_dev, /* properties_provided */
2219 0, /* properties_destroyed */
2220 0, /* todo_flags_start */
2221 TODO_update_ssa, /* todo_flags_finish */
2224 class pass_omp_device_lower : public gimple_opt_pass
2226 public:
2227 pass_omp_device_lower (gcc::context *ctxt)
2228 : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
2231 /* opt_pass methods: */
2232 virtual bool gate (function *fun)
2234 return (!(fun->curr_properties & PROP_gimple_lomp_dev)
2235 || (flag_openmp
2236 && cgraph_node::get (fun->decl)->calls_declare_variant_alt));
2238 virtual unsigned int execute (function *)
2240 return execute_omp_device_lower ();
2243 }; // class pass_expand_omp_ssa
2245 } // anon namespace
2247 gimple_opt_pass *
2248 make_pass_omp_device_lower (gcc::context *ctxt)
2250 return new pass_omp_device_lower (ctxt);
2253 /* "omp declare target link" handling pass. */
2255 namespace {
2257 const pass_data pass_data_omp_target_link =
2259 GIMPLE_PASS, /* type */
2260 "omptargetlink", /* name */
2261 OPTGROUP_OMP, /* optinfo_flags */
2262 TV_NONE, /* tv_id */
2263 PROP_ssa, /* properties_required */
2264 0, /* properties_provided */
2265 0, /* properties_destroyed */
2266 0, /* todo_flags_start */
2267 TODO_update_ssa, /* todo_flags_finish */
2270 class pass_omp_target_link : public gimple_opt_pass
2272 public:
2273 pass_omp_target_link (gcc::context *ctxt)
2274 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
2277 /* opt_pass methods: */
2278 virtual bool gate (function *fun)
2280 #ifdef ACCEL_COMPILER
2281 return offloading_function_p (fun->decl);
2282 #else
2283 (void) fun;
2284 return false;
2285 #endif
2288 virtual unsigned execute (function *);
2291 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2293 static tree
2294 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2296 tree t = *tp;
2298 if (VAR_P (t)
2299 && DECL_HAS_VALUE_EXPR_P (t)
2300 && is_global_var (t)
2301 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2303 *walk_subtrees = 0;
2304 return t;
2307 return NULL_TREE;
2310 unsigned
2311 pass_omp_target_link::execute (function *fun)
2313 basic_block bb;
2314 FOR_EACH_BB_FN (bb, fun)
2316 gimple_stmt_iterator gsi;
2317 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2318 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2319 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2322 return 0;
2325 } // anon namespace
2327 gimple_opt_pass *
2328 make_pass_omp_target_link (gcc::context *ctxt)
2330 return new pass_omp_target_link (ctxt);