target/112280 - properly guard permute query
[official-gcc.git] / gcc / omp-offload.cc
blob35313c2ecf3c37bd46ffd0379039562ee7627f87
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
4 Copyright (C) 2005-2024 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 #include "context.h"
56 #include "convert.h"
57 #include "opts.h"
59 /* Describe the OpenACC looping structure of a function. The entire
60 function is held in a 'NULL' loop. */
62 struct oacc_loop
64 oacc_loop *parent; /* Containing loop. */
66 oacc_loop *child; /* First inner loop. */
68 oacc_loop *sibling; /* Next loop within same parent. */
70 location_t loc; /* Location of the loop start. */
72 gcall *marker; /* Initial head marker. */
74 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
75 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
77 tree routine; /* Pseudo-loop enclosing a routine. */
79 unsigned mask; /* Partitioning mask. */
80 unsigned e_mask; /* Partitioning of element loops (when tiling). */
81 unsigned inner; /* Partitioning of inner loops. */
82 unsigned flags; /* Partitioning flags. */
83 vec<gcall *> ifns; /* Contained loop abstraction functions. */
84 tree chunk_size; /* Chunk size. */
85 gcall *head_end; /* Final marker of head sequence. */
88 /* Holds offload tables with decls. */
89 vec<tree, va_gc> *offload_funcs, *offload_vars, *offload_ind_funcs;
91 /* Return level at which oacc routine may spawn a partitioned loop, or
92 -1 if it is not a routine (i.e. is an offload fn). */
94 int
95 oacc_fn_attrib_level (tree attr)
97 tree pos = TREE_VALUE (attr);
99 if (!TREE_PURPOSE (pos))
100 return -1;
102 int ix = 0;
103 for (ix = 0; ix != GOMP_DIM_MAX;
104 ix++, pos = TREE_CHAIN (pos))
105 if (!integer_zerop (TREE_PURPOSE (pos)))
106 break;
108 return ix;
111 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
112 adds their addresses and sizes to constructor-vector V_CTOR. */
114 static void
115 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
116 vec<constructor_elt, va_gc> *v_ctor)
118 unsigned len = vec_safe_length (v_decls);
119 for (unsigned i = 0; i < len; i++)
121 tree it = (*v_decls)[i];
122 bool is_var = VAR_P (it);
123 bool is_link_var
124 = is_var
125 #ifdef ACCEL_COMPILER
126 && DECL_HAS_VALUE_EXPR_P (it)
127 #endif
128 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
130 /* See also omp_finish_file and output_offload_tables in lto-cgraph.cc. */
131 if (!in_lto_p && !symtab_node::get (it))
132 continue;
134 tree size = NULL_TREE;
135 if (is_var)
136 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
138 tree addr;
139 if (!is_link_var)
140 addr = build_fold_addr_expr (it);
141 else
143 #ifdef ACCEL_COMPILER
144 /* For "omp declare target link" vars add address of the pointer to
145 the target table, instead of address of the var. */
146 tree value_expr = DECL_VALUE_EXPR (it);
147 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
148 varpool_node::finalize_decl (link_ptr_decl);
149 addr = build_fold_addr_expr (link_ptr_decl);
150 #else
151 addr = build_fold_addr_expr (it);
152 #endif
154 /* Most significant bit of the size marks "omp declare target link"
155 vars in host and target tables. */
156 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
157 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
158 * BITS_PER_UNIT - 1);
159 size = wide_int_to_tree (const_ptr_type_node, isize);
162 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
163 if (is_var)
164 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
168 /* Return true if DECL is a function for which its references should be
169 analyzed. */
171 static bool
172 omp_declare_target_fn_p (tree decl)
174 return (TREE_CODE (decl) == FUNCTION_DECL
175 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
176 && !lookup_attribute ("omp declare target host",
177 DECL_ATTRIBUTES (decl))
178 && (!flag_openacc
179 || oacc_get_fn_attrib (decl) == NULL_TREE));
182 /* Return true if DECL Is a variable for which its initializer references
183 should be analyzed. */
185 static bool
186 omp_declare_target_var_p (tree decl)
188 return (VAR_P (decl)
189 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
190 && !lookup_attribute ("omp declare target link",
191 DECL_ATTRIBUTES (decl)));
194 /* Helper function for omp_discover_implicit_declare_target, called through
195 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
196 declare target to. */
198 static tree
199 omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
201 if (TREE_CODE (*tp) == CALL_EXPR
202 && CALL_EXPR_FN (*tp)
203 && TREE_CODE (CALL_EXPR_FN (*tp)) == ADDR_EXPR
204 && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp), 0)) == FUNCTION_DECL
205 && lookup_attribute ("omp declare variant base",
206 DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp),
207 0))))
209 tree fn = TREE_OPERAND (CALL_EXPR_FN (*tp), 0);
210 for (tree attr = DECL_ATTRIBUTES (fn); attr; attr = TREE_CHAIN (attr))
212 attr = lookup_attribute ("omp declare variant base", attr);
213 if (attr == NULL_TREE)
214 break;
215 tree purpose = TREE_PURPOSE (TREE_VALUE (attr));
216 if (TREE_CODE (purpose) == FUNCTION_DECL)
217 omp_discover_declare_target_tgt_fn_r (&purpose, walk_subtrees, data);
220 else if (TREE_CODE (*tp) == FUNCTION_DECL)
222 tree decl = *tp;
223 tree id = get_identifier ("omp declare target");
224 symtab_node *node = symtab_node::get (*tp);
225 if (node != NULL)
227 while (node->alias_target
228 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
230 if (!omp_declare_target_fn_p (node->decl)
231 && !lookup_attribute ("omp declare target host",
232 DECL_ATTRIBUTES (node->decl)))
234 node->offloadable = 1;
235 DECL_ATTRIBUTES (node->decl)
236 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
238 node = symtab_node::get (node->alias_target);
240 symtab_node *new_node = node->ultimate_alias_target ();
241 decl = new_node->decl;
242 while (node != new_node)
244 if (!omp_declare_target_fn_p (node->decl)
245 && !lookup_attribute ("omp declare target host",
246 DECL_ATTRIBUTES (node->decl)))
248 node->offloadable = 1;
249 DECL_ATTRIBUTES (node->decl)
250 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
252 gcc_assert (node->alias && node->analyzed);
253 node = node->get_alias_target ();
255 node->offloadable = 1;
256 if (ENABLE_OFFLOADING)
257 g->have_offload = true;
259 if (omp_declare_target_fn_p (decl)
260 || lookup_attribute ("omp declare target host",
261 DECL_ATTRIBUTES (decl)))
262 return NULL_TREE;
264 if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
265 ((vec<tree> *) data)->safe_push (decl);
266 DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
267 DECL_ATTRIBUTES (decl));
269 else if (TYPE_P (*tp))
270 *walk_subtrees = 0;
271 else if (TREE_CODE (*tp) == OMP_TARGET)
273 tree c = omp_find_clause (OMP_CLAUSES (*tp), OMP_CLAUSE_DEVICE);
274 if (c && OMP_CLAUSE_DEVICE_ANCESTOR (c))
275 *walk_subtrees = 0;
277 return NULL_TREE;
280 /* Similarly, but ignore references outside of OMP_TARGET regions. */
282 static tree
283 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
285 if (TREE_CODE (*tp) == OMP_TARGET)
287 tree c = omp_find_clause (OMP_CLAUSES (*tp), OMP_CLAUSE_DEVICE);
288 if (!c || !OMP_CLAUSE_DEVICE_ANCESTOR (c))
289 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
290 omp_discover_declare_target_tgt_fn_r,
291 data);
292 *walk_subtrees = 0;
294 else if (TYPE_P (*tp))
295 *walk_subtrees = 0;
296 return NULL_TREE;
299 /* Helper function for omp_discover_implicit_declare_target, called through
300 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
301 declare target to. */
303 static tree
304 omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
306 if (TREE_CODE (*tp) == FUNCTION_DECL)
307 return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
308 else if (VAR_P (*tp)
309 && is_global_var (*tp)
310 && !omp_declare_target_var_p (*tp))
312 tree id = get_identifier ("omp declare target");
313 if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
315 error_at (DECL_SOURCE_LOCATION (*tp),
316 "%qD specified both in declare target %<link%> and "
317 "implicitly in %<to%> clauses", *tp);
318 DECL_ATTRIBUTES (*tp)
319 = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
321 if (TREE_STATIC (*tp) && lang_hooks.decls.omp_get_decl_init (*tp))
322 ((vec<tree> *) data)->safe_push (*tp);
323 DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
324 symtab_node *node = symtab_node::get (*tp);
325 if (node != NULL && !node->offloadable)
327 node->offloadable = 1;
328 if (ENABLE_OFFLOADING)
330 g->have_offload = true;
331 if (is_a <varpool_node *> (node))
332 vec_safe_push (offload_vars, node->decl);
336 else if (TYPE_P (*tp))
337 *walk_subtrees = 0;
338 return NULL_TREE;
341 /* Perform the OpenMP implicit declare target to discovery. */
343 void
344 omp_discover_implicit_declare_target (void)
346 cgraph_node *node;
347 varpool_node *vnode;
348 auto_vec<tree> worklist;
350 FOR_EACH_DEFINED_FUNCTION (node)
351 if (DECL_SAVED_TREE (node->decl))
353 struct cgraph_node *cgn;
354 if (lookup_attribute ("omp declare target indirect",
355 DECL_ATTRIBUTES (node->decl)))
356 vec_safe_push (offload_ind_funcs, node->decl);
357 if (omp_declare_target_fn_p (node->decl))
358 worklist.safe_push (node->decl);
359 else if (DECL_STRUCT_FUNCTION (node->decl)
360 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
361 worklist.safe_push (node->decl);
362 for (cgn = first_nested_function (node);
363 cgn; cgn = next_nested_function (cgn))
364 if (omp_declare_target_fn_p (cgn->decl))
365 worklist.safe_push (cgn->decl);
366 else if (DECL_STRUCT_FUNCTION (cgn->decl)
367 && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
368 worklist.safe_push (cgn->decl);
370 FOR_EACH_VARIABLE (vnode)
371 if (lang_hooks.decls.omp_get_decl_init (vnode->decl)
372 && omp_declare_target_var_p (vnode->decl))
373 worklist.safe_push (vnode->decl);
374 while (!worklist.is_empty ())
376 tree decl = worklist.pop ();
377 if (VAR_P (decl))
378 walk_tree_without_duplicates (lang_hooks.decls.omp_get_decl_init (decl),
379 omp_discover_declare_target_var_r,
380 &worklist);
381 else if (omp_declare_target_fn_p (decl))
382 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
383 omp_discover_declare_target_tgt_fn_r,
384 &worklist);
385 else
386 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
387 omp_discover_declare_target_fn_r,
388 &worklist);
391 lang_hooks.decls.omp_finish_decl_inits ();
395 /* Create new symbols containing (address, size) pairs for global variables,
396 marked with "omp declare target" attribute, as well as addresses for the
397 functions, which are outlined offloading regions. */
398 void
399 omp_finish_file (void)
401 unsigned num_funcs = vec_safe_length (offload_funcs);
402 unsigned num_vars = vec_safe_length (offload_vars);
403 unsigned num_ind_funcs = vec_safe_length (offload_ind_funcs);
405 if (num_funcs == 0 && num_vars == 0 && num_ind_funcs == 0)
406 return;
408 if (targetm_common.have_named_sections)
410 vec<constructor_elt, va_gc> *v_f, *v_v, *v_if;
411 vec_alloc (v_f, num_funcs);
412 vec_alloc (v_v, num_vars * 2);
413 vec_alloc (v_if, num_ind_funcs);
415 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
416 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
417 add_decls_addresses_to_decl_constructor (offload_ind_funcs, v_if);
419 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
420 vec_safe_length (v_v));
421 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
422 num_funcs);
423 tree ind_funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
424 num_ind_funcs);
426 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
427 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
428 SET_TYPE_ALIGN (ind_funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
429 tree ctor_v = build_constructor (vars_decl_type, v_v);
430 tree ctor_f = build_constructor (funcs_decl_type, v_f);
431 tree ctor_if = build_constructor (ind_funcs_decl_type, v_if);
432 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = TREE_CONSTANT (ctor_if) = 1;
433 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = TREE_STATIC (ctor_if) = 1;
434 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
435 get_identifier (".offload_func_table"),
436 funcs_decl_type);
437 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
438 get_identifier (".offload_var_table"),
439 vars_decl_type);
440 tree ind_funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
441 get_identifier (".offload_ind_func_table"),
442 ind_funcs_decl_type);
443 TREE_STATIC (funcs_decl) = TREE_STATIC (ind_funcs_decl) = 1;
444 TREE_STATIC (vars_decl) = 1;
445 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
446 otherwise a joint table in a binary will contain padding between
447 tables from multiple object files. */
448 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (ind_funcs_decl) = 1;
449 DECL_USER_ALIGN (vars_decl) = 1;
450 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
451 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
452 SET_DECL_ALIGN (ind_funcs_decl, TYPE_ALIGN (ind_funcs_decl_type));
453 DECL_INITIAL (funcs_decl) = ctor_f;
454 DECL_INITIAL (vars_decl) = ctor_v;
455 DECL_INITIAL (ind_funcs_decl) = ctor_if;
456 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
457 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
458 set_decl_section_name (ind_funcs_decl,
459 OFFLOAD_IND_FUNC_TABLE_SECTION_NAME);
460 varpool_node::finalize_decl (vars_decl);
461 varpool_node::finalize_decl (funcs_decl);
462 varpool_node::finalize_decl (ind_funcs_decl);
464 else
466 for (unsigned i = 0; i < num_funcs; i++)
468 tree it = (*offload_funcs)[i];
469 /* See also add_decls_addresses_to_decl_constructor
470 and output_offload_tables in lto-cgraph.cc. */
471 if (!in_lto_p && !symtab_node::get (it))
472 continue;
473 targetm.record_offload_symbol (it);
475 for (unsigned i = 0; i < num_vars; i++)
477 tree it = (*offload_vars)[i];
478 if (!in_lto_p && !symtab_node::get (it))
479 continue;
480 #ifdef ACCEL_COMPILER
481 if (DECL_HAS_VALUE_EXPR_P (it)
482 && lookup_attribute ("omp declare target link",
483 DECL_ATTRIBUTES (it)))
485 tree value_expr = DECL_VALUE_EXPR (it);
486 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
487 targetm.record_offload_symbol (link_ptr_decl);
488 varpool_node::finalize_decl (link_ptr_decl);
490 else
491 #endif
492 targetm.record_offload_symbol (it);
494 for (unsigned i = 0; i < num_ind_funcs; i++)
496 tree it = (*offload_ind_funcs)[i];
497 /* See also add_decls_addresses_to_decl_constructor
498 and output_offload_tables in lto-cgraph.cc. */
499 if (!in_lto_p && !symtab_node::get (it))
500 continue;
501 targetm.record_offload_symbol (it);
506 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
507 axis DIM. Return a tmp var holding the result. */
509 static tree
510 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
512 tree arg = build_int_cst (unsigned_type_node, dim);
513 tree size = create_tmp_var (integer_type_node);
514 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
515 gimple *call = gimple_build_call_internal (fn, 1, arg);
517 gimple_call_set_lhs (call, size);
518 gimple_seq_add_stmt (seq, call);
520 return size;
523 /* Find the number of threads (POS = false), or thread number (POS =
524 true) for an OpenACC region partitioned as MASK. Setup code
525 required for the calculation is added to SEQ. */
527 static tree
528 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
530 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
531 unsigned ix;
533 /* Start at gang level, and examine relevant dimension indices. */
534 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
535 if (GOMP_DIM_MASK (ix) & mask)
537 if (res)
539 /* We had an outer index, so scale that by the size of
540 this dimension. */
541 tree n = oacc_dim_call (false, ix, seq);
542 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
544 if (pos)
546 /* Determine index in this dimension. */
547 tree id = oacc_dim_call (true, ix, seq);
548 if (res)
549 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
550 else
551 res = id;
555 if (res == NULL_TREE)
556 res = integer_zero_node;
558 return res;
561 /* Transform IFN_GOACC_LOOP calls to actual code. See
562 expand_oacc_for for where these are generated. At the vector
563 level, we stride loops, such that each member of a warp will
564 operate on adjacent iterations. At the worker and gang level,
565 each gang/warp executes a set of contiguous iterations. Chunking
566 can override this such that each iteration engine executes a
567 contiguous chunk, and then moves on to stride to the next chunk. */
569 static void
570 oacc_xform_loop (gcall *call)
572 gimple_stmt_iterator gsi = gsi_for_stmt (call);
573 enum ifn_goacc_loop_kind code
574 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
575 tree dir = gimple_call_arg (call, 1);
576 tree range = gimple_call_arg (call, 2);
577 tree step = gimple_call_arg (call, 3);
578 tree chunk_size = NULL_TREE;
579 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
580 tree lhs = gimple_call_lhs (call);
581 tree type = NULL_TREE;
582 tree diff_type = TREE_TYPE (range);
583 tree r = NULL_TREE;
584 gimple_seq seq = NULL;
585 bool chunking = false, striding = true;
586 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
587 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
589 /* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
590 if (!lhs)
592 gsi_replace_with_seq (&gsi, seq, true);
593 return;
596 type = TREE_TYPE (lhs);
598 #ifdef ACCEL_COMPILER
599 chunk_size = gimple_call_arg (call, 4);
600 if (integer_minus_onep (chunk_size) /* Force static allocation. */
601 || integer_zerop (chunk_size)) /* Default (also static). */
603 /* If we're at the gang level, we want each to execute a
604 contiguous run of iterations. Otherwise we want each element
605 to stride. */
606 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
607 chunking = false;
609 else
611 /* Chunk of size 1 is striding. */
612 striding = integer_onep (chunk_size);
613 chunking = !striding;
615 #endif
617 /* striding=true, chunking=true
618 -> invalid.
619 striding=true, chunking=false
620 -> chunks=1
621 striding=false,chunking=true
622 -> chunks=ceil (range/(chunksize*threads*step))
623 striding=false,chunking=false
624 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
625 push_gimplify_context (true);
627 switch (code)
629 default: gcc_unreachable ();
631 case IFN_GOACC_LOOP_CHUNKS:
632 if (!chunking)
633 r = build_int_cst (type, 1);
634 else
636 /* chunk_max
637 = (range - dir) / (chunks * step * num_threads) + dir */
638 tree per = oacc_thread_numbers (false, mask, &seq);
639 per = fold_convert (type, per);
640 chunk_size = fold_convert (type, chunk_size);
641 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
642 per = fold_build2 (MULT_EXPR, type, per, step);
643 r = build2 (MINUS_EXPR, type, range, dir);
644 r = build2 (PLUS_EXPR, type, r, per);
645 r = build2 (TRUNC_DIV_EXPR, type, r, per);
647 break;
649 case IFN_GOACC_LOOP_STEP:
651 /* If striding, step by the entire compute volume, otherwise
652 step by the inner volume. */
653 unsigned volume = striding ? mask : inner_mask;
655 r = oacc_thread_numbers (false, volume, &seq);
656 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
658 break;
660 case IFN_GOACC_LOOP_OFFSET:
661 /* Enable vectorization on non-SIMT targets. */
662 if (!targetm.simt.vf
663 && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
664 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
665 the loop. */
666 && (flag_tree_loop_vectorize
667 || !OPTION_SET_P (flag_tree_loop_vectorize)))
669 basic_block bb = gsi_bb (gsi);
670 class loop *parent = bb->loop_father;
671 class loop *body = parent->inner;
673 parent->force_vectorize = true;
674 parent->safelen = INT_MAX;
676 /* "Chunking loops" may have inner loops. */
677 if (parent->inner)
679 body->force_vectorize = true;
680 body->safelen = INT_MAX;
683 cfun->has_force_vectorize_loops = true;
685 if (striding)
687 r = oacc_thread_numbers (true, mask, &seq);
688 r = fold_convert (diff_type, r);
690 else
692 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
693 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
694 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
695 inner_size, outer_size);
697 volume = fold_convert (diff_type, volume);
698 if (chunking)
699 chunk_size = fold_convert (diff_type, chunk_size);
700 else
702 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
704 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
705 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
706 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
709 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
710 fold_convert (diff_type, inner_size));
711 r = oacc_thread_numbers (true, outer_mask, &seq);
712 r = fold_convert (diff_type, r);
713 r = build2 (MULT_EXPR, diff_type, r, span);
715 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
716 inner = fold_convert (diff_type, inner);
717 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
719 if (chunking)
721 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
722 tree per
723 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
724 per = build2 (MULT_EXPR, diff_type, per, chunk);
726 r = build2 (PLUS_EXPR, diff_type, r, per);
729 r = fold_build2 (MULT_EXPR, diff_type, r, step);
730 if (type != diff_type)
731 r = fold_convert (type, r);
732 break;
734 case IFN_GOACC_LOOP_BOUND:
735 if (striding)
736 r = range;
737 else
739 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
740 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
741 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
742 inner_size, outer_size);
744 volume = fold_convert (diff_type, volume);
745 if (chunking)
746 chunk_size = fold_convert (diff_type, chunk_size);
747 else
749 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
751 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
752 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
753 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
756 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
757 fold_convert (diff_type, inner_size));
759 r = fold_build2 (MULT_EXPR, diff_type, span, step);
761 tree offset = gimple_call_arg (call, 6);
762 r = build2 (PLUS_EXPR, diff_type, r,
763 fold_convert (diff_type, offset));
764 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
765 diff_type, r, range);
767 if (diff_type != type)
768 r = fold_convert (type, r);
769 break;
772 gimplify_assign (lhs, r, &seq);
774 pop_gimplify_context (NULL);
776 gsi_replace_with_seq (&gsi, seq, true);
779 /* Transform a GOACC_TILE call. Determines the element loop span for
780 the specified loop of the nest. This is 1 if we're not tiling.
782 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
784 static void
785 oacc_xform_tile (gcall *call)
787 gimple_stmt_iterator gsi = gsi_for_stmt (call);
788 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
789 /* Inner loops have higher loop_nos. */
790 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
791 tree tile_size = gimple_call_arg (call, 2);
792 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
793 tree lhs = gimple_call_lhs (call);
794 tree type = TREE_TYPE (lhs);
795 gimple_seq seq = NULL;
796 tree span = build_int_cst (type, 1);
798 gcc_assert (!(e_mask
799 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
800 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
801 push_gimplify_context (!seen_error ());
803 #ifndef ACCEL_COMPILER
804 /* Partitioning disabled on host compilers. */
805 e_mask = 0;
806 #endif
807 if (!e_mask)
808 /* Not paritioning. */
809 span = integer_one_node;
810 else if (!integer_zerop (tile_size))
811 /* User explicitly specified size. */
812 span = tile_size;
813 else
815 /* Pick a size based on the paritioning of the element loop and
816 the number of loop nests. */
817 tree first_size = NULL_TREE;
818 tree second_size = NULL_TREE;
820 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
821 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
822 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
823 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
825 if (!first_size)
827 first_size = second_size;
828 second_size = NULL_TREE;
831 if (loop_no + 1 == collapse)
833 span = first_size;
834 if (!loop_no && second_size)
835 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
836 span, second_size);
838 else if (loop_no + 2 == collapse)
839 span = second_size;
840 else
841 span = NULL_TREE;
843 if (!span)
844 /* There's no obvious element size for this loop. Options
845 are 1, first_size or some non-unity constant (32 is my
846 favourite). We should gather some statistics. */
847 span = first_size;
850 span = fold_convert (type, span);
851 gimplify_assign (lhs, span, &seq);
853 pop_gimplify_context (NULL);
855 gsi_replace_with_seq (&gsi, seq, true);
858 /* Default partitioned and minimum partitioned dimensions. */
860 static int oacc_default_dims[GOMP_DIM_MAX];
861 static int oacc_min_dims[GOMP_DIM_MAX];
864 oacc_get_default_dim (int dim)
866 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
867 return oacc_default_dims[dim];
871 oacc_get_min_dim (int dim)
873 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
874 return oacc_min_dims[dim];
877 /* Parse the default dimension parameter. This is a set of
878 :-separated optional compute dimensions. Each specified dimension
879 is a positive integer. When device type support is added, it is
880 planned to be a comma separated list of such compute dimensions,
881 with all but the first prefixed by the colon-terminated device
882 type. */
884 static void
885 oacc_parse_default_dims (const char *dims)
887 int ix;
889 for (ix = GOMP_DIM_MAX; ix--;)
891 oacc_default_dims[ix] = -1;
892 oacc_min_dims[ix] = 1;
895 #ifndef ACCEL_COMPILER
896 /* Cannot be overridden on the host. */
897 dims = NULL;
898 #endif
899 if (dims)
901 const char *pos = dims;
903 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
905 if (ix)
907 if (*pos != ':')
908 goto malformed;
909 pos++;
912 if (*pos != ':')
914 long val;
915 const char *eptr;
917 errno = 0;
918 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
919 if (errno || val <= 0 || (int) val != val)
920 goto malformed;
921 pos = eptr;
922 oacc_default_dims[ix] = (int) val;
925 if (*pos)
927 malformed:
928 error_at (UNKNOWN_LOCATION,
929 "%<-fopenacc-dim%> operand is malformed at %qs", pos);
933 /* Allow the backend to validate the dimensions. */
934 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
935 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
938 /* Validate and update the dimensions for offloaded FN. ATTRS is the
939 raw attribute. DIMS is an array of dimensions, which is filled in.
940 LEVEL is the partitioning level of a routine, or -1 for an offload
941 region itself. USED is the mask of partitioned execution in the
942 function. */
944 static void
945 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
947 tree purpose[GOMP_DIM_MAX];
948 unsigned ix;
949 tree pos = TREE_VALUE (attrs);
951 /* Make sure the attribute creator attached the dimension
952 information. */
953 gcc_assert (pos);
955 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
957 purpose[ix] = TREE_PURPOSE (pos);
958 tree val = TREE_VALUE (pos);
959 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
960 pos = TREE_CHAIN (pos);
963 bool check = true;
964 #ifdef ACCEL_COMPILER
965 check = false;
966 #endif
967 if (check
968 && warn_openacc_parallelism
969 && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn)))
971 static char const *const axes[] =
972 /* Must be kept in sync with GOMP_DIM enumeration. */
973 { "gang", "worker", "vector" };
974 for (ix = level >= 0 ? level : 0; ix != GOMP_DIM_MAX; ix++)
975 if (dims[ix] < 0)
976 ; /* Defaulting axis. */
977 else if ((used & GOMP_DIM_MASK (ix)) && dims[ix] == 1)
978 /* There is partitioned execution, but the user requested a
979 dimension size of 1. They're probably confused. */
980 warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
981 "region contains %s partitioned code but"
982 " is not %s partitioned", axes[ix], axes[ix]);
983 else if (!(used & GOMP_DIM_MASK (ix)) && dims[ix] != 1)
984 /* The dimension is explicitly partitioned to non-unity, but
985 no use is made within the region. */
986 warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
987 "region is %s partitioned but"
988 " does not contain %s partitioned code",
989 axes[ix], axes[ix]);
992 bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
994 /* Default anything left to 1 or a partitioned default. */
995 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
996 if (dims[ix] < 0)
998 /* The OpenACC spec says 'If the [num_gangs] clause is not
999 specified, an implementation-defined default will be used;
1000 the default may depend on the code within the construct.'
1001 (2.5.6). Thus an implementation is free to choose
1002 non-unity default for a parallel region that doesn't have
1003 any gang-partitioned loops. However, it appears that there
1004 is a sufficient body of user code that expects non-gang
1005 partitioned regions to not execute in gang-redundant mode.
1006 So we (a) don't warn about the non-portability and (b) pick
1007 the minimum permissible dimension size when there is no
1008 partitioned execution. Otherwise we pick the global
1009 default for the dimension, which the user can control. The
1010 same wording and logic applies to num_workers and
1011 vector_length, however the worker- or vector- single
1012 execution doesn't have the same impact as gang-redundant
1013 execution. (If the minimum gang-level partioning is not 1,
1014 the target is probably too confusing.) */
1015 dims[ix] = (used & GOMP_DIM_MASK (ix)
1016 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
1017 changed = true;
1020 if (changed)
1022 /* Replace the attribute with new values. */
1023 pos = NULL_TREE;
1024 for (ix = GOMP_DIM_MAX; ix--;)
1025 pos = tree_cons (purpose[ix],
1026 build_int_cst (integer_type_node, dims[ix]), pos);
1027 oacc_replace_fn_attrib (fn, pos);
1031 /* Create an empty OpenACC loop structure at LOC. */
1033 static oacc_loop *
1034 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
1036 oacc_loop *loop = XCNEW (oacc_loop);
1038 loop->parent = parent;
1040 if (parent)
1042 loop->sibling = parent->child;
1043 parent->child = loop;
1046 loop->loc = loc;
1047 return loop;
1050 /* Create an outermost, dummy OpenACC loop for offloaded function
1051 DECL. */
1053 static oacc_loop *
1054 new_oacc_loop_outer (tree decl)
1056 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
1059 /* Start a new OpenACC loop structure beginning at head marker HEAD.
1060 Link into PARENT loop. Return the new loop. */
1062 static oacc_loop *
1063 new_oacc_loop (oacc_loop *parent, gcall *marker)
1065 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
1067 loop->marker = marker;
1069 /* TODO: This is where device_type flattening would occur for the loop
1070 flags. */
1072 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
1074 tree chunk_size = integer_zero_node;
1075 if (loop->flags & OLF_GANG_STATIC)
1076 chunk_size = gimple_call_arg (marker, 4);
1077 loop->chunk_size = chunk_size;
1079 return loop;
1082 /* Create a dummy loop encompassing a call to a openACC routine.
1083 Extract the routine's partitioning requirements. */
1085 static void
1086 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
1088 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
1089 int level = oacc_fn_attrib_level (attrs);
1091 gcc_assert (level >= 0);
1093 loop->marker = call;
1094 loop->routine = decl;
1095 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
1096 ^ (GOMP_DIM_MASK (level) - 1));
1099 /* Finish off the current OpenACC loop ending at tail marker TAIL.
1100 Return the parent loop. */
1102 static oacc_loop *
1103 finish_oacc_loop (oacc_loop *loop)
1105 /* If the loop has been collapsed, don't partition it. */
1106 if (loop->ifns.is_empty ())
1107 loop->mask = loop->flags = 0;
1108 return loop->parent;
1111 /* Free all OpenACC loop structures within LOOP (inclusive). */
1113 static void
1114 free_oacc_loop (oacc_loop *loop)
1116 if (loop->sibling)
1117 free_oacc_loop (loop->sibling);
1118 if (loop->child)
1119 free_oacc_loop (loop->child);
1121 loop->ifns.release ();
1122 free (loop);
1125 /* Dump out the OpenACC loop head or tail beginning at FROM. */
1127 static void
1128 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
1129 const char *title, int level)
1131 enum ifn_unique_kind kind
1132 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1134 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
1135 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1137 gimple *stmt = gsi_stmt (gsi);
1139 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1141 enum ifn_unique_kind k
1142 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
1143 (gimple_call_arg (stmt, 0)));
1145 if (k == kind && stmt != from)
1146 break;
1148 print_gimple_stmt (file, stmt, depth * 2 + 2);
1150 gsi_next (&gsi);
1151 while (gsi_end_p (gsi))
1152 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1156 /* Dump OpenACC loop LOOP, its children, and its siblings. */
1158 static void
1159 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
1161 int ix;
1163 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
1164 loop->flags, loop->mask,
1165 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
1167 if (loop->marker)
1168 print_gimple_stmt (file, loop->marker, depth * 2);
1170 if (loop->routine)
1171 fprintf (file, "%*sRoutine %s:%u:%s\n",
1172 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
1173 DECL_SOURCE_LINE (loop->routine),
1174 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
1176 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
1177 if (loop->heads[ix])
1178 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
1179 for (ix = GOMP_DIM_MAX; ix--;)
1180 if (loop->tails[ix])
1181 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
1183 if (loop->child)
1184 dump_oacc_loop (file, loop->child, depth + 1);
1185 if (loop->sibling)
1186 dump_oacc_loop (file, loop->sibling, depth);
1189 void debug_oacc_loop (oacc_loop *);
1191 /* Dump loops to stderr. */
1193 DEBUG_FUNCTION void
1194 debug_oacc_loop (oacc_loop *loop)
1196 dump_oacc_loop (stderr, loop, 0);
1199 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1200 siblings. */
1202 static void
1203 inform_oacc_loop (const oacc_loop *loop)
1205 const char *gang
1206 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1207 const char *worker
1208 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1209 const char *vector
1210 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1211 const char *seq = loop->mask == 0 ? " seq" : "";
1212 const dump_user_location_t loc
1213 = dump_user_location_t::from_location_t (loop->loc);
1214 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1215 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1216 vector, seq);
1218 if (loop->child)
1219 inform_oacc_loop (loop->child);
1220 if (loop->sibling)
1221 inform_oacc_loop (loop->sibling);
1224 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
1225 structures as we go. By construction these loops are properly
1226 nested.

   LOOP is the loop structure the walk is currently inside.  Calls to
   functions carrying an OpenACC function attribute become dummy routine
   loops, IFN_GOACC_LOOP and IFN_GOACC_TILE calls are recorded in
   LOOP's ifns vector, and IFN_UNIQUE head/tail markers open and close
   nested loop structures.  Blocks already flagged BB_VISITED are not
   walked again.  */
1228 static void
1229 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
/* MARKER is the number of partitioning markers seen so far in the
   head/tail sequence being scanned; REMAINING mirrors the countdown
   carried in argument 2 of those markers.  Both are zero when no
   sequence is open.  */
1231 int marker = 0;
1232 int remaining = 0;
1234 if (bb->flags & BB_VISITED)
1235 return;
/* Re-entered via 'goto follow' when a marker sequence continues into
   the single successor block.  */
1237 follow:
1238 bb->flags |= BB_VISITED;
1240 /* Scan for loop markers. */
1241 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1242 gsi_next (&gsi))
1244 gimple *stmt = gsi_stmt (gsi);
1246 if (!is_gimple_call (stmt))
1247 continue;
1249 gcall *call = as_a <gcall *> (stmt);
1251 /* If this is a routine, make a dummy loop for it. */
1252 if (tree decl = gimple_call_fndecl (call))
1253 if (tree attrs = oacc_get_fn_attrib (decl))
/* A routine call is not expected in the middle of a marker sequence.  */
1255 gcc_assert (!marker);
1256 new_oacc_loop_routine (loop, call, decl, attrs);
/* Everything below only concerns internal function calls.  */
1259 if (!gimple_call_internal_p (call))
1260 continue;
1262 switch (gimple_call_internal_fn (call))
1264 default:
1265 break;
1267 case IFN_GOACC_LOOP:
1268 case IFN_GOACC_TILE:
1269 /* Record the abstraction function, so we can manipulate it
1270 later. */
1271 loop->ifns.safe_push (call);
1272 break;
1274 case IFN_UNIQUE:
/* Argument 0 of an IFN_UNIQUE call selects its kind; only head and
   tail marks are of interest here.  */
1275 enum ifn_unique_kind kind
1276 = (enum ifn_unique_kind) (TREE_INT_CST_LOW
1277 (gimple_call_arg (call, 0)));
1278 if (kind == IFN_UNIQUE_OACC_HEAD_MARK
1279 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
/* A two-argument marker terminates the current sequence.  After a
   tail sequence we return to the parent loop; after a head sequence
   we remember where the head ends.  */
1281 if (gimple_call_num_args (call) == 2)
1283 gcc_assert (marker && !remaining);
1284 marker = 0;
1285 if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
1286 loop = finish_oacc_loop (loop);
1287 else
1288 loop->head_end = call;
1290 else
/* Otherwise argument 2 carries the countdown for this marker.  The
   first marker of a sequence initializes REMAINING, and for a head
   sequence it also creates the new loop.  */
1292 int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
1294 if (!marker)
1296 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1297 loop = new_oacc_loop (loop, call);
1298 remaining = count;
1300 gcc_assert (count == remaining);
/* Heads are stored in the order they are encountered; tails are
   indexed by the countdown value after it has been decremented.  */
1301 if (remaining)
1303 remaining--;
1304 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1305 loop->heads[marker] = call;
1306 else
1307 loop->tails[remaining] = call;
1309 marker++;
/* The marker sequence is still open at the end of this block, so it
   has to continue in the unique successor, which must have this block
   as its only predecessor and must not have been visited yet.  */
1314 if (remaining || marker)
1316 bb = single_succ (bb);
1317 gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
1318 goto follow;
1321 /* Walk successor blocks. */
1322 edge e;
1323 edge_iterator ei;
1325 FOR_EACH_EDGE (e, ei, bb->succs)
1326 oacc_loop_discover_walk (loop, e->dest);
1329 /* LOOP is the first sibling. Reverse the order in place and return
1330 the new first sibling. Recurse to child loops. */
1332 static oacc_loop *
1333 oacc_loop_sibling_nreverse (oacc_loop *loop)
1335 oacc_loop *last = NULL;
1338 if (loop->child)
1339 loop->child = oacc_loop_sibling_nreverse (loop->child);
1341 oacc_loop *next = loop->sibling;
1342 loop->sibling = last;
1343 last = loop;
1344 loop = next;
1346 while (loop);
1348 return last;
1351 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1352 the current function. */
1354 static oacc_loop *
1355 oacc_loop_discovery ()
1357 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1358 in the following. */
1359 clear_bb_flags ();
1361 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1362 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1364 /* The siblings were constructed in reverse order, reverse them so
1365 that diagnostics come out in an unsurprising order. */
1366 top = oacc_loop_sibling_nreverse (top);
1368 return top;
1371 /* Transform the abstract internal function markers starting at FROM
1372 to be for partitioning level LEVEL. Stop when we meet another HEAD
1373 or TAIL marker. */
1375 static void
1376 oacc_loop_xform_head_tail (gcall *from, int level)
1378 enum ifn_unique_kind kind
1379 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1380 tree replacement = build_int_cst (unsigned_type_node, level);
1382 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1384 gimple *stmt = gsi_stmt (gsi);
1386 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1388 enum ifn_unique_kind k
1389 = ((enum ifn_unique_kind)
1390 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1392 if (k == IFN_UNIQUE_OACC_FORK
1393 || k == IFN_UNIQUE_OACC_JOIN
1394 || k == IFN_UNIQUE_OACC_PRIVATE)
1395 *gimple_call_arg_ptr (stmt, 2) = replacement;
1396 else if (k == kind && stmt != from)
1397 break;
1399 else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1400 *gimple_call_arg_ptr (stmt, 3) = replacement;
1401 update_stmt (stmt);
1403 gsi_next (&gsi);
1404 while (gsi_end_p (gsi))
1405 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1409 /* Process the discovered OpenACC loops, setting the correct
1410 partitioning level etc. */
1412 static void
1413 oacc_loop_process (oacc_loop *loop, int fn_level)
1415 if (loop->child)
1416 oacc_loop_process (loop->child, fn_level);
1418 if (loop->mask && !loop->routine)
1420 int ix;
1421 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1422 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1423 tree chunk_arg = loop->chunk_size;
1424 gcall *call;
1426 for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1428 switch (gimple_call_internal_fn (call))
1430 case IFN_GOACC_LOOP:
1432 bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1433 gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1434 if (!is_e)
1435 gimple_call_set_arg (call, 4, chunk_arg);
1437 break;
1439 case IFN_GOACC_TILE:
1440 gimple_call_set_arg (call, 3, mask_arg);
1441 gimple_call_set_arg (call, 4, e_mask_arg);
1442 break;
1444 default:
1445 gcc_unreachable ();
1447 update_stmt (call);
1450 unsigned dim = GOMP_DIM_GANG;
1451 unsigned mask = loop->mask | loop->e_mask;
1452 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1454 while (!(GOMP_DIM_MASK (dim) & mask))
1455 dim++;
1457 oacc_loop_xform_head_tail (loop->heads[ix], dim);
1458 oacc_loop_xform_head_tail (loop->tails[ix], dim);
1460 mask ^= GOMP_DIM_MASK (dim);
1464 if (loop->sibling)
1465 oacc_loop_process (loop->sibling, fn_level);
1468 /* OpenACC 2.6, 2.9.11. "reduction clause" places a restriction such that
1469 "The 'reduction' clause may not be specified on an orphaned 'loop'
1470 construct with the 'gang' clause, or on an orphaned 'loop' construct that
1471 will generate gang parallelism in a procedure that is compiled with the
1472 'routine gang' clause." */
1473 if (fn_level == GOMP_DIM_GANG
1474 && (loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1475 && (loop->flags & OLF_REDUCTION))
1476 error_at (loop->loc,
1477 "gang reduction on an orphan loop");
1480 /* Walk the OpenACC loop hierarchy checking and assigning the
1481 programmer-specified partitionings. OUTER_MASK is the partitioning
1482 this loop is contained within. Return mask of partitioning
1483 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1484 bit.

   As side effects LOOP's mask, e_mask and inner fields are set, and
   conflicting specifiers are diagnosed and removed from LOOP's flags.
   Children and siblings of LOOP are visited recursively.  */
1486 static unsigned
1487 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1489 unsigned this_mask = loop->mask;
1490 unsigned mask_all = 0;
1491 bool noisy = true;
1493 #ifdef ACCEL_COMPILER
1494 /* When device_type is supported, we want the device compiler to be
1495 noisy, if the loop parameters are device_type-specific.
   NOTE(review): as written, diagnostics are switched off when
   ACCEL_COMPILER is defined; the sentence above appears to describe a
   future refinement -- confirm.  */
1496 noisy = false;
1497 #endif
/* Routine calls keep the mask assigned when their dummy loop was
   created; real loops derive theirs from the loop flags.  */
1499 if (!loop->routine)
1501 bool auto_par = (loop->flags & OLF_AUTO) != 0;
1502 bool seq_par = (loop->flags & OLF_SEQ) != 0;
1503 bool tiling = (loop->flags & OLF_TILE) != 0;
/* Extract the explicitly requested axes from the flags.  */
1505 this_mask = ((loop->flags >> OLF_DIM_BASE)
1506 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1508 /* Apply auto partitioning if this is a non-partitioned regular
1509 loop, or (no more than) single axis tiled loop.
   'this_mask & -this_mask' isolates the lowest set bit, so for a tiled
   loop the comparison holds when at most one axis is requested.  */
1510 bool maybe_auto
1511 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
/* More than one of: explicit axes, 'auto', 'seq'.  'seq' wins and
   discards the explicit axes; otherwise 'auto' is dropped.  */
1513 if ((this_mask != 0) + auto_par + seq_par > 1)
1515 if (noisy)
1516 error_at (loop->loc,
1517 seq_par
1518 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1519 : G_("%<auto%> conflicts with other OpenACC loop "
1520 "specifiers"));
1521 maybe_auto = false;
1522 loop->flags &= ~OLF_AUTO;
1523 if (seq_par)
1525 loop->flags
1526 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1527 this_mask = 0;
/* An independent loop that qualified above becomes an auto loop; the
   GOMP_DIM_MAX bit in the result tells the caller that auto
   partitioning is needed.  */
1531 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1533 loop->flags |= OLF_AUTO;
1534 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
/* The loop asks for an axis that is already in use around it.  Look
   for the containing loop responsible; if there is none, the
   restriction comes from the containing routine (OUTER_MASK).  */
1538 if (this_mask & outer_mask)
1540 const oacc_loop *outer;
1541 for (outer = loop->parent; outer; outer = outer->parent)
1542 if ((outer->mask | outer->e_mask) & this_mask)
1543 break;
1545 if (noisy)
1547 if (outer)
1549 error_at (loop->loc,
1550 loop->routine
1551 ? G_("routine call uses same OpenACC parallelism"
1552 " as containing loop")
1553 : G_("inner loop uses same OpenACC parallelism"
1554 " as containing loop"));
1555 inform (outer->loc, "containing loop here");
1557 else
1558 error_at (loop->loc,
1559 loop->routine
1560 ? G_("routine call uses OpenACC parallelism disallowed"
1561 " by containing routine")
1562 : G_("loop uses OpenACC parallelism disallowed"
1563 " by containing routine"));
1565 if (loop->routine)
1566 inform (DECL_SOURCE_LOCATION (loop->routine),
1567 "routine %qD declared here", loop->routine);
/* Drop the conflicting axes and carry on.  */
1569 this_mask &= ~outer_mask;
1571 else
/* No direct clash.  Still reject a loop whose lowest requested axis
   bit does not exceed the axes already used outside it.  */
1573 unsigned outermost = least_bit_hwi (this_mask);
1575 if (outermost && outermost <= outer_mask)
1577 if (noisy)
1579 error_at (loop->loc,
1580 "incorrectly nested OpenACC loop parallelism");
1582 const oacc_loop *outer;
1583 for (outer = loop->parent;
1584 outer->flags && outer->flags < outermost;
1585 outer = outer->parent)
1586 continue;
1587 inform (outer->loc, "containing loop here");
1590 this_mask &= ~outermost;
1594 mask_all |= this_mask;
1596 if (loop->flags & OLF_TILE)
1598 /* When tiling, vector goes to the element loop, and failing
1599 that we put worker there. The std doesn't contemplate
1600 specifying all three. We choose to put worker and vector on
1601 the element loops in that case. */
1602 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1603 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1604 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
/* Axes given to the element loop are removed from the tile loop.  */
1606 loop->e_mask = this_e_mask;
1607 this_mask ^= this_e_mask;
1610 loop->mask = this_mask;
1612 if (dump_file)
1613 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1614 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1615 loop->mask, loop->e_mask);
/* Children see everything used outside this loop plus what this loop
   uses itself; what they report back is remembered in LOOP->inner.  */
1617 if (loop->child)
1619 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1620 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1621 mask_all |= loop->inner;
/* Siblings share this loop's containing partitioning.  */
1624 if (loop->sibling)
1625 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1627 return mask_all;
1630 /* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
1631 OUTER_MASK is the partitioning this loop is contained within.
1632 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1633 Return the cumulative partitioning used by this loop, siblings and
1634 children.

   Assignment happens in two phases around the recursion into the
   children: an outermost-available axis may be picked before the
   children are visited, and an innermost-available axis after their
   usage is known.  */
1636 static unsigned
1637 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1638 bool outer_assign)
/* Only loops that are both 'auto' and 'independent' get axes here.  */
1640 bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1641 bool noisy = true;
1642 bool tiling = loop->flags & OLF_TILE;
1644 #ifdef ACCEL_COMPILER
1645 /* When device_type is supported, we want the device compiler to be
1646 noisy, if the loop parameters are device_type-specific.
   NOTE(review): as written, warnings are switched off when
   ACCEL_COMPILER is defined -- confirm the intended wording.  */
1647 noisy = false;
1648 #endif
/* Phase 1, before visiting the children.  */
1650 if (assign && (!outer_assign || loop->inner))
1652 /* Allocate outermost and non-innermost loops at the outermost
1653 non-innermost available level. */
1654 unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1656 /* Find the first outermost available partition. */
1657 while (this_mask <= outer_mask)
1658 this_mask <<= 1;
1660 /* Grab two axes if tiling, and we've not assigned anything */
1661 if (tiling && !(loop->mask | loop->e_mask))
1662 this_mask |= this_mask << 1;
1664 /* Prohibit the innermost partitioning at the moment. */
1665 this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1667 /* Don't use any dimension explicitly claimed by an inner loop. */
1668 this_mask &= ~loop->inner;
1670 if (tiling && !loop->e_mask)
1672 /* If we got two axes, allocate the inner one to the element
1673 loop.
   'this_mask & (this_mask << 1)' is non-zero only when two adjacent
   bits are set, and then it is the higher of the two.  */
1674 loop->e_mask = this_mask & (this_mask << 1);
1675 this_mask ^= loop->e_mask;
1678 loop->mask |= this_mask;
/* Visit the children with everything used so far; the result replaces
   LOOP->inner.  */
1681 if (loop->child)
1683 unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1684 loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1685 outer_assign | assign);
/* Phase 2, after the children's usage is known.  */
1688 if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1690 /* Allocate the loop at the innermost available level. Note
1691 that we do this even if we already assigned this loop the
1692 outermost available level above. That way we'll partition
1693 this along 2 axes, if they are available. */
1694 unsigned this_mask = 0;
1696 /* Determine the outermost partitioning used within this loop.
   The GOMP_DIM_MAX bit acts as a sentinel for "nothing used".  */
1697 this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1698 this_mask = least_bit_hwi (this_mask);
1700 /* Pick the partitioning just inside that one. */
1701 this_mask >>= 1;
1703 /* And avoid picking one use by an outer loop. */
1704 this_mask &= ~outer_mask;
1706 /* If tiling and we failed completely above, grab the next one
1707 too. Making sure it doesn't hit an outer loop. */
1708 if (tiling)
1710 this_mask &= ~(loop->e_mask | loop->mask);
1711 unsigned tile_mask = ((this_mask >> 1)
1712 & ~(outer_mask | loop->e_mask | loop->mask));
/* When a second axis is available, or the tile loop already has one,
   the axis just picked goes to the element loop instead.  */
1714 if (tile_mask || loop->mask)
1716 loop->e_mask |= this_mask;
1717 this_mask = tile_mask;
1719 if (!loop->e_mask && noisy)
1720 warning_at (loop->loc, 0,
1721 "insufficient partitioning available"
1722 " to parallelize element loop");
1725 loop->mask |= this_mask;
1726 if (!loop->mask && noisy)
1727 warning_at (loop->loc, 0,
1728 tiling
1729 ? G_("insufficient partitioning available"
1730 " to parallelize tile loop")
1731 : G_("insufficient partitioning available"
1732 " to parallelize loop"));
1735 if (assign && dump_file)
1736 fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1737 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1738 loop->mask, loop->e_mask);
/* Accumulate what the siblings use, then add this loop's own usage
   and that of its children.  */
1740 unsigned inner_mask = 0;
1742 if (loop->sibling)
1743 inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1744 outer_mask, outer_assign);
1746 inner_mask |= loop->inner | loop->mask | loop->e_mask;
1748 return inner_mask;
1751 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1752 axes. Return mask of partitioning. */
1754 static unsigned
1755 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1757 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1759 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1761 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1762 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1764 return mask_all;
1767 /* Default fork/join early expander. Delete the function calls if
1768 there is no RTL expander. */
1770 bool
1771 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1772 const int *ARG_UNUSED (dims), bool is_fork)
1774 if (is_fork)
1775 return targetm.have_oacc_fork ();
1776 else
1777 return targetm.have_oacc_join ();
1780 /* Default goacc.reduction early expander.
1782 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1783 If RES_PTR is not integer-zerop:
1784 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1785 TEARDOWN - emit '*RES_PTR = VAR'
1786 If LHS is not NULL
1787 emit 'LHS = VAR' */
1789 void
1790 default_goacc_reduction (gcall *call)
1792 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1793 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1794 tree lhs = gimple_call_lhs (call);
1795 tree var = gimple_call_arg (call, 2);
1796 gimple_seq seq = NULL;
1798 if (code == IFN_GOACC_REDUCTION_SETUP
1799 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1801 /* Setup and Teardown need to copy from/to the receiver object,
1802 if there is one. */
1803 tree ref_to_res = gimple_call_arg (call, 1);
1805 if (!integer_zerop (ref_to_res))
1807 tree dst = build_simple_mem_ref (ref_to_res);
1808 tree src = var;
1810 if (code == IFN_GOACC_REDUCTION_SETUP)
1812 src = dst;
1813 dst = lhs;
1814 lhs = NULL;
1816 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1820 /* Copy VAR to LHS, if there is an LHS. */
1821 if (lhs)
1822 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1824 gsi_replace_with_seq (&gsi, seq, true);
/* State handed to oacc_rewrite_var_decl through the 'info' field of the
   walk_stmt_info it receives.  */
1827 struct var_decl_rewrite_info
/* Statement being rewritten; helper statements are inserted before
   it.  */
1829 gimple *stmt;
/* Map consulted for every candidate decl; a hit yields the decl to use
   instead.  */
1830 hash_map<tree, tree> *adjusted_vars;
/* If true, a matching ADDR_EXPR is replaced directly by the address of
   the replacement decl; otherwise the address is converted to the
   original pointer type through two inserted assignments.  */
1831 bool avoid_pointer_conversion;
/* Set to true by the walker whenever it rewrites something.  */
1832 bool modified;
1835 /* Helper function for execute_oacc_device_lower. Rewrite VAR_DECLs (by
1836 themselves or wrapped in various other nodes) according to ADJUSTED_VARS in
1837 the var_decl_rewrite_info pointed to via DATA. Used as part of coercing
1838 gang-private variables in OpenACC offload regions to reside in GPU shared
1839 memory.

   TP is the operand being visited, WALK_SUBTREES is cleared when the
   operands of *TP must not be visited, and DATA is a walk_stmt_info
   whose 'info' field points to the var_decl_rewrite_info.  Always
   returns NULL, so the walk is never cut short from here.  */
1841 static tree
1842 oacc_rewrite_var_decl (tree *tp, int *walk_subtrees, void *data)
1844 walk_stmt_info *wi = (walk_stmt_info *) data;
1845 var_decl_rewrite_info *info = (var_decl_rewrite_info *) wi->info;
/* Case 1: the address of an adjusted variable.  */
1847 if (TREE_CODE (*tp) == ADDR_EXPR)
1849 tree arg = TREE_OPERAND (*tp, 0);
1850 tree *new_arg = info->adjusted_vars->get (arg);
1852 if (new_arg)
/* Either take the replacement's address directly ...  */
1854 if (info->avoid_pointer_conversion)
1856 *tp = build_fold_addr_expr (*new_arg);
1857 info->modified = true;
1858 *walk_subtrees = 0;
1860 else
/* ... or compute it into a fresh SSA name, convert that to the type of
   the original expression in a second SSA name, insert both
   assignments before the statement, and use the converted value.  */
1862 gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt);
1863 tree repl = build_fold_addr_expr (*new_arg);
1864 gimple *stmt1
1865 = gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl);
1866 tree conv = convert_to_pointer (TREE_TYPE (*tp),
1867 gimple_assign_lhs (stmt1));
1868 gimple *stmt2
1869 = gimple_build_assign (make_ssa_name (TREE_TYPE (*tp)), conv);
1870 gsi_insert_before (&gsi, stmt1, GSI_SAME_STMT);
1871 gsi_insert_before (&gsi, stmt2, GSI_SAME_STMT);
1872 *tp = gimple_assign_lhs (stmt2);
1873 info->modified = true;
1874 *walk_subtrees = 0;
/* Case 2: a component or array reference.  Strip nested references to
   find the base; only a VAR_DECL base that has a replacement is
   rewritten.  */
1878 else if (TREE_CODE (*tp) == COMPONENT_REF || TREE_CODE (*tp) == ARRAY_REF)
1880 tree *base = &TREE_OPERAND (*tp, 0);
1882 while (TREE_CODE (*base) == COMPONENT_REF
1883 || TREE_CODE (*base) == ARRAY_REF)
1884 base = &TREE_OPERAND (*base, 0);
1886 if (TREE_CODE (*base) != VAR_DECL)
1887 return NULL;
1889 tree *new_decl = info->adjusted_vars->get (*base);
1890 if (!new_decl)
1891 return NULL;
/* The type qualifiers of the replacement decl are merged into the
   field type and into the type of the reference itself.  */
1893 int base_quals = TYPE_QUALS (TREE_TYPE (*new_decl));
1894 tree field = TREE_OPERAND (*tp, 1);
1896 /* Adjust the type of the field.
   For array-typed fields the innermost element type is the one that
   gets re-qualified.  Note that the FIELD_DECL's type is updated in
   place.  */
1897 int field_quals = TYPE_QUALS (TREE_TYPE (field));
1898 if (TREE_CODE (field) == FIELD_DECL && field_quals != base_quals)
1900 tree *field_type = &TREE_TYPE (field);
1901 while (TREE_CODE (*field_type) == ARRAY_TYPE)
1902 field_type = &TREE_TYPE (*field_type);
1903 field_quals |= base_quals;
1904 *field_type = build_qualified_type (*field_type, field_quals);
1907 /* Adjust the type of the component ref itself. */
1908 tree comp_type = TREE_TYPE (*tp);
1909 int comp_quals = TYPE_QUALS (comp_type);
1910 if (TREE_CODE (*tp) == COMPONENT_REF && comp_quals != base_quals)
1912 comp_quals |= base_quals;
1913 TREE_TYPE (*tp)
1914 = build_qualified_type (comp_type, comp_quals);
/* Finally substitute the base variable.  */
1917 *base = *new_decl;
1918 info->modified = true;
/* Case 3: a bare variable with a replacement.  */
1920 else if (VAR_P (*tp))
1922 tree *new_decl = info->adjusted_vars->get (*tp);
1923 if (new_decl)
1925 *tp = *new_decl;
1926 info->modified = true;
1930 return NULL_TREE;
1933 /* Return TRUE if CALL is a call to a builtin atomic/sync operation. */
1935 static bool
1936 is_sync_builtin_call (gcall *call)
1938 tree callee = gimple_call_fndecl (call);
1940 if (callee != NULL_TREE
1941 && gimple_call_builtin_p (call, BUILT_IN_NORMAL))
1942 switch (DECL_FUNCTION_CODE (callee))
1944 #undef DEF_SYNC_BUILTIN
1945 #define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
1946 #include "sync-builtins.def"
1947 #undef DEF_SYNC_BUILTIN
1948 return true;
1950 default:
1954 return false;
1957 /* Main entry point for oacc transformations which run on the device
1958 compiler after LTO, so we know what the target device is at this
1959 point (including the host fallback). */
1961 static unsigned int
1962 execute_oacc_loop_designation ()
1964 tree attrs = oacc_get_fn_attrib (current_function_decl);
1966 if (!attrs)
1967 /* Not an offloaded function. */
1968 return 0;
1970 /* Parse the default dim argument exactly once. */
1971 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1973 oacc_parse_default_dims (flag_openacc_dims);
1974 flag_openacc_dims = (char *)&flag_openacc_dims;
1977 bool is_oacc_parallel
1978 = (lookup_attribute ("oacc parallel",
1979 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1980 bool is_oacc_kernels
1981 = (lookup_attribute ("oacc kernels",
1982 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1983 bool is_oacc_serial
1984 = (lookup_attribute ("oacc serial",
1985 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1986 bool is_oacc_parallel_kernels_parallelized
1987 = (lookup_attribute ("oacc parallel_kernels_parallelized",
1988 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1989 bool is_oacc_parallel_kernels_gang_single
1990 = (lookup_attribute ("oacc parallel_kernels_gang_single",
1991 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1992 int fn_level = oacc_fn_attrib_level (attrs);
1993 bool is_oacc_routine = (fn_level >= 0);
1994 gcc_checking_assert (is_oacc_parallel
1995 + is_oacc_kernels
1996 + is_oacc_serial
1997 + is_oacc_parallel_kernels_parallelized
1998 + is_oacc_parallel_kernels_gang_single
1999 + is_oacc_routine
2000 == 1);
2002 bool is_oacc_kernels_parallelized
2003 = (lookup_attribute ("oacc kernels parallelized",
2004 DECL_ATTRIBUTES (current_function_decl)) != NULL);
2005 if (is_oacc_kernels_parallelized)
2006 gcc_checking_assert (is_oacc_kernels);
2008 if (dump_file)
2010 if (is_oacc_parallel)
2011 fprintf (dump_file, "Function is OpenACC parallel offload\n");
2012 else if (is_oacc_kernels)
2013 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
2014 (is_oacc_kernels_parallelized
2015 ? "parallelized" : "unparallelized"));
2016 else if (is_oacc_serial)
2017 fprintf (dump_file, "Function is OpenACC serial offload\n");
2018 else if (is_oacc_parallel_kernels_parallelized)
2019 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
2020 "parallel_kernels_parallelized");
2021 else if (is_oacc_parallel_kernels_gang_single)
2022 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
2023 "parallel_kernels_gang_single");
2024 else if (is_oacc_routine)
2025 fprintf (dump_file, "Function is OpenACC routine level %d\n",
2026 fn_level);
2027 else
2028 gcc_unreachable ();
2031 /* This doesn't belong into 'pass_oacc_loop_designation' conceptually, but
2032 it's a convenient place, so... */
2033 if (is_oacc_routine)
2035 tree attr = lookup_attribute ("omp declare target",
2036 DECL_ATTRIBUTES (current_function_decl));
2037 gcc_checking_assert (attr);
2038 tree clauses = TREE_VALUE (attr);
2039 gcc_checking_assert (clauses);
2041 /* Should this OpenACC routine be discarded? */
2042 bool discard = false;
2044 tree clause_nohost = omp_find_clause (clauses, OMP_CLAUSE_NOHOST);
2045 if (dump_file)
2046 fprintf (dump_file,
2047 "OpenACC routine '%s' %s '%s' clause.\n",
2048 lang_hooks.decl_printable_name (current_function_decl, 2),
2049 clause_nohost ? "has" : "doesn't have",
2050 omp_clause_code_name[OMP_CLAUSE_NOHOST]);
2051 /* Host compiler, 'nohost' clause? */
2052 #ifndef ACCEL_COMPILER
2053 if (clause_nohost)
2054 discard = true;
2055 #endif
2057 if (dump_file)
2058 fprintf (dump_file,
2059 "OpenACC routine '%s' %sdiscarded.\n",
2060 lang_hooks.decl_printable_name (current_function_decl, 2),
2061 discard ? "" : "not ");
2062 if (discard)
2064 TREE_ASM_WRITTEN (current_function_decl) = 1;
2065 return TODO_discard_function;
2069 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
2070 kernels, so remove the parallelism dimensions function attributes
2071 potentially set earlier on. */
2072 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
2074 oacc_set_fn_attrib (current_function_decl, NULL, NULL);
2075 attrs = oacc_get_fn_attrib (current_function_decl);
2078 /* Discover, partition and process the loops. */
2079 oacc_loop *loops = oacc_loop_discovery ();
2081 unsigned outer_mask = 0;
2082 if (is_oacc_routine)
2083 outer_mask = GOMP_DIM_MASK (fn_level) - 1;
2084 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
2085 /* OpenACC kernels constructs are special: they currently don't use the
2086 generic oacc_loop infrastructure and attribute/dimension processing. */
2087 if (is_oacc_kernels && is_oacc_kernels_parallelized)
2089 /* Parallelized OpenACC kernels constructs use gang parallelism. See
2090 also tree-parloops.cc:create_parallel_loop. */
2091 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
2094 int dims[GOMP_DIM_MAX];
2095 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
2097 if (dump_file)
2099 const char *comma = "Compute dimensions [";
2100 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
2101 fprintf (dump_file, "%s%d", comma, dims[ix]);
2102 fprintf (dump_file, "]\n");
2105 /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
2106 a single gang only. */
2107 if (is_oacc_parallel_kernels_gang_single)
2108 gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);
2110 oacc_loop_process (loops, fn_level);
2111 if (dump_file)
2113 fprintf (dump_file, "OpenACC loops\n");
2114 dump_oacc_loop (dump_file, loops, 0);
2115 fprintf (dump_file, "\n");
2117 if (dump_enabled_p ())
2119 oacc_loop *l = loops;
2120 /* OpenACC kernels constructs are special: they currently don't use the
2121 generic oacc_loop infrastructure. */
2122 if (is_oacc_kernels)
2124 /* Create a fake oacc_loop for diagnostic purposes. */
2125 l = new_oacc_loop_raw (NULL,
2126 DECL_SOURCE_LOCATION (current_function_decl));
2127 l->mask = used_mask;
2129 else
2131 /* Skip the outermost, dummy OpenACC loop */
2132 l = l->child;
2134 if (l)
2135 inform_oacc_loop (l);
2136 if (is_oacc_kernels)
2137 free_oacc_loop (l);
2140 free_oacc_loop (loops);
2142 return 0;
2145 static unsigned int
2146 execute_oacc_device_lower ()
2148 tree attrs = oacc_get_fn_attrib (current_function_decl);
2150 if (!attrs)
2151 /* Not an offloaded function. */
2152 return 0;
2154 int dims[GOMP_DIM_MAX];
2155 for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
2156 dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
2158 hash_map<tree, tree> adjusted_vars;
2160 /* Now lower internal loop functions to target-specific code
2161 sequences. */
2162 basic_block bb;
2163 FOR_ALL_BB_FN (bb, cfun)
2164 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
2166 gimple *stmt = gsi_stmt (gsi);
2167 if (!is_gimple_call (stmt))
2169 gsi_next (&gsi);
2170 continue;
2173 gcall *call = as_a <gcall *> (stmt);
2174 if (!gimple_call_internal_p (call))
2176 gsi_next (&gsi);
2177 continue;
2180 /* Rewind to allow rescan. */
2181 gsi_prev (&gsi);
2182 bool rescan = false, remove = false;
2183 enum internal_fn ifn_code = gimple_call_internal_fn (call);
2185 switch (ifn_code)
2187 default: break;
2189 case IFN_GOACC_TILE:
2190 oacc_xform_tile (call);
2191 rescan = true;
2192 break;
2194 case IFN_GOACC_LOOP:
2195 oacc_xform_loop (call);
2196 rescan = true;
2197 break;
2199 case IFN_GOACC_REDUCTION:
2200 /* Mark the function for SSA renaming. */
2201 mark_virtual_operands_for_renaming (cfun);
2203 /* If the level is -1, this ended up being an unused
2204 axis. Handle as a default. */
2205 if (integer_minus_onep (gimple_call_arg (call, 3)))
2206 default_goacc_reduction (call);
2207 else
2208 targetm.goacc.reduction (call);
2209 rescan = true;
2210 break;
2212 case IFN_UNIQUE:
2214 enum ifn_unique_kind kind
2215 = ((enum ifn_unique_kind)
2216 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
2218 switch (kind)
2220 default:
2221 break;
2223 case IFN_UNIQUE_OACC_FORK:
2224 case IFN_UNIQUE_OACC_JOIN:
2225 if (integer_minus_onep (gimple_call_arg (call, 2)))
2226 remove = true;
2227 else if (!targetm.goacc.fork_join
2228 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
2229 remove = true;
2230 break;
2232 case IFN_UNIQUE_OACC_HEAD_MARK:
2233 case IFN_UNIQUE_OACC_TAIL_MARK:
2234 remove = true;
2235 break;
2237 case IFN_UNIQUE_OACC_PRIVATE:
2239 dump_flags_t l_dump_flags
2240 = get_openacc_privatization_dump_flags ();
2242 location_t loc = gimple_location (stmt);
2243 if (LOCATION_LOCUS (loc) == UNKNOWN_LOCATION)
2244 loc = DECL_SOURCE_LOCATION (current_function_decl);
2245 const dump_user_location_t d_u_loc
2246 = dump_user_location_t::from_location_t (loc);
2248 HOST_WIDE_INT level
2249 = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
2250 gcc_checking_assert (level == -1
2251 || (level >= 0
2252 && level < GOMP_DIM_MAX));
2253 for (unsigned i = 3;
2254 i < gimple_call_num_args (call);
2255 i++)
2257 static char const *const axes[] =
2258 /* Must be kept in sync with GOMP_DIM enumeration. */
2259 { "gang", "worker", "vector" };
2261 tree arg = gimple_call_arg (call, i);
2262 gcc_checking_assert (TREE_CODE (arg) == ADDR_EXPR);
2263 tree decl = TREE_OPERAND (arg, 0);
2264 if (dump_enabled_p ())
2265 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
2266 #if __GNUC__ >= 10
2267 # pragma GCC diagnostic push
2268 # pragma GCC diagnostic ignored "-Wformat"
2269 #endif
2270 dump_printf_loc (l_dump_flags, d_u_loc,
2271 "variable %<%T%> ought to be"
2272 " adjusted for OpenACC"
2273 " privatization level: %qs\n",
2274 decl,
2275 (level == -1
2276 ? "UNKNOWN" : axes[level]));
2277 #if __GNUC__ >= 10
2278 # pragma GCC diagnostic pop
2279 #endif
2280 bool adjusted;
2281 if (level == -1)
2282 adjusted = false;
2283 else if (!targetm.goacc.adjust_private_decl)
2284 adjusted = false;
2285 else if (level == GOMP_DIM_VECTOR)
2287 /* That's the default behavior. */
2288 adjusted = true;
2290 else
2292 tree oldtype = TREE_TYPE (decl);
2293 tree newdecl
2294 = targetm.goacc.adjust_private_decl (loc, decl,
2295 level);
2296 adjusted = (TREE_TYPE (newdecl) != oldtype
2297 || newdecl != decl);
2298 if (adjusted)
2299 adjusted_vars.put (decl, newdecl);
2301 if (adjusted
2302 && dump_enabled_p ())
2303 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
2304 #if __GNUC__ >= 10
2305 # pragma GCC diagnostic push
2306 # pragma GCC diagnostic ignored "-Wformat"
2307 #endif
2308 dump_printf_loc (l_dump_flags, d_u_loc,
2309 "variable %<%T%> adjusted for"
2310 " OpenACC privatization level:"
2311 " %qs\n",
2312 decl, axes[level]);
2313 #if __GNUC__ >= 10
2314 # pragma GCC diagnostic pop
2315 #endif
2317 remove = true;
2319 break;
2321 break;
2325 if (gsi_end_p (gsi))
2326 /* We rewound past the beginning of the BB. */
2327 gsi = gsi_start_bb (bb);
2328 else
2329 /* Undo the rewind. */
2330 gsi_next (&gsi);
2332 if (remove)
2334 if (gimple_vdef (call))
2335 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
2336 if (gimple_call_lhs (call))
2338 /* Propagate the data dependency var. */
2339 gimple *ass = gimple_build_assign (gimple_call_lhs (call),
2340 gimple_call_arg (call, 1));
2341 gsi_replace (&gsi, ass, false);
2343 else
2344 gsi_remove (&gsi, true);
2346 else if (!rescan)
2347 /* If not rescanning, advance over the call. */
2348 gsi_next (&gsi);
2351 /* Regarding the OpenACC privatization level, we're currently only looking at
2352 making the gang-private level work. Regarding that, we have the following
2353 configurations:
2355 - GCN offloading: 'targetm.goacc.adjust_private_decl' does the work (in
2356 particular, change 'TREE_TYPE', etc.) and there is no
2357 'targetm.goacc.expand_var_decl'.
2359 - nvptx offloading: 'targetm.goacc.adjust_private_decl' only sets a
2360 marker and then 'targetm.goacc.expand_var_decl' does the work.
2362 Eventually (in particular, for worker-private level?), both
2363 'targetm.goacc.adjust_private_decl' and 'targetm.goacc.expand_var_decl'
2364 may need to do things, but that's currently not meant to be addressed, and
2365 thus not fully worked out and implemented, and thus untested. Hence,
2366 'assert' what currently is implemented/tested, only. */
2368 if (targetm.goacc.expand_var_decl)
2369 gcc_assert (adjusted_vars.is_empty ());
2371 /* Make adjustments to gang-private local variables if required by the
2372 target, e.g. forcing them into a particular address space. Afterwards,
2373 ADDR_EXPR nodes which have adjusted variables as their argument need to
2374 be modified in one of two ways:
2376 1. They can be recreated, making a pointer to the variable in the new
2377 address space, or
2379 2. The address of the variable in the new address space can be taken,
2380 converted to the default (original) address space, and the result of
2381 that conversion subsituted in place of the original ADDR_EXPR node.
2383 Which of these is done depends on the gimple statement being processed.
2384 At present atomic operations and inline asms use (1), and everything else
2385 uses (2). At least on AMD GCN, there are atomic operations that work
2386 directly in the LDS address space.
2388 COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use
2389 the new decl, adjusting types of appropriate tree nodes as necessary. */
2391 if (targetm.goacc.adjust_private_decl
2392 && !adjusted_vars.is_empty ())
2394 FOR_ALL_BB_FN (bb, cfun)
2395 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
2396 !gsi_end_p (gsi);
2397 gsi_next (&gsi))
2399 gimple *stmt = gsi_stmt (gsi);
2400 walk_stmt_info wi;
2401 var_decl_rewrite_info info;
2403 info.avoid_pointer_conversion
2404 = (is_gimple_call (stmt)
2405 && is_sync_builtin_call (as_a <gcall *> (stmt)))
2406 || gimple_code (stmt) == GIMPLE_ASM;
2407 info.stmt = stmt;
2408 info.modified = false;
2409 info.adjusted_vars = &adjusted_vars;
2411 memset (&wi, 0, sizeof (wi));
2412 wi.info = &info;
2414 walk_gimple_op (stmt, oacc_rewrite_var_decl, &wi);
2416 if (info.modified)
2417 update_stmt (stmt);
2421 return 0;
2424 /* Default launch dimension validator. Force everything to 1. A
2425 backend that wants to provide larger dimensions must override this
2426 hook. */
2428 bool
2429 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
2430 int ARG_UNUSED (fn_level),
2431 unsigned ARG_UNUSED (used))
2433 bool changed = false;
2435 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
2437 if (dims[ix] != 1)
2439 dims[ix] = 1;
2440 changed = true;
2444 return changed;
2447 /* Default dimension bound is unknown on accelerator and 1 on host. */
2450 default_goacc_dim_limit (int ARG_UNUSED (axis))
2452 #ifdef ACCEL_COMPILER
2453 return 0;
2454 #else
2455 return 1;
2456 #endif
2459 namespace {
2461 const pass_data pass_data_oacc_loop_designation =
2463 GIMPLE_PASS, /* type */
2464 "oaccloops", /* name */
2465 OPTGROUP_OMP, /* optinfo_flags */
2466 TV_NONE, /* tv_id */
2467 PROP_cfg, /* properties_required */
2468 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
2469 0, /* properties_destroyed */
2470 0, /* todo_flags_start */
2471 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
2474 class pass_oacc_loop_designation : public gimple_opt_pass
2476 public:
2477 pass_oacc_loop_designation (gcc::context *ctxt)
2478 : gimple_opt_pass (pass_data_oacc_loop_designation, ctxt)
2481 /* opt_pass methods: */
2482 bool gate (function *) final override { return flag_openacc; };
2484 unsigned int execute (function *) final override
2486 return execute_oacc_loop_designation ();
2489 }; // class pass_oacc_loop_designation
2491 const pass_data pass_data_oacc_device_lower =
2493 GIMPLE_PASS, /* type */
2494 "oaccdevlow", /* name */
2495 OPTGROUP_OMP, /* optinfo_flags */
2496 TV_NONE, /* tv_id */
2497 PROP_cfg, /* properties_required */
2498 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
2499 0, /* properties_destroyed */
2500 0, /* todo_flags_start */
2501 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
2504 class pass_oacc_device_lower : public gimple_opt_pass
2506 public:
2507 pass_oacc_device_lower (gcc::context *ctxt)
2508 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
2511 /* opt_pass methods: */
2512 bool gate (function *) final override { return flag_openacc; };
2514 unsigned int execute (function *) final override
2516 return execute_oacc_device_lower ();
2519 }; // class pass_oacc_device_lower
2521 } // anon namespace
2523 gimple_opt_pass *
2524 make_pass_oacc_loop_designation (gcc::context *ctxt)
2526 return new pass_oacc_loop_designation (ctxt);
2529 gimple_opt_pass *
2530 make_pass_oacc_device_lower (gcc::context *ctxt)
2532 return new pass_oacc_device_lower (ctxt);
2536 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
2537 GOMP_SIMT_ENTER call identifying the privatized variables, which are
2538 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
2539 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
2541 static void
2542 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
2544 gimple *alloc_stmt = gsi_stmt (*gsi);
2545 tree simtrec = gimple_call_lhs (alloc_stmt);
2546 tree simduid = gimple_call_arg (alloc_stmt, 0);
2547 gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
2548 gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
2549 tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
2550 TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
2551 TREE_ADDRESSABLE (rectype) = 1;
2552 TREE_TYPE (simtrec) = build_pointer_type (rectype);
2553 for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
2555 tree *argp = gimple_call_arg_ptr (enter_stmt, i);
2556 if (*argp == null_pointer_node)
2557 continue;
2558 gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
2559 && VAR_P (TREE_OPERAND (*argp, 0)));
2560 tree var = TREE_OPERAND (*argp, 0);
2562 tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
2563 DECL_NAME (var), TREE_TYPE (var));
2564 SET_DECL_ALIGN (field, DECL_ALIGN (var));
2565 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
2566 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
2568 insert_field_into_struct (rectype, field);
2570 tree t = build_simple_mem_ref (simtrec);
2571 t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
2572 TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
2573 SET_DECL_VALUE_EXPR (var, t);
2574 DECL_HAS_VALUE_EXPR_P (var) = 1;
2575 *regimplify = true;
2577 layout_type (rectype);
2578 tree size = TYPE_SIZE_UNIT (rectype);
2579 tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
2581 alloc_stmt
2582 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
2583 gimple_call_set_lhs (alloc_stmt, simtrec);
2584 gsi_replace (gsi, alloc_stmt, false);
2585 gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
2586 enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
2587 gsi_replace (&enter_gsi, enter_stmt, false);
2589 use_operand_p use;
2590 gimple *exit_stmt;
2591 if (single_imm_use (simtrec, &use, &exit_stmt))
2593 gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
2594 gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
2595 tree clobber = build_clobber (rectype);
2596 exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
2597 gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
2599 else
2600 gcc_checking_assert (has_zero_uses (simtrec));
2603 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
2605 static tree
2606 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2608 tree t = *tp;
2610 if (VAR_P (t)
2611 && DECL_HAS_VALUE_EXPR_P (t)
2612 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
2614 *walk_subtrees = 0;
2615 return t;
2617 return NULL_TREE;
2620 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
2621 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
2622 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
2623 internal functions on non-SIMT targets, and likewise some SIMD internal
2624 functions on SIMT targets. */
2626 static unsigned int
2627 execute_omp_device_lower ()
2629 int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
2630 bool regimplify = false;
2631 basic_block bb;
2632 gimple_stmt_iterator gsi;
2633 bool calls_declare_variant_alt
2634 = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
2635 #ifdef ACCEL_COMPILER
2636 bool omp_redirect_indirect_calls = vec_safe_length (offload_ind_funcs) > 0;
2637 tree map_ptr_fn
2638 = builtin_decl_explicit (BUILT_IN_GOMP_TARGET_MAP_INDIRECT_PTR);
2639 #endif
2640 FOR_EACH_BB_FN (bb, cfun)
2641 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2643 gimple *stmt = gsi_stmt (gsi);
2644 if (!is_gimple_call (stmt))
2645 continue;
2646 if (!gimple_call_internal_p (stmt))
2648 if (calls_declare_variant_alt)
2649 if (tree fndecl = gimple_call_fndecl (stmt))
2651 tree new_fndecl = omp_resolve_declare_variant (fndecl);
2652 if (new_fndecl != fndecl)
2654 gimple_call_set_fndecl (stmt, new_fndecl);
2655 update_stmt (stmt);
2658 #ifdef ACCEL_COMPILER
2659 if (omp_redirect_indirect_calls
2660 && gimple_call_fndecl (stmt) == NULL_TREE)
2662 gcall *orig_call = dyn_cast <gcall *> (stmt);
2663 tree call_fn = gimple_call_fn (stmt);
2664 tree fn_ty = TREE_TYPE (call_fn);
2666 if (TREE_CODE (call_fn) == OBJ_TYPE_REF)
2668 tree obj_ref = create_tmp_reg (TREE_TYPE (call_fn),
2669 ".ind_fn_objref");
2670 gimple *gassign = gimple_build_assign (obj_ref, call_fn);
2671 gsi_insert_before (&gsi, gassign, GSI_SAME_STMT);
2672 call_fn = obj_ref;
2674 tree mapped_fn = create_tmp_reg (fn_ty, ".ind_fn");
2675 gimple *gcall =
2676 gimple_build_call (map_ptr_fn, 1, call_fn);
2677 gimple_set_location (gcall, gimple_location (stmt));
2678 gimple_call_set_lhs (gcall, mapped_fn);
2679 gsi_insert_before (&gsi, gcall, GSI_SAME_STMT);
2681 gimple_call_set_fn (orig_call, mapped_fn);
2682 update_stmt (orig_call);
2684 #endif
2685 continue;
2687 tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
2688 tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
2689 switch (gimple_call_internal_fn (stmt))
2691 case IFN_GOMP_TARGET_REV:
2693 #ifndef ACCEL_COMPILER
2694 gimple_stmt_iterator gsi2 = gsi;
2695 gsi_next (&gsi2);
2696 gcc_assert (!gsi_end_p (gsi2));
2697 gcc_assert (gimple_call_builtin_p (gsi_stmt (gsi2),
2698 BUILT_IN_GOMP_TARGET));
2699 tree old_decl
2700 = TREE_OPERAND (gimple_call_arg (gsi_stmt (gsi2), 1), 0);
2701 tree new_decl = gimple_call_arg (gsi_stmt (gsi), 0);
2702 gimple_call_set_arg (gsi_stmt (gsi2), 1, new_decl);
2703 update_stmt (gsi_stmt (gsi2));
2704 new_decl = TREE_OPERAND (new_decl, 0);
2705 unsigned i;
2706 unsigned num_funcs = vec_safe_length (offload_funcs);
2707 for (i = 0; i < num_funcs; i++)
2709 if ((*offload_funcs)[i] == old_decl)
2711 (*offload_funcs)[i] = new_decl;
2712 break;
2714 else if ((*offload_funcs)[i] == new_decl)
2715 break; /* This can happen due to inlining. */
2717 gcc_assert (i < num_funcs);
2718 #else
2719 tree old_decl = TREE_OPERAND (gimple_call_arg (gsi_stmt (gsi), 0),
2721 #endif
2722 /* FIXME: Find a way to actually prevent outputting the empty-body
2723 old_decl as debug symbol + function in the assembly file. */
2724 cgraph_node *node = cgraph_node::get (old_decl);
2725 node->address_taken = false;
2726 node->need_lto_streaming = false;
2727 node->offloadable = false;
2729 unlink_stmt_vdef (stmt);
2731 break;
2732 case IFN_GOMP_USE_SIMT:
2733 rhs = vf == 1 ? integer_zero_node : integer_one_node;
2734 break;
2735 case IFN_GOMP_SIMT_ENTER:
2736 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
2737 goto simtreg_enter_exit;
2738 case IFN_GOMP_SIMT_ENTER_ALLOC:
2739 if (vf != 1)
2740 ompdevlow_adjust_simt_enter (&gsi, &regimplify);
2741 rhs = vf == 1 ? null_pointer_node : NULL_TREE;
2742 goto simtreg_enter_exit;
2743 case IFN_GOMP_SIMT_EXIT:
2744 simtreg_enter_exit:
2745 if (vf != 1)
2746 continue;
2747 unlink_stmt_vdef (stmt);
2748 break;
2749 case IFN_GOMP_SIMT_LANE:
2750 case IFN_GOMP_SIMT_LAST_LANE:
2751 rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
2752 break;
2753 case IFN_GOMP_SIMT_VF:
2754 rhs = build_int_cst (type, vf);
2755 break;
2756 case IFN_GOMP_SIMT_ORDERED_PRED:
2757 rhs = vf == 1 ? integer_zero_node : NULL_TREE;
2758 if (rhs || !lhs)
2759 unlink_stmt_vdef (stmt);
2760 break;
2761 case IFN_GOMP_SIMT_VOTE_ANY:
2762 case IFN_GOMP_SIMT_XCHG_BFLY:
2763 case IFN_GOMP_SIMT_XCHG_IDX:
2764 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
2765 break;
2766 case IFN_GOMP_SIMD_LANE:
2767 case IFN_GOMP_SIMD_LAST_LANE:
2768 rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
2769 break;
2770 case IFN_GOMP_SIMD_VF:
2771 rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
2772 break;
2773 default:
2774 continue;
2776 if (lhs && !rhs)
2777 continue;
2778 stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
2779 gsi_replace (&gsi, stmt, false);
2781 if (regimplify)
2782 FOR_EACH_BB_REVERSE_FN (bb, cfun)
2783 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
2784 if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
2786 if (gimple_clobber_p (gsi_stmt (gsi)))
2787 gsi_remove (&gsi, true);
2788 else
2789 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2791 if (vf != 1)
2792 cfun->has_force_vectorize_loops = false;
2793 return 0;
2796 namespace {
2798 const pass_data pass_data_omp_device_lower =
2800 GIMPLE_PASS, /* type */
2801 "ompdevlow", /* name */
2802 OPTGROUP_OMP, /* optinfo_flags */
2803 TV_NONE, /* tv_id */
2804 PROP_cfg, /* properties_required */
2805 PROP_gimple_lomp_dev, /* properties_provided */
2806 0, /* properties_destroyed */
2807 0, /* todo_flags_start */
2808 TODO_update_ssa, /* todo_flags_finish */
2811 class pass_omp_device_lower : public gimple_opt_pass
2813 public:
2814 pass_omp_device_lower (gcc::context *ctxt)
2815 : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
2818 /* opt_pass methods: */
2819 bool gate (function *fun) final override
2821 #ifdef ACCEL_COMPILER
2822 bool offload_ind_funcs_p = vec_safe_length (offload_ind_funcs) > 0;
2823 #else
2824 bool offload_ind_funcs_p = false;
2825 #endif
2826 return (!(fun->curr_properties & PROP_gimple_lomp_dev)
2827 || (flag_openmp
2828 && (cgraph_node::get (fun->decl)->calls_declare_variant_alt
2829 || offload_ind_funcs_p)));
2831 unsigned int execute (function *) final override
2833 return execute_omp_device_lower ();
2836 }; // class pass_expand_omp_ssa
2838 } // anon namespace
2840 gimple_opt_pass *
2841 make_pass_omp_device_lower (gcc::context *ctxt)
2843 return new pass_omp_device_lower (ctxt);
2846 /* "omp declare target link" handling pass. */
2848 namespace {
2850 const pass_data pass_data_omp_target_link =
2852 GIMPLE_PASS, /* type */
2853 "omptargetlink", /* name */
2854 OPTGROUP_OMP, /* optinfo_flags */
2855 TV_NONE, /* tv_id */
2856 PROP_ssa, /* properties_required */
2857 0, /* properties_provided */
2858 0, /* properties_destroyed */
2859 0, /* todo_flags_start */
2860 TODO_update_ssa, /* todo_flags_finish */
2863 class pass_omp_target_link : public gimple_opt_pass
2865 public:
2866 pass_omp_target_link (gcc::context *ctxt)
2867 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
2870 /* opt_pass methods: */
2871 bool gate (function *fun) final override
2873 #ifdef ACCEL_COMPILER
2874 return offloading_function_p (fun->decl);
2875 #else
2876 (void) fun;
2877 return false;
2878 #endif
2881 unsigned execute (function *) final override;
2884 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2886 static tree
2887 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2889 tree t = *tp;
2891 if (VAR_P (t)
2892 && DECL_HAS_VALUE_EXPR_P (t)
2893 && is_global_var (t)
2894 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2896 *walk_subtrees = 0;
2897 return t;
2900 return NULL_TREE;
2903 unsigned
2904 pass_omp_target_link::execute (function *fun)
2906 basic_block bb;
2907 FOR_EACH_BB_FN (bb, fun)
2909 gimple_stmt_iterator gsi;
2910 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2912 if (gimple_call_builtin_p (gsi_stmt (gsi), BUILT_IN_GOMP_TARGET))
2914 tree dev = gimple_call_arg (gsi_stmt (gsi), 0);
2915 tree fn = gimple_call_arg (gsi_stmt (gsi), 1);
2916 if (POINTER_TYPE_P (TREE_TYPE (fn)))
2917 fn = TREE_OPERAND (fn, 0);
2918 if (TREE_CODE (dev) == INTEGER_CST
2919 && wi::to_wide (dev) == GOMP_DEVICE_HOST_FALLBACK
2920 && lookup_attribute ("omp target device_ancestor_nohost",
2921 DECL_ATTRIBUTES (fn)) != NULL_TREE)
2922 continue; /* ancestor:1 */
2923 /* Nullify the second argument of __builtin_GOMP_target_ext. */
2924 gimple_call_set_arg (gsi_stmt (gsi), 1, null_pointer_node);
2925 update_stmt (gsi_stmt (gsi));
2927 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2928 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2932 return 0;
2935 } // anon namespace
2937 gimple_opt_pass *
2938 make_pass_omp_target_link (gcc::context *ctxt)
2940 return new pass_omp_target_link (ctxt);