/* Provenance: gcc/omp-offload.c (official-gcc.git),
   blob 90139615c00f3f9e22b733f2d2cf1cfb75092c40,
   from commit "fix __builtin___clear_cache overrider fallout".  */
/* Bits of OpenMP and OpenACC handling that is specific to device offloading
   and a lowering pass for OpenACC device directives.

   Copyright (C) 2005-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 #include "context.h"
57 /* Describe the OpenACC looping structure of a function. The entire
58 function is held in a 'NULL' loop. */
60 struct oacc_loop
62 oacc_loop *parent; /* Containing loop. */
64 oacc_loop *child; /* First inner loop. */
66 oacc_loop *sibling; /* Next loop within same parent. */
68 location_t loc; /* Location of the loop start. */
70 gcall *marker; /* Initial head marker. */
72 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
73 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
75 tree routine; /* Pseudo-loop enclosing a routine. */
77 unsigned mask; /* Partitioning mask. */
78 unsigned e_mask; /* Partitioning of element loops (when tiling). */
79 unsigned inner; /* Partitioning of inner loops. */
80 unsigned flags; /* Partitioning flags. */
81 vec<gcall *> ifns; /* Contained loop abstraction functions. */
82 tree chunk_size; /* Chunk size. */
83 gcall *head_end; /* Final marker of head sequence. */
86 /* Holds offload tables with decls. */
87 vec<tree, va_gc> *offload_funcs, *offload_vars;
89 /* Return level at which oacc routine may spawn a partitioned loop, or
90 -1 if it is not a routine (i.e. is an offload fn). */
92 int
93 oacc_fn_attrib_level (tree attr)
95 tree pos = TREE_VALUE (attr);
97 if (!TREE_PURPOSE (pos))
98 return -1;
100 int ix = 0;
101 for (ix = 0; ix != GOMP_DIM_MAX;
102 ix++, pos = TREE_CHAIN (pos))
103 if (!integer_zerop (TREE_PURPOSE (pos)))
104 break;
106 return ix;
109 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
110 adds their addresses and sizes to constructor-vector V_CTOR. */
112 static void
113 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
114 vec<constructor_elt, va_gc> *v_ctor)
116 unsigned len = vec_safe_length (v_decls);
117 for (unsigned i = 0; i < len; i++)
119 tree it = (*v_decls)[i];
120 bool is_var = VAR_P (it);
121 bool is_link_var
122 = is_var
123 #ifdef ACCEL_COMPILER
124 && DECL_HAS_VALUE_EXPR_P (it)
125 #endif
126 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
128 /* See also omp_finish_file and output_offload_tables in lto-cgraph.c. */
129 if (!in_lto_p && !symtab_node::get (it))
130 continue;
132 tree size = NULL_TREE;
133 if (is_var)
134 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
136 tree addr;
137 if (!is_link_var)
138 addr = build_fold_addr_expr (it);
139 else
141 #ifdef ACCEL_COMPILER
142 /* For "omp declare target link" vars add address of the pointer to
143 the target table, instead of address of the var. */
144 tree value_expr = DECL_VALUE_EXPR (it);
145 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
146 varpool_node::finalize_decl (link_ptr_decl);
147 addr = build_fold_addr_expr (link_ptr_decl);
148 #else
149 addr = build_fold_addr_expr (it);
150 #endif
152 /* Most significant bit of the size marks "omp declare target link"
153 vars in host and target tables. */
154 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
155 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
156 * BITS_PER_UNIT - 1);
157 size = wide_int_to_tree (const_ptr_type_node, isize);
160 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
161 if (is_var)
162 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
166 /* Return true if DECL is a function for which its references should be
167 analyzed. */
169 static bool
170 omp_declare_target_fn_p (tree decl)
172 return (TREE_CODE (decl) == FUNCTION_DECL
173 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
174 && !lookup_attribute ("omp declare target host",
175 DECL_ATTRIBUTES (decl))
176 && (!flag_openacc
177 || oacc_get_fn_attrib (decl) == NULL_TREE));
180 /* Return true if DECL Is a variable for which its initializer references
181 should be analyzed. */
183 static bool
184 omp_declare_target_var_p (tree decl)
186 return (VAR_P (decl)
187 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
188 && !lookup_attribute ("omp declare target link",
189 DECL_ATTRIBUTES (decl)));
192 /* Helper function for omp_discover_implicit_declare_target, called through
193 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
194 declare target to. */
196 static tree
197 omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
199 if (TREE_CODE (*tp) == CALL_EXPR
200 && CALL_EXPR_FN (*tp)
201 && TREE_CODE (CALL_EXPR_FN (*tp)) == ADDR_EXPR
202 && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp), 0)) == FUNCTION_DECL
203 && lookup_attribute ("omp declare variant base",
204 DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp),
205 0))))
207 tree fn = TREE_OPERAND (CALL_EXPR_FN (*tp), 0);
208 for (tree attr = DECL_ATTRIBUTES (fn); attr; attr = TREE_CHAIN (attr))
210 attr = lookup_attribute ("omp declare variant base", attr);
211 if (attr == NULL_TREE)
212 break;
213 tree purpose = TREE_PURPOSE (TREE_VALUE (attr));
214 if (TREE_CODE (purpose) == FUNCTION_DECL)
215 omp_discover_declare_target_tgt_fn_r (&purpose, walk_subtrees, data);
218 else if (TREE_CODE (*tp) == FUNCTION_DECL)
220 tree decl = *tp;
221 tree id = get_identifier ("omp declare target");
222 symtab_node *node = symtab_node::get (*tp);
223 if (node != NULL)
225 while (node->alias_target
226 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
228 if (!omp_declare_target_fn_p (node->decl)
229 && !lookup_attribute ("omp declare target host",
230 DECL_ATTRIBUTES (node->decl)))
232 node->offloadable = 1;
233 DECL_ATTRIBUTES (node->decl)
234 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
236 node = symtab_node::get (node->alias_target);
238 symtab_node *new_node = node->ultimate_alias_target ();
239 decl = new_node->decl;
240 while (node != new_node)
242 if (!omp_declare_target_fn_p (node->decl)
243 && !lookup_attribute ("omp declare target host",
244 DECL_ATTRIBUTES (node->decl)))
246 node->offloadable = 1;
247 DECL_ATTRIBUTES (node->decl)
248 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
250 gcc_assert (node->alias && node->analyzed);
251 node = node->get_alias_target ();
253 node->offloadable = 1;
254 if (ENABLE_OFFLOADING)
255 g->have_offload = true;
257 if (omp_declare_target_fn_p (decl)
258 || lookup_attribute ("omp declare target host",
259 DECL_ATTRIBUTES (decl)))
260 return NULL_TREE;
262 if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
263 ((vec<tree> *) data)->safe_push (decl);
264 DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
265 DECL_ATTRIBUTES (decl));
267 else if (TYPE_P (*tp))
268 *walk_subtrees = 0;
269 /* else if (TREE_CODE (*tp) == OMP_TARGET)
271 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
272 if (OMP_DEVICE_ANCESTOR (dev))
273 *walk_subtrees = 0;
274 } */
275 return NULL_TREE;
278 /* Similarly, but ignore references outside of OMP_TARGET regions. */
280 static tree
281 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
283 if (TREE_CODE (*tp) == OMP_TARGET)
285 /* And not OMP_DEVICE_ANCESTOR. */
286 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
287 omp_discover_declare_target_tgt_fn_r,
288 data);
289 *walk_subtrees = 0;
291 else if (TYPE_P (*tp))
292 *walk_subtrees = 0;
293 return NULL_TREE;
296 /* Helper function for omp_discover_implicit_declare_target, called through
297 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
298 declare target to. */
300 static tree
301 omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
303 if (TREE_CODE (*tp) == FUNCTION_DECL)
304 return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
305 else if (VAR_P (*tp)
306 && is_global_var (*tp)
307 && !omp_declare_target_var_p (*tp))
309 tree id = get_identifier ("omp declare target");
310 if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
312 error_at (DECL_SOURCE_LOCATION (*tp),
313 "%qD specified both in declare target %<link%> and "
314 "implicitly in %<to%> clauses", *tp);
315 DECL_ATTRIBUTES (*tp)
316 = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
318 if (TREE_STATIC (*tp) && DECL_INITIAL (*tp))
319 ((vec<tree> *) data)->safe_push (*tp);
320 DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
321 symtab_node *node = symtab_node::get (*tp);
322 if (node != NULL && !node->offloadable)
324 node->offloadable = 1;
325 if (ENABLE_OFFLOADING)
327 g->have_offload = true;
328 if (is_a <varpool_node *> (node))
329 vec_safe_push (offload_vars, node->decl);
333 else if (TYPE_P (*tp))
334 *walk_subtrees = 0;
335 return NULL_TREE;
338 /* Perform the OpenMP implicit declare target to discovery. */
340 void
341 omp_discover_implicit_declare_target (void)
343 cgraph_node *node;
344 varpool_node *vnode;
345 auto_vec<tree> worklist;
347 FOR_EACH_DEFINED_FUNCTION (node)
348 if (DECL_SAVED_TREE (node->decl))
350 struct cgraph_node *cgn;
351 if (omp_declare_target_fn_p (node->decl))
352 worklist.safe_push (node->decl);
353 else if (DECL_STRUCT_FUNCTION (node->decl)
354 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
355 worklist.safe_push (node->decl);
356 for (cgn = first_nested_function (node);
357 cgn; cgn = next_nested_function (cgn))
358 if (omp_declare_target_fn_p (cgn->decl))
359 worklist.safe_push (cgn->decl);
360 else if (DECL_STRUCT_FUNCTION (cgn->decl)
361 && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
362 worklist.safe_push (cgn->decl);
364 FOR_EACH_STATIC_INITIALIZER (vnode)
365 if (omp_declare_target_var_p (vnode->decl))
366 worklist.safe_push (vnode->decl);
367 while (!worklist.is_empty ())
369 tree decl = worklist.pop ();
370 if (VAR_P (decl))
371 walk_tree_without_duplicates (&DECL_INITIAL (decl),
372 omp_discover_declare_target_var_r,
373 &worklist);
374 else if (omp_declare_target_fn_p (decl))
375 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
376 omp_discover_declare_target_tgt_fn_r,
377 &worklist);
378 else
379 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
380 omp_discover_declare_target_fn_r,
381 &worklist);
386 /* Create new symbols containing (address, size) pairs for global variables,
387 marked with "omp declare target" attribute, as well as addresses for the
388 functions, which are outlined offloading regions. */
389 void
390 omp_finish_file (void)
392 unsigned num_funcs = vec_safe_length (offload_funcs);
393 unsigned num_vars = vec_safe_length (offload_vars);
395 if (num_funcs == 0 && num_vars == 0)
396 return;
398 if (targetm_common.have_named_sections)
400 vec<constructor_elt, va_gc> *v_f, *v_v;
401 vec_alloc (v_f, num_funcs);
402 vec_alloc (v_v, num_vars * 2);
404 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
405 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
407 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
408 vec_safe_length (v_v));
409 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
410 num_funcs);
411 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
412 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
413 tree ctor_v = build_constructor (vars_decl_type, v_v);
414 tree ctor_f = build_constructor (funcs_decl_type, v_f);
415 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
416 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
417 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
418 get_identifier (".offload_func_table"),
419 funcs_decl_type);
420 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
421 get_identifier (".offload_var_table"),
422 vars_decl_type);
423 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
424 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
425 otherwise a joint table in a binary will contain padding between
426 tables from multiple object files. */
427 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
428 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
429 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
430 DECL_INITIAL (funcs_decl) = ctor_f;
431 DECL_INITIAL (vars_decl) = ctor_v;
432 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
433 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
435 varpool_node::finalize_decl (vars_decl);
436 varpool_node::finalize_decl (funcs_decl);
438 else
440 for (unsigned i = 0; i < num_funcs; i++)
442 tree it = (*offload_funcs)[i];
443 /* See also add_decls_addresses_to_decl_constructor
444 and output_offload_tables in lto-cgraph.c. */
445 if (!in_lto_p && !symtab_node::get (it))
446 continue;
447 targetm.record_offload_symbol (it);
449 for (unsigned i = 0; i < num_vars; i++)
451 tree it = (*offload_vars)[i];
452 if (!in_lto_p && !symtab_node::get (it))
453 continue;
454 #ifdef ACCEL_COMPILER
455 if (DECL_HAS_VALUE_EXPR_P (it)
456 && lookup_attribute ("omp declare target link",
457 DECL_ATTRIBUTES (it)))
459 tree value_expr = DECL_VALUE_EXPR (it);
460 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
461 targetm.record_offload_symbol (link_ptr_decl);
462 varpool_node::finalize_decl (link_ptr_decl);
464 else
465 #endif
466 targetm.record_offload_symbol (it);
471 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
472 axis DIM. Return a tmp var holding the result. */
474 static tree
475 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
477 tree arg = build_int_cst (unsigned_type_node, dim);
478 tree size = create_tmp_var (integer_type_node);
479 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
480 gimple *call = gimple_build_call_internal (fn, 1, arg);
482 gimple_call_set_lhs (call, size);
483 gimple_seq_add_stmt (seq, call);
485 return size;
488 /* Find the number of threads (POS = false), or thread number (POS =
489 true) for an OpenACC region partitioned as MASK. Setup code
490 required for the calculation is added to SEQ. */
492 static tree
493 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
495 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
496 unsigned ix;
498 /* Start at gang level, and examine relevant dimension indices. */
499 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
500 if (GOMP_DIM_MASK (ix) & mask)
502 if (res)
504 /* We had an outer index, so scale that by the size of
505 this dimension. */
506 tree n = oacc_dim_call (false, ix, seq);
507 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
509 if (pos)
511 /* Determine index in this dimension. */
512 tree id = oacc_dim_call (true, ix, seq);
513 if (res)
514 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
515 else
516 res = id;
520 if (res == NULL_TREE)
521 res = integer_zero_node;
523 return res;
526 /* Transform IFN_GOACC_LOOP calls to actual code. See
527 expand_oacc_for for where these are generated. At the vector
528 level, we stride loops, such that each member of a warp will
529 operate on adjacent iterations. At the worker and gang level,
530 each gang/warp executes a set of contiguous iterations. Chunking
531 can override this such that each iteration engine executes a
532 contiguous chunk, and then moves on to stride to the next chunk. */
534 static void
535 oacc_xform_loop (gcall *call)
537 gimple_stmt_iterator gsi = gsi_for_stmt (call);
538 enum ifn_goacc_loop_kind code
539 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
540 tree dir = gimple_call_arg (call, 1);
541 tree range = gimple_call_arg (call, 2);
542 tree step = gimple_call_arg (call, 3);
543 tree chunk_size = NULL_TREE;
544 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
545 tree lhs = gimple_call_lhs (call);
546 tree type = NULL_TREE;
547 tree diff_type = TREE_TYPE (range);
548 tree r = NULL_TREE;
549 gimple_seq seq = NULL;
550 bool chunking = false, striding = true;
551 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
552 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
554 /* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
555 if (!lhs)
557 gsi_replace_with_seq (&gsi, seq, true);
558 return;
561 type = TREE_TYPE (lhs);
563 #ifdef ACCEL_COMPILER
564 chunk_size = gimple_call_arg (call, 4);
565 if (integer_minus_onep (chunk_size) /* Force static allocation. */
566 || integer_zerop (chunk_size)) /* Default (also static). */
568 /* If we're at the gang level, we want each to execute a
569 contiguous run of iterations. Otherwise we want each element
570 to stride. */
571 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
572 chunking = false;
574 else
576 /* Chunk of size 1 is striding. */
577 striding = integer_onep (chunk_size);
578 chunking = !striding;
580 #endif
582 /* striding=true, chunking=true
583 -> invalid.
584 striding=true, chunking=false
585 -> chunks=1
586 striding=false,chunking=true
587 -> chunks=ceil (range/(chunksize*threads*step))
588 striding=false,chunking=false
589 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
590 push_gimplify_context (true);
592 switch (code)
594 default: gcc_unreachable ();
596 case IFN_GOACC_LOOP_CHUNKS:
597 if (!chunking)
598 r = build_int_cst (type, 1);
599 else
601 /* chunk_max
602 = (range - dir) / (chunks * step * num_threads) + dir */
603 tree per = oacc_thread_numbers (false, mask, &seq);
604 per = fold_convert (type, per);
605 chunk_size = fold_convert (type, chunk_size);
606 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
607 per = fold_build2 (MULT_EXPR, type, per, step);
608 r = build2 (MINUS_EXPR, type, range, dir);
609 r = build2 (PLUS_EXPR, type, r, per);
610 r = build2 (TRUNC_DIV_EXPR, type, r, per);
612 break;
614 case IFN_GOACC_LOOP_STEP:
616 /* If striding, step by the entire compute volume, otherwise
617 step by the inner volume. */
618 unsigned volume = striding ? mask : inner_mask;
620 r = oacc_thread_numbers (false, volume, &seq);
621 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
623 break;
625 case IFN_GOACC_LOOP_OFFSET:
626 /* Enable vectorization on non-SIMT targets. */
627 if (!targetm.simt.vf
628 && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
629 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
630 the loop. */
631 && (flag_tree_loop_vectorize
632 || !global_options_set.x_flag_tree_loop_vectorize))
634 basic_block bb = gsi_bb (gsi);
635 class loop *parent = bb->loop_father;
636 class loop *body = parent->inner;
638 parent->force_vectorize = true;
639 parent->safelen = INT_MAX;
641 /* "Chunking loops" may have inner loops. */
642 if (parent->inner)
644 body->force_vectorize = true;
645 body->safelen = INT_MAX;
648 cfun->has_force_vectorize_loops = true;
650 if (striding)
652 r = oacc_thread_numbers (true, mask, &seq);
653 r = fold_convert (diff_type, r);
655 else
657 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
658 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
659 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
660 inner_size, outer_size);
662 volume = fold_convert (diff_type, volume);
663 if (chunking)
664 chunk_size = fold_convert (diff_type, chunk_size);
665 else
667 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
669 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
670 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
671 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
674 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
675 fold_convert (diff_type, inner_size));
676 r = oacc_thread_numbers (true, outer_mask, &seq);
677 r = fold_convert (diff_type, r);
678 r = build2 (MULT_EXPR, diff_type, r, span);
680 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
681 inner = fold_convert (diff_type, inner);
682 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
684 if (chunking)
686 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
687 tree per
688 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
689 per = build2 (MULT_EXPR, diff_type, per, chunk);
691 r = build2 (PLUS_EXPR, diff_type, r, per);
694 r = fold_build2 (MULT_EXPR, diff_type, r, step);
695 if (type != diff_type)
696 r = fold_convert (type, r);
697 break;
699 case IFN_GOACC_LOOP_BOUND:
700 if (striding)
701 r = range;
702 else
704 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
705 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
706 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
707 inner_size, outer_size);
709 volume = fold_convert (diff_type, volume);
710 if (chunking)
711 chunk_size = fold_convert (diff_type, chunk_size);
712 else
714 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
716 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
717 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
718 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
721 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
722 fold_convert (diff_type, inner_size));
724 r = fold_build2 (MULT_EXPR, diff_type, span, step);
726 tree offset = gimple_call_arg (call, 6);
727 r = build2 (PLUS_EXPR, diff_type, r,
728 fold_convert (diff_type, offset));
729 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
730 diff_type, r, range);
732 if (diff_type != type)
733 r = fold_convert (type, r);
734 break;
737 gimplify_assign (lhs, r, &seq);
739 pop_gimplify_context (NULL);
741 gsi_replace_with_seq (&gsi, seq, true);
744 /* Transform a GOACC_TILE call. Determines the element loop span for
745 the specified loop of the nest. This is 1 if we're not tiling.
747 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
749 static void
750 oacc_xform_tile (gcall *call)
752 gimple_stmt_iterator gsi = gsi_for_stmt (call);
753 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
754 /* Inner loops have higher loop_nos. */
755 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
756 tree tile_size = gimple_call_arg (call, 2);
757 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
758 tree lhs = gimple_call_lhs (call);
759 tree type = TREE_TYPE (lhs);
760 gimple_seq seq = NULL;
761 tree span = build_int_cst (type, 1);
763 gcc_assert (!(e_mask
764 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
765 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
766 push_gimplify_context (!seen_error ());
768 #ifndef ACCEL_COMPILER
769 /* Partitioning disabled on host compilers. */
770 e_mask = 0;
771 #endif
772 if (!e_mask)
773 /* Not paritioning. */
774 span = integer_one_node;
775 else if (!integer_zerop (tile_size))
776 /* User explicitly specified size. */
777 span = tile_size;
778 else
780 /* Pick a size based on the paritioning of the element loop and
781 the number of loop nests. */
782 tree first_size = NULL_TREE;
783 tree second_size = NULL_TREE;
785 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
786 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
787 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
788 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
790 if (!first_size)
792 first_size = second_size;
793 second_size = NULL_TREE;
796 if (loop_no + 1 == collapse)
798 span = first_size;
799 if (!loop_no && second_size)
800 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
801 span, second_size);
803 else if (loop_no + 2 == collapse)
804 span = second_size;
805 else
806 span = NULL_TREE;
808 if (!span)
809 /* There's no obvious element size for this loop. Options
810 are 1, first_size or some non-unity constant (32 is my
811 favourite). We should gather some statistics. */
812 span = first_size;
815 span = fold_convert (type, span);
816 gimplify_assign (lhs, span, &seq);
818 pop_gimplify_context (NULL);
820 gsi_replace_with_seq (&gsi, seq, true);
823 /* Default partitioned and minimum partitioned dimensions. */
825 static int oacc_default_dims[GOMP_DIM_MAX];
826 static int oacc_min_dims[GOMP_DIM_MAX];
829 oacc_get_default_dim (int dim)
831 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
832 return oacc_default_dims[dim];
836 oacc_get_min_dim (int dim)
838 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
839 return oacc_min_dims[dim];
842 /* Parse the default dimension parameter. This is a set of
843 :-separated optional compute dimensions. Each specified dimension
844 is a positive integer. When device type support is added, it is
845 planned to be a comma separated list of such compute dimensions,
846 with all but the first prefixed by the colon-terminated device
847 type. */
849 static void
850 oacc_parse_default_dims (const char *dims)
852 int ix;
854 for (ix = GOMP_DIM_MAX; ix--;)
856 oacc_default_dims[ix] = -1;
857 oacc_min_dims[ix] = 1;
860 #ifndef ACCEL_COMPILER
861 /* Cannot be overridden on the host. */
862 dims = NULL;
863 #endif
864 if (dims)
866 const char *pos = dims;
868 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
870 if (ix)
872 if (*pos != ':')
873 goto malformed;
874 pos++;
877 if (*pos != ':')
879 long val;
880 const char *eptr;
882 errno = 0;
883 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
884 if (errno || val <= 0 || (int) val != val)
885 goto malformed;
886 pos = eptr;
887 oacc_default_dims[ix] = (int) val;
890 if (*pos)
892 malformed:
893 error_at (UNKNOWN_LOCATION,
894 "%<-fopenacc-dim%> operand is malformed at %qs", pos);
898 /* Allow the backend to validate the dimensions. */
899 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
900 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
903 /* Validate and update the dimensions for offloaded FN. ATTRS is the
904 raw attribute. DIMS is an array of dimensions, which is filled in.
905 LEVEL is the partitioning level of a routine, or -1 for an offload
906 region itself. USED is the mask of partitioned execution in the
907 function. */
909 static void
910 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
912 tree purpose[GOMP_DIM_MAX];
913 unsigned ix;
914 tree pos = TREE_VALUE (attrs);
916 /* Make sure the attribute creator attached the dimension
917 information. */
918 gcc_assert (pos);
920 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
922 purpose[ix] = TREE_PURPOSE (pos);
923 tree val = TREE_VALUE (pos);
924 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
925 pos = TREE_CHAIN (pos);
928 bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
930 /* Default anything left to 1 or a partitioned default. */
931 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
932 if (dims[ix] < 0)
934 /* The OpenACC spec says 'If the [num_gangs] clause is not
935 specified, an implementation-defined default will be used;
936 the default may depend on the code within the construct.'
937 (2.5.6). Thus an implementation is free to choose
938 non-unity default for a parallel region that doesn't have
939 any gang-partitioned loops. However, it appears that there
940 is a sufficient body of user code that expects non-gang
941 partitioned regions to not execute in gang-redundant mode.
942 So we (a) don't warn about the non-portability and (b) pick
943 the minimum permissible dimension size when there is no
944 partitioned execution. Otherwise we pick the global
945 default for the dimension, which the user can control. The
946 same wording and logic applies to num_workers and
947 vector_length, however the worker- or vector- single
948 execution doesn't have the same impact as gang-redundant
949 execution. (If the minimum gang-level partioning is not 1,
950 the target is probably too confusing.) */
951 dims[ix] = (used & GOMP_DIM_MASK (ix)
952 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
953 changed = true;
956 if (changed)
958 /* Replace the attribute with new values. */
959 pos = NULL_TREE;
960 for (ix = GOMP_DIM_MAX; ix--;)
961 pos = tree_cons (purpose[ix],
962 build_int_cst (integer_type_node, dims[ix]), pos);
963 oacc_replace_fn_attrib (fn, pos);
967 /* Create an empty OpenACC loop structure at LOC. */
969 static oacc_loop *
970 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
972 oacc_loop *loop = XCNEW (oacc_loop);
974 loop->parent = parent;
976 if (parent)
978 loop->sibling = parent->child;
979 parent->child = loop;
982 loop->loc = loc;
983 return loop;
986 /* Create an outermost, dummy OpenACC loop for offloaded function
987 DECL. */
989 static oacc_loop *
990 new_oacc_loop_outer (tree decl)
992 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
995 /* Start a new OpenACC loop structure beginning at head marker HEAD.
996 Link into PARENT loop. Return the new loop. */
998 static oacc_loop *
999 new_oacc_loop (oacc_loop *parent, gcall *marker)
1001 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
1003 loop->marker = marker;
1005 /* TODO: This is where device_type flattening would occur for the loop
1006 flags. */
1008 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
1010 tree chunk_size = integer_zero_node;
1011 if (loop->flags & OLF_GANG_STATIC)
1012 chunk_size = gimple_call_arg (marker, 4);
1013 loop->chunk_size = chunk_size;
1015 return loop;
1018 /* Create a dummy loop encompassing a call to a openACC routine.
1019 Extract the routine's partitioning requirements. */
1021 static void
1022 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
1024 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
1025 int level = oacc_fn_attrib_level (attrs);
1027 gcc_assert (level >= 0);
1029 loop->marker = call;
1030 loop->routine = decl;
1031 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
1032 ^ (GOMP_DIM_MASK (level) - 1));
1035 /* Finish off the current OpenACC loop ending at tail marker TAIL.
1036 Return the parent loop. */
1038 static oacc_loop *
1039 finish_oacc_loop (oacc_loop *loop)
1041 /* If the loop has been collapsed, don't partition it. */
1042 if (loop->ifns.is_empty ())
1043 loop->mask = loop->flags = 0;
1044 return loop->parent;
1047 /* Free all OpenACC loop structures within LOOP (inclusive). */
1049 static void
1050 free_oacc_loop (oacc_loop *loop)
1052 if (loop->sibling)
1053 free_oacc_loop (loop->sibling);
1054 if (loop->child)
1055 free_oacc_loop (loop->child);
1057 loop->ifns.release ();
1058 free (loop);
1061 /* Dump out the OpenACC loop head or tail beginning at FROM. */
1063 static void
1064 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
1065 const char *title, int level)
1067 enum ifn_unique_kind kind
1068 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1070 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
1071 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1073 gimple *stmt = gsi_stmt (gsi);
1075 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1077 enum ifn_unique_kind k
1078 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
1079 (gimple_call_arg (stmt, 0)));
/* Stop at the next IFN_UNIQUE marker of the same kind as FROM. */
1081 if (k == kind && stmt != from)
1082 break;
1084 print_gimple_stmt (file, stmt, depth * 2 + 2);
1086 gsi_next (&gsi);
/* Marker sequences can straddle block boundaries: skip over empty
blocks by following the single-successor chain. */
1087 while (gsi_end_p (gsi))
1088 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1092 /* Dump OpenACC loop LOOP, its children, and its siblings. */
1094 static void
1095 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
1097 int ix;
1099 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
1100 loop->flags, loop->mask,
1101 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
1103 if (loop->marker)
1104 print_gimple_stmt (file, loop->marker, depth * 2);
1106 if (loop->routine)
1107 fprintf (file, "%*sRoutine %s:%u:%s\n",
1108 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
1109 DECL_SOURCE_LINE (loop->routine),
1110 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
/* Heads are dumped outermost-first, tails in the reverse (innermost
first) order, mirroring their nesting in the IL. */
1112 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
1113 if (loop->heads[ix])
1114 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
1115 for (ix = GOMP_DIM_MAX; ix--;)
1116 if (loop->tails[ix])
1117 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
1119 if (loop->child)
1120 dump_oacc_loop (file, loop->child, depth + 1);
1121 if (loop->sibling)
1122 dump_oacc_loop (file, loop->sibling, depth);
/* Forward declaration so the debugger-callable helper below has a
prototype. */
1125 void debug_oacc_loop (oacc_loop *);
1127 /* Dump loops to stderr. */
1129 DEBUG_FUNCTION void
1130 debug_oacc_loop (oacc_loop *loop)
1132 dump_oacc_loop (stderr, loop, 0);
1135 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1136 siblings. Emits an -fopt-info style note stating the gang/worker/
vector (or seq) parallelism that was assigned to each loop. */
1138 static void
1139 inform_oacc_loop (const oacc_loop *loop)
1141 const char *gang
1142 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1143 const char *worker
1144 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1145 const char *vector
1146 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1147 const char *seq = loop->mask == 0 ? " seq" : "";
1148 const dump_user_location_t loc
1149 = dump_user_location_t::from_location_t (loop->loc);
1150 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1151 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1152 vector, seq);
1154 if (loop->child)
1155 inform_oacc_loop (loop->child);
1156 if (loop->sibling)
1157 inform_oacc_loop (loop->sibling);
1160 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
1161 structures as we go. By construction these loops are properly
1162 nested. */
1164 static void
1165 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
/* MARKER counts the head/tail marker calls seen so far in the current
sequence; REMAINING is how many more the sequence's count argument
says are still expected. */
1167 int marker = 0;
1168 int remaining = 0;
1170 if (bb->flags & BB_VISITED)
1171 return;
1173 follow:
1174 bb->flags |= BB_VISITED;
1176 /* Scan for loop markers. */
1177 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1178 gsi_next (&gsi))
1180 gimple *stmt = gsi_stmt (gsi);
1182 if (!is_gimple_call (stmt))
1183 continue;
1185 gcall *call = as_a <gcall *> (stmt);
1187 /* If this is a routine, make a dummy loop for it. */
1188 if (tree decl = gimple_call_fndecl (call))
1189 if (tree attrs = oacc_get_fn_attrib (decl))
1191 gcc_assert (!marker);
1192 new_oacc_loop_routine (loop, call, decl, attrs);
1195 if (!gimple_call_internal_p (call))
1196 continue;
1198 switch (gimple_call_internal_fn (call))
1200 default:
1201 break;
1203 case IFN_GOACC_LOOP:
1204 case IFN_GOACC_TILE:
1205 /* Record the abstraction function, so we can manipulate it
1206 later. */
1207 loop->ifns.safe_push (call);
1208 break;
1210 case IFN_UNIQUE:
1211 enum ifn_unique_kind kind
1212 = (enum ifn_unique_kind) (TREE_INT_CST_LOW
1213 (gimple_call_arg (call, 0)))
;
1214 if (kind == IFN_UNIQUE_OACC_HEAD_MARK
1215 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
/* A two-argument marker terminates a head/tail sequence;
otherwise argument 2 is the total marker count. */
1217 if (gimple_call_num_args (call) == 2)
1219 gcc_assert (marker && !remaining);
1220 marker = 0;
1221 if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
1222 loop = finish_oacc_loop (loop);
1223 else
1224 loop->head_end = call;
1226 else
1228 int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
1230 if (!marker)
1232 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1233 loop = new_oacc_loop (loop, call);
1234 remaining = count;
1236 gcc_assert (count == remaining);
1237 if (remaining)
1239 remaining--;
1240 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1241 loop->heads[marker] = call;
1242 else
1243 loop->tails[remaining] = call;
1245 marker++;
/* An unfinished marker sequence must continue into the unique
successor block; follow it directly rather than recursing. */
1250 if (remaining || marker)
1252 bb = single_succ (bb);
1253 gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
1254 goto follow;
1257 /* Walk successor blocks. */
1258 edge e;
1259 edge_iterator ei;
1261 FOR_EACH_EDGE (e, ei, bb->succs)
1262 oacc_loop_discover_walk (loop, e->dest);
1265 /* LOOP is the first sibling. Reverse the order in place and return
1266 the new first sibling. Recurse to child loops. */
1268 static oacc_loop *
1269 oacc_loop_sibling_nreverse (oacc_loop *loop)
1271 oacc_loop *last = NULL;
/* Classic in-place list reversal over the sibling chain.
NOTE(review): the `do` line of the do/while appears to have been lost
in this extract (original lines 1272-1273) — confirm against upstream. */
1274 if (loop->child)
1275 loop->child = oacc_loop_sibling_nreverse (loop->child);
1277 oacc_loop *next = loop->sibling;
1278 loop->sibling = last;
1279 last = loop;
1280 loop = next;
1282 while (loop);
1284 return last;
1287 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1288 the current function. Returns the root (whole-function) dummy loop. */
1290 static oacc_loop *
1291 oacc_loop_discovery ()
1293 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1294 in the following. */
1295 clear_bb_flags ();
1297 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1298 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1300 /* The siblings were constructed in reverse order, reverse them so
1301 that diagnostics come out in an unsurprising order. */
1302 top = oacc_loop_sibling_nreverse (top);
1304 return top;
1307 /* Transform the abstract internal function markers starting at FROM
1308 to be for partitioning level LEVEL. Stop when we meet another HEAD
1309 or TAIL marker. */
1311 static void
1312 oacc_loop_xform_head_tail (gcall *from, int level)
1314 enum ifn_unique_kind kind
1315 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1316 tree replacement = build_int_cst (unsigned_type_node, level);
1318 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1320 gimple *stmt = gsi_stmt (gsi);
1322 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1324 enum ifn_unique_kind k
1325 = ((enum ifn_unique_kind)
1326 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
/* FORK/JOIN take the level in argument 2; GOACC_REDUCTION (below)
takes it in argument 3. */
1328 if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
1329 *gimple_call_arg_ptr (stmt, 2) = replacement;
1330 else if (k == kind && stmt != from)
1331 break;
1333 else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1334 *gimple_call_arg_ptr (stmt, 3) = replacement;
1336 gsi_next (&gsi);
/* Follow single-successor chains across empty blocks, as the marker
sequence may span basic blocks. */
1337 while (gsi_end_p (gsi))
1338 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1342 /* Process the discovered OpenACC loops, setting the correct
1343 partitioning level etc. Children are processed before the loop
itself; siblings afterwards. */
1345 static void
1346 oacc_loop_process (oacc_loop *loop)
1348 if (loop->child)
1349 oacc_loop_process (loop->child);
1351 if (loop->mask && !loop->routine)
1353 int ix;
1354 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1355 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1356 tree chunk_arg = loop->chunk_size;
1357 gcall *call;
/* Patch the recorded IFN_GOACC_LOOP/TILE calls with the final masks. */
1359 for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1360 switch (gimple_call_internal_fn (call))
1362 case IFN_GOACC_LOOP:
/* A -1 in argument 5 identifies an element-loop instance, which
receives the element mask instead of the main mask. */
1364 bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1365 gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1366 if (!is_e)
1367 gimple_call_set_arg (call, 4, chunk_arg);
1369 break;
1371 case IFN_GOACC_TILE:
1372 gimple_call_set_arg (call, 3, mask_arg);
1373 gimple_call_set_arg (call, 4, e_mask_arg);
1374 break;
1376 default:
1377 gcc_unreachable ();
/* For each axis used by this loop (outermost first), rewrite the
corresponding head/tail marker sequence to that axis. */
1380 unsigned dim = GOMP_DIM_GANG;
1381 unsigned mask = loop->mask | loop->e_mask;
1382 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1384 while (!(GOMP_DIM_MASK (dim) & mask))
1385 dim++;
1387 oacc_loop_xform_head_tail (loop->heads[ix], dim);
1388 oacc_loop_xform_head_tail (loop->tails[ix], dim);
1390 mask ^= GOMP_DIM_MASK (dim);
1394 if (loop->sibling)
1395 oacc_loop_process (loop->sibling);
1398 /* Walk the OpenACC loop hierarchy checking and assigning the
1399 programmer-specified partitionings. OUTER_MASK is the partitioning
1400 this loop is contained within. Return mask of partitioning
1401 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1402 bit. */
1404 static unsigned
1405 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1407 unsigned this_mask = loop->mask;
1408 unsigned mask_all = 0;
1409 bool noisy = true;
1411 #ifdef ACCEL_COMPILER
1412 /* When device_type is supported, we want the device compiler to be
1413 noisy, if the loop parameters are device_type-specific. */
1414 noisy = false;
1415 #endif
1417 if (!loop->routine)
1419 bool auto_par = (loop->flags & OLF_AUTO) != 0;
1420 bool seq_par = (loop->flags & OLF_SEQ) != 0;
1421 bool tiling = (loop->flags & OLF_TILE) != 0;
1423 this_mask = ((loop->flags >> OLF_DIM_BASE)
1424 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1426 /* Apply auto partitioning if this is a non-partitioned regular
1427 loop, or (no more than) single axis tiled loop. */
/* this_mask & -this_mask isolates the lowest set bit, so a tiled
loop qualifies only if it names at most one axis. */
1428 bool maybe_auto
1429 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
/* Explicit axes, auto and seq are mutually exclusive specifiers. */
1431 if ((this_mask != 0) + auto_par + seq_par > 1)
1433 if (noisy)
1434 error_at (loop->loc,
1435 seq_par
1436 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1437 : G_("%<auto%> conflicts with other OpenACC loop "
1438 "specifiers"))
;
1439 maybe_auto = false;
1440 loop->flags &= ~OLF_AUTO;
1441 if (seq_par)
1443 loop->flags
1444 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1445 this_mask = 0;
1449 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
/* Mark for later auto-partitioning via the GOMP_DIM_MAX bit. */
1451 loop->flags |= OLF_AUTO;
1452 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
/* Diagnose axes that clash with the enclosing context. */
1456 if (this_mask & outer_mask)
1458 const oacc_loop *outer;
1459 for (outer = loop->parent; outer; outer = outer->parent)
1460 if ((outer->mask | outer->e_mask) & this_mask)
1461 break;
1463 if (noisy)
1465 if (outer)
1467 error_at (loop->loc,
1468 loop->routine
1469 ? G_("routine call uses same OpenACC parallelism"
1470 " as containing loop")
1471 : G_("inner loop uses same OpenACC parallelism"
1472 " as containing loop"));
1473 inform (outer->loc, "containing loop here");
1475 else
1476 error_at (loop->loc,
1477 loop->routine
1478 ? G_("routine call uses OpenACC parallelism disallowed"
1479 " by containing routine")
1480 : G_("loop uses OpenACC parallelism disallowed"
1481 " by containing routine"));
1483 if (loop->routine)
1484 inform (DECL_SOURCE_LOCATION (loop->routine),
1485 "routine %qD declared here", loop->routine);
1487 this_mask &= ~outer_mask;
1489 else
/* No direct clash: still reject axes nested the wrong way round
(an inner loop using an outer-or-equal axis). */
1491 unsigned outermost = least_bit_hwi (this_mask);
1493 if (outermost && outermost <= outer_mask)
1495 if (noisy)
1497 error_at (loop->loc,
1498 "incorrectly nested OpenACC loop parallelism");
1500 const oacc_loop *outer;
1501 for (outer = loop->parent;
1502 outer->flags && outer->flags < outermost;
1503 outer = outer->parent)
1504 continue;
1505 inform (outer->loc, "containing loop here");
1508 this_mask &= ~outermost;
1512 mask_all |= this_mask;
1514 if (loop->flags & OLF_TILE)
1516 /* When tiling, vector goes to the element loop, and failing
1517 that we put worker there. The std doesn't contemplate
1518 specifying all three. We choose to put worker and vector on
1519 the element loops in that case. */
1520 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1521 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1522 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1524 loop->e_mask = this_e_mask;
1525 this_mask ^= this_e_mask;
1528 loop->mask = this_mask;
1530 if (dump_file)
1531 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1532 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1533 loop->mask, loop->e_mask);
1535 if (loop->child)
1537 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1538 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1539 mask_all |= loop->inner;
1542 if (loop->sibling)
1543 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1545 return mask_all;
1548 /* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
1549 OUTER_MASK is the partitioning this loop is contained within.
1550 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1551 Return the cumulative partitioning used by this loop, siblings and
1552 children. */
1554 static unsigned
1555 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1556 bool outer_assign)
1558 bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1559 bool noisy = true;
1560 bool tiling = loop->flags & OLF_TILE;
1562 #ifdef ACCEL_COMPILER
1563 /* When device_type is supported, we want the device compiler to be
1564 noisy, if the loop parameters are device_type-specific. */
1565 noisy = false;
1566 #endif
1568 if (assign && (!outer_assign || loop->inner))
1570 /* Allocate outermost and non-innermost loops at the outermost
1571 non-innermost available level. */
1572 unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1574 /* Find the first outermost available partition. */
1575 while (this_mask <= outer_mask)
1576 this_mask <<= 1;
1578 /* Grab two axes if tiling, and we've not assigned anything */
1579 if (tiling && !(loop->mask | loop->e_mask))
1580 this_mask |= this_mask << 1;
1582 /* Prohibit the innermost partitioning at the moment. */
1583 this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1585 /* Don't use any dimension explicitly claimed by an inner loop. */
1586 this_mask &= ~loop->inner;
1588 if (tiling && !loop->e_mask)
1590 /* If we got two axes, allocate the inner one to the element
1591 loop. */
1592 loop->e_mask = this_mask & (this_mask << 1);
1593 this_mask ^= loop->e_mask;
1596 loop->mask |= this_mask;
/* Recurse into children first so LOOP->INNER reflects their usage
before the second (innermost-level) assignment below. */
1599 if (loop->child)
1601 unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1602 loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1603 outer_assign | assign);
1606 if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1608 /* Allocate the loop at the innermost available level. Note
1609 that we do this even if we already assigned this loop the
1610 outermost available level above. That way we'll partition
1611 this along 2 axes, if they are available. */
1612 unsigned this_mask = 0;
1614 /* Determine the outermost partitioning used within this loop. */
1615 this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1616 this_mask = least_bit_hwi (this_mask);
1618 /* Pick the partitioning just inside that one. */
1619 this_mask >>= 1;
1621 /* And avoid picking one use by an outer loop. */
1622 this_mask &= ~outer_mask;
1624 /* If tiling and we failed completely above, grab the next one
1625 too. Making sure it doesn't hit an outer loop. */
1626 if (tiling)
1628 this_mask &= ~(loop->e_mask | loop->mask);
1629 unsigned tile_mask = ((this_mask >> 1)
1630 & ~(outer_mask | loop->e_mask | loop->mask));
1632 if (tile_mask || loop->mask)
1634 loop->e_mask |= this_mask;
1635 this_mask = tile_mask;
1637 if (!loop->e_mask && noisy)
1638 warning_at (loop->loc, 0,
1639 "insufficient partitioning available"
1640 " to parallelize element loop");
1643 loop->mask |= this_mask;
1644 if (!loop->mask && noisy)
1645 warning_at (loop->loc, 0,
1646 tiling
1647 ? G_("insufficient partitioning available"
1648 " to parallelize tile loop")
1649 : G_("insufficient partitioning available"
1650 " to parallelize loop"));
1653 if (assign && dump_file)
1654 fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1655 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1656 loop->mask, loop->e_mask);
1658 unsigned inner_mask = 0;
1660 if (loop->sibling)
1661 inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1662 outer_mask, outer_assign);
1664 inner_mask |= loop->inner | loop->mask | loop->e_mask;
1666 return inner_mask;
1669 /* Walk the OpenACC loop hierarchy to check and assign partitioning
1670 axes. Return mask of partitioning. */
1672 static unsigned
1673 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1675 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
/* The GOMP_DIM_MAX bit is the "auto loops present" flag set by
oacc_loop_fixed_partitions; clear it and run the auto-partitioner. */
1677 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1679 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1680 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1682 return mask_all;
1685 /* Default fork/join early expander. Delete the function calls if
1686 there is no RTL expander. Returning false tells the caller the
call can be removed. */
1688 bool
1689 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1690 const int *ARG_UNUSED (dims), bool is_fork)
1692 if (is_fork)
1693 return targetm.have_oacc_fork ();
1694 else
1695 return targetm.have_oacc_join ();
1698 /* Default goacc.reduction early expander.
1700 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1701 If RES_PTR is not integer-zerop:
1702 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1703 TEARDOWN - emit '*RES_PTR = VAR'
1704 If LHS is not NULL
1705 emit 'LHS = VAR' */
1707 void
1708 default_goacc_reduction (gcall *call)
1710 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1711 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1712 tree lhs = gimple_call_lhs (call);
1713 tree var = gimple_call_arg (call, 2);
1714 gimple_seq seq = NULL;
1716 if (code == IFN_GOACC_REDUCTION_SETUP
1717 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1719 /* Setup and Teardown need to copy from/to the receiver object,
1720 if there is one. */
1721 tree ref_to_res = gimple_call_arg (call, 1);
1723 if (!integer_zerop (ref_to_res))
1725 tree dst = build_simple_mem_ref (ref_to_res);
1726 tree src = var;
/* For SETUP the copy direction is reversed (LHS = *RES_PTR), and
LHS is consumed here so the trailing copy below is skipped. */
1728 if (code == IFN_GOACC_REDUCTION_SETUP)
1730 src = dst;
1731 dst = lhs;
1732 lhs = NULL;
1734 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1738 /* Copy VAR to LHS, if there is an LHS. */
1739 if (lhs)
1740 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
/* Replace the internal call with the built sequence (possibly empty). */
1742 gsi_replace_with_seq (&gsi, seq, true);
1745 /* Main entry point for oacc transformations which run on the device
1746 compiler after LTO, so we know what the target device is at this
1747 point (including the host fallback). */
1749 static unsigned int
1750 execute_oacc_device_lower ()
1752 tree attrs = oacc_get_fn_attrib (current_function_decl);
1754 if (!attrs)
1755 /* Not an offloaded function. */
1756 return 0;
1758 /* Parse the default dim argument exactly once. */
/* FLAG_OPENACC_DIMS is pointed at itself after parsing, so this
sentinel comparison ensures the parse happens only on first entry. */
1759 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1761 oacc_parse_default_dims (flag_openacc_dims);
1762 flag_openacc_dims = (char *)&flag_openacc_dims;
/* Classify the offloaded function: exactly one of the following
construct kinds must apply (asserted below). */
1765 bool is_oacc_parallel
1766 = (lookup_attribute ("oacc parallel",
1767 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1768 bool is_oacc_kernels
1769 = (lookup_attribute ("oacc kernels",
1770 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1771 bool is_oacc_serial
1772 = (lookup_attribute ("oacc serial",
1773 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1774 bool is_oacc_parallel_kernels_parallelized
1775 = (lookup_attribute ("oacc parallel_kernels_parallelized",
1776 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1777 bool is_oacc_parallel_kernels_gang_single
1778 = (lookup_attribute ("oacc parallel_kernels_gang_single",
1779 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1780 int fn_level = oacc_fn_attrib_level (attrs);
1781 bool is_oacc_routine = (fn_level >= 0);
1782 gcc_checking_assert (is_oacc_parallel
1783 + is_oacc_kernels
1784 + is_oacc_serial
1785 + is_oacc_parallel_kernels_parallelized
1786 + is_oacc_parallel_kernels_gang_single
1787 + is_oacc_routine
1788 == 1);
1790 bool is_oacc_kernels_parallelized
1791 = (lookup_attribute ("oacc kernels parallelized",
1792 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1793 if (is_oacc_kernels_parallelized)
1794 gcc_checking_assert (is_oacc_kernels);
1796 if (dump_file)
1798 if (is_oacc_parallel)
1799 fprintf (dump_file, "Function is OpenACC parallel offload\n");
1800 else if (is_oacc_kernels)
1801 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1802 (is_oacc_kernels_parallelized
1803 ? "parallelized" : "unparallelized"));
1804 else if (is_oacc_serial)
1805 fprintf (dump_file, "Function is OpenACC serial offload\n");
1806 else if (is_oacc_parallel_kernels_parallelized)
1807 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1808 "parallel_kernels_parallelized");
1809 else if (is_oacc_parallel_kernels_gang_single)
1810 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1811 "parallel_kernels_gang_single");
1812 else if (is_oacc_routine)
1813 fprintf (dump_file, "Function is OpenACC routine level %d\n",
1814 fn_level);
1815 else
1816 gcc_unreachable ();
1819 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
1820 kernels, so remove the parallelism dimensions function attributes
1821 potentially set earlier on. */
1822 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
1824 oacc_set_fn_attrib (current_function_decl, NULL, NULL);
1825 attrs = oacc_get_fn_attrib (current_function_decl);
1828 /* Discover, partition and process the loops. */
1829 oacc_loop *loops = oacc_loop_discovery ();
/* A routine at level L may not use axes below L — they belong to
the caller. */
1831 unsigned outer_mask = 0;
1832 if (is_oacc_routine)
1833 outer_mask = GOMP_DIM_MASK (fn_level) - 1;
1834 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
1835 /* OpenACC kernels constructs are special: they currently don't use the
1836 generic oacc_loop infrastructure and attribute/dimension processing. */
1837 if (is_oacc_kernels && is_oacc_kernels_parallelized)
1839 /* Parallelized OpenACC kernels constructs use gang parallelism. See
1840 also tree-parloops.c:create_parallel_loop. */
1841 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
1844 int dims[GOMP_DIM_MAX];
1845 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
1847 if (dump_file)
1849 const char *comma = "Compute dimensions [";
1850 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
1851 fprintf (dump_file, "%s%d", comma, dims[ix]);
1852 fprintf (dump_file, "]\n");
1855 /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
1856 a single gang only. */
1857 if (is_oacc_parallel_kernels_gang_single)
1858 gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);
1860 oacc_loop_process (loops);
1861 if (dump_file)
1863 fprintf (dump_file, "OpenACC loops\n");
1864 dump_oacc_loop (dump_file, loops, 0);
1865 fprintf (dump_file, "\n");
1867 if (dump_enabled_p ())
1869 oacc_loop *l = loops;
1870 /* OpenACC kernels constructs are special: they currently don't use the
1871 generic oacc_loop infrastructure. */
1872 if (is_oacc_kernels)
1874 /* Create a fake oacc_loop for diagnostic purposes. */
1875 l = new_oacc_loop_raw (NULL,
1876 DECL_SOURCE_LOCATION (current_function_decl));
1877 l->mask = used_mask;
1879 else
1881 /* Skip the outermost, dummy OpenACC loop */
1882 l = l->child;
1884 if (l)
1885 inform_oacc_loop (l);
1886 if (is_oacc_kernels)
1887 free_oacc_loop (l);
1890 /* Offloaded targets may introduce new basic blocks, which require
1891 dominance information to update SSA. */
1892 calculate_dominance_info (CDI_DOMINATORS);
1894 /* Now lower internal loop functions to target-specific code
1895 sequences. */
1896 basic_block bb;
1897 FOR_ALL_BB_FN (bb, cfun)
1898 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1900 gimple *stmt = gsi_stmt (gsi);
1901 if (!is_gimple_call (stmt))
1903 gsi_next (&gsi);
1904 continue;
1907 gcall *call = as_a <gcall *> (stmt);
1908 if (!gimple_call_internal_p (call))
1910 gsi_next (&gsi);
1911 continue;
/* The transforms below may replace CALL with a multi-statement
expansion; step back one statement first so the expansion itself
gets rescanned. */
1914 /* Rewind to allow rescan. */
1915 gsi_prev (&gsi);
1916 bool rescan = false, remove = false;
1917 enum internal_fn ifn_code = gimple_call_internal_fn (call);
1919 switch (ifn_code)
1921 default: break;
1923 case IFN_GOACC_TILE:
1924 oacc_xform_tile (call);
1925 rescan = true;
1926 break;
1928 case IFN_GOACC_LOOP:
1929 oacc_xform_loop (call);
1930 rescan = true;
1931 break;
1933 case IFN_GOACC_REDUCTION:
1934 /* Mark the function for SSA renaming. */
1935 mark_virtual_operands_for_renaming (cfun);
1937 /* If the level is -1, this ended up being an unused
1938 axis. Handle as a default. */
1939 if (integer_minus_onep (gimple_call_arg (call, 3)))
1940 default_goacc_reduction (call);
1941 else
1942 targetm.goacc.reduction (call);
1943 rescan = true;
1944 break;
1946 case IFN_UNIQUE:
1948 enum ifn_unique_kind kind
1949 = ((enum ifn_unique_kind)
1950 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1952 switch (kind)
1954 default:
1955 break;
1957 case IFN_UNIQUE_OACC_FORK:
1958 case IFN_UNIQUE_OACC_JOIN:
/* Level -1 means an unused axis; also drop the call when the
target declines to expand fork/join. */
1959 if (integer_minus_onep (gimple_call_arg (call, 2)))
1960 remove = true;
1961 else if (!targetm.goacc.fork_join
1962 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
1963 remove = true;
1964 break;
1966 case IFN_UNIQUE_OACC_HEAD_MARK:
1967 case IFN_UNIQUE_OACC_TAIL_MARK:
1968 remove = true;
1969 break;
1971 break;
1975 if (gsi_end_p (gsi))
1976 /* We rewound past the beginning of the BB. */
1977 gsi = gsi_start_bb (bb);
1978 else
1979 /* Undo the rewind. */
1980 gsi_next (&gsi);
1982 if (remove)
1984 if (gimple_vdef (call))
1985 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
1986 if (gimple_call_lhs (call))
1988 /* Propagate the data dependency var. */
1989 gimple *ass = gimple_build_assign (gimple_call_lhs (call),
1990 gimple_call_arg (call, 1));
1991 gsi_replace (&gsi, ass, false);
1993 else
1994 gsi_remove (&gsi, true);
1996 else if (!rescan)
1997 /* If not rescanning, advance over the call. */
1998 gsi_next (&gsi);
2001 free_oacc_loop (loops);
2003 return 0;
2006 /* Default launch dimension validator. Force everything to 1. A
2007 backend that wants to provide larger dimensions must override this
2008 hook. Returns true iff any dimension was changed. */
2010 bool
2011 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
2012 int ARG_UNUSED (fn_level),
2013 unsigned ARG_UNUSED (used))
2015 bool changed = false;
2017 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
2019 if (dims[ix] != 1)
2021 dims[ix] = 1;
2022 changed = true;
2026 return changed;
2029 /* Default dimension bound is unknown on accelerator and 1 on host.
A return of 0 signifies "no known limit" for the axis.
NOTE(review): the return-type line ("int", original line ~2031)
appears to have been lost in this extract — confirm against upstream. */
2032 default_goacc_dim_limit (int ARG_UNUSED (axis))
2034 #ifdef ACCEL_COMPILER
2035 return 0;
2036 #else
2037 return 1;
2038 #endif
/* Pass-manager registration for the OpenACC device-lowering pass;
gated on -fopenacc. */
2041 namespace {
2043 const pass_data pass_data_oacc_device_lower =
2045 GIMPLE_PASS, /* type */
2046 "oaccdevlow", /* name */
2047 OPTGROUP_OMP, /* optinfo_flags */
2048 TV_NONE, /* tv_id */
2049 PROP_cfg, /* properties_required */
2050 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
2051 0, /* properties_destroyed */
2052 0, /* todo_flags_start */
2053 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
2056 class pass_oacc_device_lower : public gimple_opt_pass
2058 public:
2059 pass_oacc_device_lower (gcc::context *ctxt)
2060 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
2063 /* opt_pass methods: */
2064 virtual bool gate (function *) { return flag_openacc; };
2066 virtual unsigned int execute (function *)
2068 return execute_oacc_device_lower ();
2071 }; // class pass_oacc_device_lower
2073 } // anon namespace
/* Create a new instance of the OpenACC device-lowering pass. */
2075 gimple_opt_pass *
2076 make_pass_oacc_device_lower (gcc::context *ctxt)
2078 return new pass_oacc_device_lower (ctxt);
2082 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
2083 GOMP_SIMT_ENTER call identifying the privatized variables, which are
2084 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
2085 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
2087 static void
2088 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
2090 gimple *alloc_stmt = gsi_stmt (*gsi);
2091 tree simtrec = gimple_call_lhs (alloc_stmt);
2092 tree simduid = gimple_call_arg (alloc_stmt, 0);
2093 gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
2094 gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
/* Build an artificial record type to hold all privatized variables. */
2095 tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
2096 TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
2097 TREE_ADDRESSABLE (rectype) = 1;
2098 TREE_TYPE (simtrec) = build_pointer_type (rectype);
/* Arguments 1..N of GOMP_SIMT_ENTER are &VAR addresses of the
privatized variables (or null placeholders). */
2099 for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
2101 tree *argp = gimple_call_arg_ptr (enter_stmt, i);
2102 if (*argp == null_pointer_node)
2103 continue;
2104 gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
2105 && VAR_P (TREE_OPERAND (*argp, 0)));
2106 tree var = TREE_OPERAND (*argp, 0);
2108 tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
2109 DECL_NAME (var), TREE_TYPE (var));
2110 SET_DECL_ALIGN (field, DECL_ALIGN (var));
2111 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
2112 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
2114 insert_field_into_struct (rectype, field);
/* Redirect every use of VAR through the record: VAR's value expr
becomes (*simtrec).field. */
2116 tree t = build_simple_mem_ref (simtrec);
2117 t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
2118 TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
2119 SET_DECL_VALUE_EXPR (var, t);
2120 DECL_HAS_VALUE_EXPR_P (var) = 1;
2121 *regimplify = true;
2123 layout_type (rectype);
2124 tree size = TYPE_SIZE_UNIT (rectype);
2125 tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
/* Re-emit the allocation with the record's concrete size/alignment. */
2127 alloc_stmt
2128 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
2129 gimple_call_set_lhs (alloc_stmt, simtrec);
2130 gsi_replace (gsi, alloc_stmt, false);
/* The original ENTER call degenerates to a copy of its first argument. */
2131 gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
2132 enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
2133 gsi_replace (&enter_gsi, enter_stmt, false);
2135 use_operand_p use;
2136 gimple *exit_stmt;
2137 if (single_imm_use (simtrec, &use, &exit_stmt))
/* Clobber the record before the matching SIMT_EXIT so its lifetime
ends there. */
2139 gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
2140 gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
2141 tree clobber = build_clobber (rectype);
2142 exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
2143 gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
2145 else
2146 gcc_checking_assert (has_zero_uses (simtrec));
2149 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables.
Returns the first matching VAR_DECL (stopping the walk), or NULL_TREE. */
2151 static tree
2152 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2154 tree t = *tp;
/* Match variables that carry both a DECL_VALUE_EXPR and the
"omp simt private" attribute. */
2156 if (VAR_P (t)
2157 && DECL_HAS_VALUE_EXPR_P (t)
2158 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
2160 *walk_subtrees = 0;
2161 return t;
2163 return NULL_TREE;
2166 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
2167 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
2168 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
2169 internal functions on non-SIMT targets, and likewise some SIMD internal
2170 functions on SIMT targets. */
2172 static unsigned int
2173 execute_omp_device_lower ()
/* VF == 1 identifies a non-SIMT target throughout this function. */
2175 int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
2176 bool regimplify = false;
2177 basic_block bb;
2178 gimple_stmt_iterator gsi;
2179 bool calls_declare_variant_alt
2180 = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
2181 FOR_EACH_BB_FN (bb, cfun)
2182 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2184 gimple *stmt = gsi_stmt (gsi);
2185 if (!is_gimple_call (stmt))
2186 continue;
2187 if (!gimple_call_internal_p (stmt))
/* Non-internal calls: only resolve declare-variant substitutions. */
2189 if (calls_declare_variant_alt)
2190 if (tree fndecl = gimple_call_fndecl (stmt))
2192 tree new_fndecl = omp_resolve_declare_variant (fndecl);
2193 if (new_fndecl != fndecl)
2195 gimple_call_set_fndecl (stmt, new_fndecl);
2196 update_stmt (stmt);
2199 continue;
/* RHS left as NULL_TREE means "keep the call for later expansion";
the replacement at the bottom only fires when RHS was chosen. */
2201 tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
2202 tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
2203 switch (gimple_call_internal_fn (stmt))
2205 case IFN_GOMP_USE_SIMT:
2206 rhs = vf == 1 ? integer_zero_node : integer_one_node;
2207 break;
2208 case IFN_GOMP_SIMT_ENTER:
2209 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
2210 goto simtreg_enter_exit;
2211 case IFN_GOMP_SIMT_ENTER_ALLOC:
2212 if (vf != 1)
2213 ompdevlow_adjust_simt_enter (&gsi, &regimplify);
2214 rhs = vf == 1 ? null_pointer_node : NULL_TREE;
2215 goto simtreg_enter_exit;
2216 case IFN_GOMP_SIMT_EXIT:
2217 simtreg_enter_exit:
2218 if (vf != 1)
2219 continue;
2220 unlink_stmt_vdef (stmt);
2221 break;
2222 case IFN_GOMP_SIMT_LANE:
2223 case IFN_GOMP_SIMT_LAST_LANE:
2224 rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
2225 break;
2226 case IFN_GOMP_SIMT_VF:
2227 rhs = build_int_cst (type, vf);
2228 break;
2229 case IFN_GOMP_SIMT_ORDERED_PRED:
2230 rhs = vf == 1 ? integer_zero_node : NULL_TREE;
2231 if (rhs || !lhs)
2232 unlink_stmt_vdef (stmt);
2233 break;
2234 case IFN_GOMP_SIMT_VOTE_ANY:
2235 case IFN_GOMP_SIMT_XCHG_BFLY:
2236 case IFN_GOMP_SIMT_XCHG_IDX:
2237 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
2238 break;
2239 case IFN_GOMP_SIMD_LANE:
2240 case IFN_GOMP_SIMD_LAST_LANE:
2241 rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
2242 break;
2243 case IFN_GOMP_SIMD_VF:
2244 rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
2245 break;
2246 default:
2247 continue;
2249 if (lhs && !rhs)
2250 continue;
2251 stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
2252 gsi_replace (&gsi, stmt, false);
/* Re-gimplify statements that now reference privatized variables via
their new DECL_VALUE_EXPRs; drop obsolete clobbers. */
2254 if (regimplify)
2255 FOR_EACH_BB_REVERSE_FN (bb, cfun)
2256 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
2257 if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
2259 if (gimple_clobber_p (gsi_stmt (gsi)))
2260 gsi_remove (&gsi, true);
2261 else
2262 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2264 if (vf != 1)
2265 cfun->has_force_vectorize_loops = false;
2266 return 0;
2269 namespace {
/* Pass descriptor for the GIMPLE OpenMP device lowering pass above.  */

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run when device lowering has not happened yet for this function
     (PROP_gimple_lomp_dev not set), or, under OpenMP, when the function
     calls 'declare variant' alternatives that still need resolving.  */
  virtual bool gate (function *fun)
    {
      return (!(fun->curr_properties & PROP_gimple_lomp_dev)
	      || (flag_openmp
		  && cgraph_node::get (fun->decl)->calls_declare_variant_alt));
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }

}; // class pass_omp_device_lower
2305 } // anon namespace
2307 gimple_opt_pass *
2308 make_pass_omp_device_lower (gcc::context *ctxt)
2310 return new pass_omp_device_lower (ctxt);
2313 /* "omp declare target link" handling pass. */
2315 namespace {
/* Pass descriptor for the "omp declare target link" handling pass.  */

const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS, /* type */
  "omptargetlink", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  /* Only runs in the offload (accelerator) compiler, and there only for
     functions that are actually being offloaded.  */
  virtual bool gate (function *fun)
    {
#ifdef ACCEL_COMPILER
      return offloading_function_p (fun->decl);
#else
      (void) fun;
      return false;
#endif
    }

  virtual unsigned execute (function *);
};
2351 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2353 static tree
2354 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2356 tree t = *tp;
2358 if (VAR_P (t)
2359 && DECL_HAS_VALUE_EXPR_P (t)
2360 && is_global_var (t)
2361 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2363 *walk_subtrees = 0;
2364 return t;
2367 return NULL_TREE;
2370 unsigned
2371 pass_omp_target_link::execute (function *fun)
2373 basic_block bb;
2374 FOR_EACH_BB_FN (bb, fun)
2376 gimple_stmt_iterator gsi;
2377 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2378 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2379 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2382 return 0;
2385 } // anon namespace
2387 gimple_opt_pass *
2388 make_pass_omp_target_link (gcc::context *ctxt)
2390 return new pass_omp_target_link (ctxt);