[RS6000] Non-pcrel tests when power10
[official-gcc.git] / gcc / omp-offload.c
blob3e9c31d2cbede772b6acaa067caee139d001f3fc
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
4 Copyright (C) 2005-2020 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 #include "context.h"
57 /* Describe the OpenACC looping structure of a function. The entire
58 function is held in a 'NULL' loop. */
60 struct oacc_loop
62 oacc_loop *parent; /* Containing loop. */
64 oacc_loop *child; /* First inner loop. */
66 oacc_loop *sibling; /* Next loop within same parent. */
68 location_t loc; /* Location of the loop start. */
70 gcall *marker; /* Initial head marker. */
72 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
73 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
75 tree routine; /* Pseudo-loop enclosing a routine. */
77 unsigned mask; /* Partitioning mask. */
78 unsigned e_mask; /* Partitioning of element loops (when tiling). */
79 unsigned inner; /* Partitioning of inner loops. */
80 unsigned flags; /* Partitioning flags. */
81 vec<gcall *> ifns; /* Contained loop abstraction functions. */
82 tree chunk_size; /* Chunk size. */
83 gcall *head_end; /* Final marker of head sequence. */
86 /* Holds offload tables with decls. */
87 vec<tree, va_gc> *offload_funcs, *offload_vars;
89 /* Return level at which oacc routine may spawn a partitioned loop, or
90 -1 if it is not a routine (i.e. is an offload fn). */
92 int
93 oacc_fn_attrib_level (tree attr)
95 tree pos = TREE_VALUE (attr);
97 if (!TREE_PURPOSE (pos))
98 return -1;
100 int ix = 0;
101 for (ix = 0; ix != GOMP_DIM_MAX;
102 ix++, pos = TREE_CHAIN (pos))
103 if (!integer_zerop (TREE_PURPOSE (pos)))
104 break;
106 return ix;
109 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
110 adds their addresses and sizes to constructor-vector V_CTOR. */
112 static void
113 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
114 vec<constructor_elt, va_gc> *v_ctor)
116 unsigned len = vec_safe_length (v_decls);
117 for (unsigned i = 0; i < len; i++)
119 tree it = (*v_decls)[i];
120 bool is_var = VAR_P (it);
121 bool is_link_var
122 = is_var
123 #ifdef ACCEL_COMPILER
124 && DECL_HAS_VALUE_EXPR_P (it)
125 #endif
126 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
128 /* See also omp_finish_file and output_offload_tables in lto-cgraph.c. */
129 if (!in_lto_p && !symtab_node::get (it))
130 continue;
132 tree size = NULL_TREE;
133 if (is_var)
134 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
136 tree addr;
137 if (!is_link_var)
138 addr = build_fold_addr_expr (it);
139 else
141 #ifdef ACCEL_COMPILER
142 /* For "omp declare target link" vars add address of the pointer to
143 the target table, instead of address of the var. */
144 tree value_expr = DECL_VALUE_EXPR (it);
145 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
146 varpool_node::finalize_decl (link_ptr_decl);
147 addr = build_fold_addr_expr (link_ptr_decl);
148 #else
149 addr = build_fold_addr_expr (it);
150 #endif
152 /* Most significant bit of the size marks "omp declare target link"
153 vars in host and target tables. */
154 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
155 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
156 * BITS_PER_UNIT - 1);
157 size = wide_int_to_tree (const_ptr_type_node, isize);
160 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
161 if (is_var)
162 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
166 /* Return true if DECL is a function for which its references should be
167 analyzed. */
169 static bool
170 omp_declare_target_fn_p (tree decl)
172 return (TREE_CODE (decl) == FUNCTION_DECL
173 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
174 && !lookup_attribute ("omp declare target host",
175 DECL_ATTRIBUTES (decl))
176 && (!flag_openacc
177 || oacc_get_fn_attrib (decl) == NULL_TREE));
180 /* Return true if DECL Is a variable for which its initializer references
181 should be analyzed. */
183 static bool
184 omp_declare_target_var_p (tree decl)
186 return (VAR_P (decl)
187 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
188 && !lookup_attribute ("omp declare target link",
189 DECL_ATTRIBUTES (decl)));
192 /* Helper function for omp_discover_implicit_declare_target, called through
193 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
194 declare target to. */
196 static tree
197 omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
199 if (TREE_CODE (*tp) == FUNCTION_DECL)
201 tree decl = *tp;
202 tree id = get_identifier ("omp declare target");
203 symtab_node *node = symtab_node::get (*tp);
204 if (node != NULL)
206 while (node->alias_target
207 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
209 if (!omp_declare_target_fn_p (node->decl)
210 && !lookup_attribute ("omp declare target host",
211 DECL_ATTRIBUTES (node->decl)))
213 node->offloadable = 1;
214 DECL_ATTRIBUTES (node->decl)
215 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
217 node = symtab_node::get (node->alias_target);
219 symtab_node *new_node = node->ultimate_alias_target ();
220 decl = new_node->decl;
221 while (node != new_node)
223 if (!omp_declare_target_fn_p (node->decl)
224 && !lookup_attribute ("omp declare target host",
225 DECL_ATTRIBUTES (node->decl)))
227 node->offloadable = 1;
228 DECL_ATTRIBUTES (node->decl)
229 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
231 gcc_assert (node->alias && node->analyzed);
232 node = node->get_alias_target ();
234 node->offloadable = 1;
235 if (ENABLE_OFFLOADING)
236 g->have_offload = true;
238 if (omp_declare_target_fn_p (decl)
239 || lookup_attribute ("omp declare target host",
240 DECL_ATTRIBUTES (decl)))
241 return NULL_TREE;
243 if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
244 ((vec<tree> *) data)->safe_push (decl);
245 DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
246 DECL_ATTRIBUTES (decl));
248 else if (TYPE_P (*tp))
249 *walk_subtrees = 0;
250 /* else if (TREE_CODE (*tp) == OMP_TARGET)
252 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
253 if (OMP_DEVICE_ANCESTOR (dev))
254 *walk_subtrees = 0;
255 } */
256 return NULL_TREE;
259 /* Similarly, but ignore references outside of OMP_TARGET regions. */
261 static tree
262 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
264 if (TREE_CODE (*tp) == OMP_TARGET)
266 /* And not OMP_DEVICE_ANCESTOR. */
267 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
268 omp_discover_declare_target_tgt_fn_r,
269 data);
270 *walk_subtrees = 0;
272 else if (TYPE_P (*tp))
273 *walk_subtrees = 0;
274 return NULL_TREE;
277 /* Helper function for omp_discover_implicit_declare_target, called through
278 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
279 declare target to. */
281 static tree
282 omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
284 if (TREE_CODE (*tp) == FUNCTION_DECL)
285 return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
286 else if (VAR_P (*tp)
287 && is_global_var (*tp)
288 && !omp_declare_target_var_p (*tp))
290 tree id = get_identifier ("omp declare target");
291 if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
293 error_at (DECL_SOURCE_LOCATION (*tp),
294 "%qD specified both in declare target %<link%> and "
295 "implicitly in %<to%> clauses", *tp);
296 DECL_ATTRIBUTES (*tp)
297 = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
299 if (TREE_STATIC (*tp) && DECL_INITIAL (*tp))
300 ((vec<tree> *) data)->safe_push (*tp);
301 DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
302 symtab_node *node = symtab_node::get (*tp);
303 if (node != NULL && !node->offloadable)
305 node->offloadable = 1;
306 if (ENABLE_OFFLOADING)
308 g->have_offload = true;
309 if (is_a <varpool_node *> (node))
310 vec_safe_push (offload_vars, node->decl);
314 else if (TYPE_P (*tp))
315 *walk_subtrees = 0;
316 return NULL_TREE;
319 /* Perform the OpenMP implicit declare target to discovery. */
321 void
322 omp_discover_implicit_declare_target (void)
324 cgraph_node *node;
325 varpool_node *vnode;
326 auto_vec<tree> worklist;
328 FOR_EACH_DEFINED_FUNCTION (node)
329 if (DECL_SAVED_TREE (node->decl))
331 struct cgraph_node *cgn;
332 if (omp_declare_target_fn_p (node->decl))
333 worklist.safe_push (node->decl);
334 else if (DECL_STRUCT_FUNCTION (node->decl)
335 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
336 worklist.safe_push (node->decl);
337 for (cgn = first_nested_function (node);
338 cgn; cgn = next_nested_function (cgn))
339 if (omp_declare_target_fn_p (cgn->decl))
340 worklist.safe_push (cgn->decl);
341 else if (DECL_STRUCT_FUNCTION (cgn->decl)
342 && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
343 worklist.safe_push (cgn->decl);
345 FOR_EACH_STATIC_INITIALIZER (vnode)
346 if (omp_declare_target_var_p (vnode->decl))
347 worklist.safe_push (vnode->decl);
348 while (!worklist.is_empty ())
350 tree decl = worklist.pop ();
351 if (VAR_P (decl))
352 walk_tree_without_duplicates (&DECL_INITIAL (decl),
353 omp_discover_declare_target_var_r,
354 &worklist);
355 else if (omp_declare_target_fn_p (decl))
356 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
357 omp_discover_declare_target_tgt_fn_r,
358 &worklist);
359 else
360 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
361 omp_discover_declare_target_fn_r,
362 &worklist);
367 /* Create new symbols containing (address, size) pairs for global variables,
368 marked with "omp declare target" attribute, as well as addresses for the
369 functions, which are outlined offloading regions. */
370 void
371 omp_finish_file (void)
373 unsigned num_funcs = vec_safe_length (offload_funcs);
374 unsigned num_vars = vec_safe_length (offload_vars);
376 if (num_funcs == 0 && num_vars == 0)
377 return;
379 if (targetm_common.have_named_sections)
381 vec<constructor_elt, va_gc> *v_f, *v_v;
382 vec_alloc (v_f, num_funcs);
383 vec_alloc (v_v, num_vars * 2);
385 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
386 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
388 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
389 vec_safe_length (v_v));
390 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
391 num_funcs);
392 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
393 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
394 tree ctor_v = build_constructor (vars_decl_type, v_v);
395 tree ctor_f = build_constructor (funcs_decl_type, v_f);
396 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
397 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
398 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
399 get_identifier (".offload_func_table"),
400 funcs_decl_type);
401 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
402 get_identifier (".offload_var_table"),
403 vars_decl_type);
404 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
405 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
406 otherwise a joint table in a binary will contain padding between
407 tables from multiple object files. */
408 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
409 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
410 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
411 DECL_INITIAL (funcs_decl) = ctor_f;
412 DECL_INITIAL (vars_decl) = ctor_v;
413 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
414 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
416 varpool_node::finalize_decl (vars_decl);
417 varpool_node::finalize_decl (funcs_decl);
419 else
421 for (unsigned i = 0; i < num_funcs; i++)
423 tree it = (*offload_funcs)[i];
424 /* See also add_decls_addresses_to_decl_constructor
425 and output_offload_tables in lto-cgraph.c. */
426 if (!in_lto_p && !symtab_node::get (it))
427 continue;
428 targetm.record_offload_symbol (it);
430 for (unsigned i = 0; i < num_vars; i++)
432 tree it = (*offload_vars)[i];
433 if (!in_lto_p && !symtab_node::get (it))
434 continue;
435 #ifdef ACCEL_COMPILER
436 if (DECL_HAS_VALUE_EXPR_P (it)
437 && lookup_attribute ("omp declare target link",
438 DECL_ATTRIBUTES (it)))
440 tree value_expr = DECL_VALUE_EXPR (it);
441 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
442 targetm.record_offload_symbol (link_ptr_decl);
443 varpool_node::finalize_decl (link_ptr_decl);
445 else
446 #endif
447 targetm.record_offload_symbol (it);
452 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
453 axis DIM. Return a tmp var holding the result. */
455 static tree
456 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
458 tree arg = build_int_cst (unsigned_type_node, dim);
459 tree size = create_tmp_var (integer_type_node);
460 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
461 gimple *call = gimple_build_call_internal (fn, 1, arg);
463 gimple_call_set_lhs (call, size);
464 gimple_seq_add_stmt (seq, call);
466 return size;
469 /* Find the number of threads (POS = false), or thread number (POS =
470 true) for an OpenACC region partitioned as MASK. Setup code
471 required for the calculation is added to SEQ. */
473 static tree
474 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
476 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
477 unsigned ix;
479 /* Start at gang level, and examine relevant dimension indices. */
480 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
481 if (GOMP_DIM_MASK (ix) & mask)
483 if (res)
485 /* We had an outer index, so scale that by the size of
486 this dimension. */
487 tree n = oacc_dim_call (false, ix, seq);
488 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
490 if (pos)
492 /* Determine index in this dimension. */
493 tree id = oacc_dim_call (true, ix, seq);
494 if (res)
495 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
496 else
497 res = id;
501 if (res == NULL_TREE)
502 res = integer_zero_node;
504 return res;
507 /* Transform IFN_GOACC_LOOP calls to actual code. See
508 expand_oacc_for for where these are generated. At the vector
509 level, we stride loops, such that each member of a warp will
510 operate on adjacent iterations. At the worker and gang level,
511 each gang/warp executes a set of contiguous iterations. Chunking
512 can override this such that each iteration engine executes a
513 contiguous chunk, and then moves on to stride to the next chunk. */
515 static void
516 oacc_xform_loop (gcall *call)
518 gimple_stmt_iterator gsi = gsi_for_stmt (call);
519 enum ifn_goacc_loop_kind code
520 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
521 tree dir = gimple_call_arg (call, 1);
522 tree range = gimple_call_arg (call, 2);
523 tree step = gimple_call_arg (call, 3);
524 tree chunk_size = NULL_TREE;
525 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
526 tree lhs = gimple_call_lhs (call);
527 tree type = NULL_TREE;
528 tree diff_type = TREE_TYPE (range);
529 tree r = NULL_TREE;
530 gimple_seq seq = NULL;
531 bool chunking = false, striding = true;
532 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
533 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
535 /* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
536 if (!lhs)
538 gsi_replace_with_seq (&gsi, seq, true);
539 return;
542 type = TREE_TYPE (lhs);
544 #ifdef ACCEL_COMPILER
545 chunk_size = gimple_call_arg (call, 4);
546 if (integer_minus_onep (chunk_size) /* Force static allocation. */
547 || integer_zerop (chunk_size)) /* Default (also static). */
549 /* If we're at the gang level, we want each to execute a
550 contiguous run of iterations. Otherwise we want each element
551 to stride. */
552 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
553 chunking = false;
555 else
557 /* Chunk of size 1 is striding. */
558 striding = integer_onep (chunk_size);
559 chunking = !striding;
561 #endif
563 /* striding=true, chunking=true
564 -> invalid.
565 striding=true, chunking=false
566 -> chunks=1
567 striding=false,chunking=true
568 -> chunks=ceil (range/(chunksize*threads*step))
569 striding=false,chunking=false
570 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
571 push_gimplify_context (true);
573 switch (code)
575 default: gcc_unreachable ();
577 case IFN_GOACC_LOOP_CHUNKS:
578 if (!chunking)
579 r = build_int_cst (type, 1);
580 else
582 /* chunk_max
583 = (range - dir) / (chunks * step * num_threads) + dir */
584 tree per = oacc_thread_numbers (false, mask, &seq);
585 per = fold_convert (type, per);
586 chunk_size = fold_convert (type, chunk_size);
587 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
588 per = fold_build2 (MULT_EXPR, type, per, step);
589 r = build2 (MINUS_EXPR, type, range, dir);
590 r = build2 (PLUS_EXPR, type, r, per);
591 r = build2 (TRUNC_DIV_EXPR, type, r, per);
593 break;
595 case IFN_GOACC_LOOP_STEP:
597 /* If striding, step by the entire compute volume, otherwise
598 step by the inner volume. */
599 unsigned volume = striding ? mask : inner_mask;
601 r = oacc_thread_numbers (false, volume, &seq);
602 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
604 break;
606 case IFN_GOACC_LOOP_OFFSET:
607 /* Enable vectorization on non-SIMT targets. */
608 if (!targetm.simt.vf
609 && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
610 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
611 the loop. */
612 && (flag_tree_loop_vectorize
613 || !global_options_set.x_flag_tree_loop_vectorize))
615 basic_block bb = gsi_bb (gsi);
616 class loop *parent = bb->loop_father;
617 class loop *body = parent->inner;
619 parent->force_vectorize = true;
620 parent->safelen = INT_MAX;
622 /* "Chunking loops" may have inner loops. */
623 if (parent->inner)
625 body->force_vectorize = true;
626 body->safelen = INT_MAX;
629 cfun->has_force_vectorize_loops = true;
631 if (striding)
633 r = oacc_thread_numbers (true, mask, &seq);
634 r = fold_convert (diff_type, r);
636 else
638 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
639 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
640 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
641 inner_size, outer_size);
643 volume = fold_convert (diff_type, volume);
644 if (chunking)
645 chunk_size = fold_convert (diff_type, chunk_size);
646 else
648 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
650 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
651 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
652 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
655 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
656 fold_convert (diff_type, inner_size));
657 r = oacc_thread_numbers (true, outer_mask, &seq);
658 r = fold_convert (diff_type, r);
659 r = build2 (MULT_EXPR, diff_type, r, span);
661 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
662 inner = fold_convert (diff_type, inner);
663 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
665 if (chunking)
667 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
668 tree per
669 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
670 per = build2 (MULT_EXPR, diff_type, per, chunk);
672 r = build2 (PLUS_EXPR, diff_type, r, per);
675 r = fold_build2 (MULT_EXPR, diff_type, r, step);
676 if (type != diff_type)
677 r = fold_convert (type, r);
678 break;
680 case IFN_GOACC_LOOP_BOUND:
681 if (striding)
682 r = range;
683 else
685 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
686 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
687 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
688 inner_size, outer_size);
690 volume = fold_convert (diff_type, volume);
691 if (chunking)
692 chunk_size = fold_convert (diff_type, chunk_size);
693 else
695 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
697 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
698 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
699 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
702 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
703 fold_convert (diff_type, inner_size));
705 r = fold_build2 (MULT_EXPR, diff_type, span, step);
707 tree offset = gimple_call_arg (call, 6);
708 r = build2 (PLUS_EXPR, diff_type, r,
709 fold_convert (diff_type, offset));
710 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
711 diff_type, r, range);
713 if (diff_type != type)
714 r = fold_convert (type, r);
715 break;
718 gimplify_assign (lhs, r, &seq);
720 pop_gimplify_context (NULL);
722 gsi_replace_with_seq (&gsi, seq, true);
725 /* Transform a GOACC_TILE call. Determines the element loop span for
726 the specified loop of the nest. This is 1 if we're not tiling.
728 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
730 static void
731 oacc_xform_tile (gcall *call)
733 gimple_stmt_iterator gsi = gsi_for_stmt (call);
734 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
735 /* Inner loops have higher loop_nos. */
736 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
737 tree tile_size = gimple_call_arg (call, 2);
738 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
739 tree lhs = gimple_call_lhs (call);
740 tree type = TREE_TYPE (lhs);
741 gimple_seq seq = NULL;
742 tree span = build_int_cst (type, 1);
744 gcc_assert (!(e_mask
745 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
746 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
747 push_gimplify_context (!seen_error ());
749 #ifndef ACCEL_COMPILER
750 /* Partitioning disabled on host compilers. */
751 e_mask = 0;
752 #endif
753 if (!e_mask)
754 /* Not paritioning. */
755 span = integer_one_node;
756 else if (!integer_zerop (tile_size))
757 /* User explicitly specified size. */
758 span = tile_size;
759 else
761 /* Pick a size based on the paritioning of the element loop and
762 the number of loop nests. */
763 tree first_size = NULL_TREE;
764 tree second_size = NULL_TREE;
766 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
767 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
768 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
769 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
771 if (!first_size)
773 first_size = second_size;
774 second_size = NULL_TREE;
777 if (loop_no + 1 == collapse)
779 span = first_size;
780 if (!loop_no && second_size)
781 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
782 span, second_size);
784 else if (loop_no + 2 == collapse)
785 span = second_size;
786 else
787 span = NULL_TREE;
789 if (!span)
790 /* There's no obvious element size for this loop. Options
791 are 1, first_size or some non-unity constant (32 is my
792 favourite). We should gather some statistics. */
793 span = first_size;
796 span = fold_convert (type, span);
797 gimplify_assign (lhs, span, &seq);
799 pop_gimplify_context (NULL);
801 gsi_replace_with_seq (&gsi, seq, true);
804 /* Default partitioned and minimum partitioned dimensions. */
806 static int oacc_default_dims[GOMP_DIM_MAX];
807 static int oacc_min_dims[GOMP_DIM_MAX];
810 oacc_get_default_dim (int dim)
812 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
813 return oacc_default_dims[dim];
817 oacc_get_min_dim (int dim)
819 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
820 return oacc_min_dims[dim];
823 /* Parse the default dimension parameter. This is a set of
824 :-separated optional compute dimensions. Each specified dimension
825 is a positive integer. When device type support is added, it is
826 planned to be a comma separated list of such compute dimensions,
827 with all but the first prefixed by the colon-terminated device
828 type. */
830 static void
831 oacc_parse_default_dims (const char *dims)
833 int ix;
835 for (ix = GOMP_DIM_MAX; ix--;)
837 oacc_default_dims[ix] = -1;
838 oacc_min_dims[ix] = 1;
841 #ifndef ACCEL_COMPILER
842 /* Cannot be overridden on the host. */
843 dims = NULL;
844 #endif
845 if (dims)
847 const char *pos = dims;
849 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
851 if (ix)
853 if (*pos != ':')
854 goto malformed;
855 pos++;
858 if (*pos != ':')
860 long val;
861 const char *eptr;
863 errno = 0;
864 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
865 if (errno || val <= 0 || (int) val != val)
866 goto malformed;
867 pos = eptr;
868 oacc_default_dims[ix] = (int) val;
871 if (*pos)
873 malformed:
874 error_at (UNKNOWN_LOCATION,
875 "%<-fopenacc-dim%> operand is malformed at %qs", pos);
879 /* Allow the backend to validate the dimensions. */
880 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
881 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
884 /* Validate and update the dimensions for offloaded FN. ATTRS is the
885 raw attribute. DIMS is an array of dimensions, which is filled in.
886 LEVEL is the partitioning level of a routine, or -1 for an offload
887 region itself. USED is the mask of partitioned execution in the
888 function. */
890 static void
891 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
893 tree purpose[GOMP_DIM_MAX];
894 unsigned ix;
895 tree pos = TREE_VALUE (attrs);
897 /* Make sure the attribute creator attached the dimension
898 information. */
899 gcc_assert (pos);
901 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
903 purpose[ix] = TREE_PURPOSE (pos);
904 tree val = TREE_VALUE (pos);
905 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
906 pos = TREE_CHAIN (pos);
909 bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
911 /* Default anything left to 1 or a partitioned default. */
912 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
913 if (dims[ix] < 0)
915 /* The OpenACC spec says 'If the [num_gangs] clause is not
916 specified, an implementation-defined default will be used;
917 the default may depend on the code within the construct.'
918 (2.5.6). Thus an implementation is free to choose
919 non-unity default for a parallel region that doesn't have
920 any gang-partitioned loops. However, it appears that there
921 is a sufficient body of user code that expects non-gang
922 partitioned regions to not execute in gang-redundant mode.
923 So we (a) don't warn about the non-portability and (b) pick
924 the minimum permissible dimension size when there is no
925 partitioned execution. Otherwise we pick the global
926 default for the dimension, which the user can control. The
927 same wording and logic applies to num_workers and
928 vector_length, however the worker- or vector- single
929 execution doesn't have the same impact as gang-redundant
930 execution. (If the minimum gang-level partioning is not 1,
931 the target is probably too confusing.) */
932 dims[ix] = (used & GOMP_DIM_MASK (ix)
933 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
934 changed = true;
937 if (changed)
939 /* Replace the attribute with new values. */
940 pos = NULL_TREE;
941 for (ix = GOMP_DIM_MAX; ix--;)
942 pos = tree_cons (purpose[ix],
943 build_int_cst (integer_type_node, dims[ix]), pos);
944 oacc_replace_fn_attrib (fn, pos);
948 /* Create an empty OpenACC loop structure at LOC. */
950 static oacc_loop *
951 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
953 oacc_loop *loop = XCNEW (oacc_loop);
955 loop->parent = parent;
957 if (parent)
959 loop->sibling = parent->child;
960 parent->child = loop;
963 loop->loc = loc;
964 return loop;
967 /* Create an outermost, dummy OpenACC loop for offloaded function
968 DECL. */
970 static oacc_loop *
971 new_oacc_loop_outer (tree decl)
973 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
976 /* Start a new OpenACC loop structure beginning at head marker HEAD.
977 Link into PARENT loop. Return the new loop. */
979 static oacc_loop *
980 new_oacc_loop (oacc_loop *parent, gcall *marker)
982 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
984 loop->marker = marker;
986 /* TODO: This is where device_type flattening would occur for the loop
987 flags. */
989 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
991 tree chunk_size = integer_zero_node;
992 if (loop->flags & OLF_GANG_STATIC)
993 chunk_size = gimple_call_arg (marker, 4);
994 loop->chunk_size = chunk_size;
996 return loop;
999 /* Create a dummy loop encompassing a call to a openACC routine.
1000 Extract the routine's partitioning requirements. */
1002 static void
1003 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
1005 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
1006 int level = oacc_fn_attrib_level (attrs);
1008 gcc_assert (level >= 0);
1010 loop->marker = call;
1011 loop->routine = decl;
1012 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
1013 ^ (GOMP_DIM_MASK (level) - 1));
1016 /* Finish off the current OpenACC loop ending at tail marker TAIL.
1017 Return the parent loop. */
1019 static oacc_loop *
1020 finish_oacc_loop (oacc_loop *loop)
1022 /* If the loop has been collapsed, don't partition it. */
1023 if (loop->ifns.is_empty ())
1024 loop->mask = loop->flags = 0;
1025 return loop->parent;
1028 /* Free all OpenACC loop structures within LOOP (inclusive). */
1030 static void
1031 free_oacc_loop (oacc_loop *loop)
1033 if (loop->sibling)
1034 free_oacc_loop (loop->sibling);
1035 if (loop->child)
1036 free_oacc_loop (loop->child);
1038 loop->ifns.release ();
1039 free (loop);
1042 /* Dump out the OpenACC loop head or tail beginning at FROM. */
1044 static void
1045 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
1046 const char *title, int level)
1048 enum ifn_unique_kind kind
1049 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1051 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
1052 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1054 gimple *stmt = gsi_stmt (gsi);
1056 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1058 enum ifn_unique_kind k
1059 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
1060 (gimple_call_arg (stmt, 0)));
1062 if (k == kind && stmt != from)
1063 break;
1065 print_gimple_stmt (file, stmt, depth * 2 + 2);
1067 gsi_next (&gsi);
1068 while (gsi_end_p (gsi))
1069 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
/* Dump OpenACC loop LOOP, its children, and its siblings to FILE,
   indented by DEPTH.  */

static void
dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
{
  int ix;

  fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
	   loop->flags, loop->mask,
	   LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));

  if (loop->marker)
    print_gimple_stmt (file, loop->marker, depth * 2);

  if (loop->routine)
    fprintf (file, "%*sRoutine %s:%u:%s\n",
	     depth * 2, "", DECL_SOURCE_FILE (loop->routine),
	     DECL_SOURCE_LINE (loop->routine),
	     IDENTIFIER_POINTER (DECL_NAME (loop->routine)));

  /* Heads outermost-first, tails innermost-first, mirroring their
     nesting in the IL.  */
  for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
    if (loop->heads[ix])
      dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
  for (ix = GOMP_DIM_MAX; ix--;)
    if (loop->tails[ix])
      dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);

  if (loop->child)
    dump_oacc_loop (file, loop->child, depth + 1);
  if (loop->sibling)
    dump_oacc_loop (file, loop->sibling, depth);
}
void debug_oacc_loop (oacc_loop *);

/* Dump loops to stderr.  Callable from the debugger.  */

DEBUG_FUNCTION void
debug_oacc_loop (oacc_loop *loop)
{
  dump_oacc_loop (stderr, loop, 0);
}
1116 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1117 siblings. */
1119 static void
1120 inform_oacc_loop (const oacc_loop *loop)
1122 const char *gang
1123 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1124 const char *worker
1125 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1126 const char *vector
1127 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1128 const char *seq = loop->mask == 0 ? " seq" : "";
1129 const dump_user_location_t loc
1130 = dump_user_location_t::from_location_t (loop->loc);
1131 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1132 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1133 vector, seq);
1135 if (loop->child)
1136 inform_oacc_loop (loop->child);
1137 if (loop->sibling)
1138 inform_oacc_loop (loop->sibling);
/* DFS walk of basic blocks BB onwards, creating OpenACC loop
   structures as we go.  By construction these loops are properly
   nested.

   MARKER counts head/tail marker calls seen in the current marker
   sequence; REMAINING counts how many are still expected.  While a
   marker sequence is open we must stay on the single-successor
   chain (see the goto at the bottom).  */

static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  int marker = 0;
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    /* Routine calls must not appear inside an open marker
	       sequence.  */
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      /* A 2-argument marker terminates the sequence; the
		 3-argument form carries the expected marker count.  */
	      if (gimple_call_num_args (call) == 2)
		{
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      /* First marker of a sequence: a head opens a
			 new (child) loop.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads are recorded outermost-first, tails
			 innermost-first.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  if (remaining || marker)
    {
      /* Marker sequence still open: it must continue in the unique
	 successor block.  */
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
/* LOOP is the first sibling.  Reverse the order in place and return
   the new first sibling.  Recurse to child loops.  */

static oacc_loop *
oacc_loop_sibling_nreverse (oacc_loop *loop)
{
  oacc_loop *last = NULL;
  do
    {
      if (loop->child)
	loop->child = oacc_loop_sibling_nreverse (loop->child);

      /* Classic in-place list reversal: unhook LOOP and push it onto
	 the front of LAST.  */
      oacc_loop *next = loop->sibling;
      loop->sibling = last;
      last = loop;
      loop = next;
    }
  while (loop);

  return last;
}
/* Discover the OpenACC loops marked up by HEAD and TAIL markers for
   the current function.  Returns the root of the loop tree; the root
   is a dummy loop encompassing the whole function.  */

static oacc_loop *
oacc_loop_discovery ()
{
  /* Clear basic block flags, in particular BB_VISITED which we're going to use
     in the following.  */
  clear_bb_flags ();

  oacc_loop *top = new_oacc_loop_outer (current_function_decl);
  oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));

  /* The siblings were constructed in reverse order, reverse them so
     that diagnostics come out in an unsurprising order.  */
  top = oacc_loop_sibling_nreverse (top);

  return top;
}
/* Transform the abstract internal function markers starting at FROM
   to be for partitioning level LEVEL.  Stop when we meet another HEAD
   or TAIL marker.  */

static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  /* The marker kind of FROM determines where this head/tail sequence
     ends.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* FORK/JOIN take the level in argument 2.  */
	  if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  else if (k == kind && stmt != from)
	    break;
	}
      /* GOACC_REDUCTION takes the level in argument 3.  */
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	*gimple_call_arg_ptr (stmt, 3) = replacement;

      gsi_next (&gsi);
      /* Follow the single-successor chain across block boundaries.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
/* Process the discovered OpenACC loops, setting the correct
   partitioning level etc.  Rewrites the recorded IFN_GOACC_LOOP and
   IFN_GOACC_TILE calls with the final masks, then retargets each
   head/tail marker sequence to its assigned axis.  */

static void
oacc_loop_process (oacc_loop *loop)
{
  if (loop->child)
    oacc_loop_process (loop->child);

  /* Routines carry their mask as a requirement only; there is
     nothing to rewrite for them.  */
  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_GOACC_LOOP:
	    {
	      /* Argument 5 == -1 marks the element-loop variant.  */
	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
	      gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
	      if (!is_e)
		gimple_call_set_arg (call, 4, chunk_arg);
	    }
	    break;

	  case IFN_GOACC_TILE:
	    gimple_call_set_arg (call, 3, mask_arg);
	    gimple_call_set_arg (call, 4, e_mask_arg);
	    break;

	  default:
	    gcc_unreachable ();
	  }

      /* Retarget each head/tail pair, outermost first, to the
	 successive set bits of the combined mask.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling);
}
/* Walk the OpenACC loop hierarchy checking and assigning the
   programmer-specified partitionings.  OUTER_MASK is the partitioning
   this loop is contained within.  Return mask of partitioning
   encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
   bit.  */

static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* The explicitly requested axes live in the flags word above
	 OLF_DIM_BASE.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
	 loop, or (no more than) single axis tiled loop.  */
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* Explicit axes, auto and seq are mutually exclusive.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
		      : G_("%<auto%> conflicts with other OpenACC loop "
			   "specifiers"));
	  maybe_auto = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}

      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
	{
	  loop->flags |= OLF_AUTO;
	  /* GOMP_DIM_MAX bit signals "auto partitioning needed" to
	     the caller.  */
	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
	}
    }

  if (this_mask & outer_mask)
    {
      /* This loop re-uses an axis some containing construct already
	 claimed.  Find which one, for the diagnostic.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if ((outer->mask | outer->e_mask) & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			loop->routine
			? G_("routine call uses same OpenACC parallelism"
			     " as containing loop")
			: G_("inner loop uses same OpenACC parallelism"
			     " as containing loop"));
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    error_at (loop->loc,
		      loop->routine
		      ? G_("routine call uses OpenACC parallelism disallowed"
			   " by containing routine")
		      : G_("loop uses OpenACC parallelism disallowed"
			   " by containing routine"));

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      /* Drop the conflicting axes.  */
      this_mask &= ~outer_mask;
    }
  else
    {
      unsigned outermost = least_bit_hwi (this_mask);

      /* An inner loop must use a strictly inner axis than its
	 containing construct.  */
      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
	 that we put worker there.  The std doesn't contemplate
	 specifying all three.  We choose to put worker and vector on
	 the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);

  return mask_all;
}
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */

static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  /* Only loops marked both auto and independent are ours to
     partition.  */
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition.  */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything.  */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop.  */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
					       outer_assign | assign);
    }

  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one used by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
					     outer_mask, outer_assign);

  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
/* Walk the OpenACC loop hierarchy to check and assign partitioning
   axes.  Return mask of partitioning.  First honor the explicitly
   specified partitionings; if any auto loops were found (signalled
   via the GOMP_DIM_MAX bit), run the auto-partitioner as well.  */

static unsigned
oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
{
  unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);

  if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
    {
      /* Clear the "auto needed" flag bit before merging in the
	 auto-assigned axes.  */
      mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
      mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
    }

  return mask_all;
}
1666 /* Default fork/join early expander. Delete the function calls if
1667 there is no RTL expander. */
1669 bool
1670 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1671 const int *ARG_UNUSED (dims), bool is_fork)
1673 if (is_fork)
1674 return targetm.have_oacc_fork ();
1675 else
1676 return targetm.have_oacc_join ();
/* Default goacc.reduction early expander.

   LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
   If RES_PTR is not integer-zerop:
       SETUP - emit 'LHS = *RES_PTR', LHS = NULL
       TEARDOWN - emit '*RES_PTR = VAR'
   If LHS is not NULL
       emit 'LHS = VAR'   */

void
default_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  gimple_seq seq = NULL;

  if (code == IFN_GOACC_REDUCTION_SETUP
      || code == IFN_GOACC_REDUCTION_TEARDOWN)
    {
      /* Setup and Teardown need to copy from/to the receiver object,
	 if there is one.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	{
	  tree dst = build_simple_mem_ref (ref_to_res);
	  tree src = var;

	  if (code == IFN_GOACC_REDUCTION_SETUP)
	    {
	      /* SETUP copies in the opposite direction, and the
		 receiver load already provides the result, so no
		 trailing 'LHS = VAR' copy is wanted.  */
	      src = dst;
	      dst = lhs;
	      lhs = NULL;
	    }
	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
	}
    }

  /* Copy VAR to LHS, if there is an LHS.  */
  if (lhs)
    gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));

  /* Replace the IFN call with the (possibly empty) sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
/* Main entry point for oacc transformations which run on the device
   compiler after LTO, so we know what the target device is at this
   point (including the host fallback).  Discovers the OpenACC loop
   structure, assigns and validates partitioning, then lowers the
   abstract internal functions to target-specific code.  */

static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Parse the default dim argument exactly once.  */
  if ((const void *)flag_openacc_dims != &flag_openacc_dims)
    {
      oacc_parse_default_dims (flag_openacc_dims);
      /* Overwrite the option with a sentinel (its own address) so we
	 never parse it a second time.  */
      flag_openacc_dims = (char *)&flag_openacc_dims;
    }

  bool is_oacc_kernels
    = (lookup_attribute ("oacc kernels",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_kernels_parallelized
    = (lookup_attribute ("oacc kernels parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);

  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels, so remove the parallelism dimensions function attributes
     potentially set earlier on.  */
  if (is_oacc_kernels && !is_oacc_kernels_parallelized)
    {
      oacc_set_fn_attrib (current_function_decl, NULL, NULL);
      attrs = oacc_get_fn_attrib (current_function_decl);
    }

  /* Discover, partition and process the loops.  */
  oacc_loop *loops = oacc_loop_discovery ();
  int fn_level = oacc_fn_attrib_level (attrs);

  if (dump_file)
    {
      if (fn_level >= 0)
	fprintf (dump_file, "Function is OpenACC routine level %d\n",
		 fn_level);
      else if (is_oacc_kernels)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 (is_oacc_kernels_parallelized
		  ? "parallelized" : "unparallelized"));
      else
	fprintf (dump_file, "Function is OpenACC parallel offload\n");
    }

  /* A routine at level N may not use axes outside [N, GOMP_DIM_MAX).  */
  unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
  /* OpenACC kernels constructs are special: they currently don't use the
     generic oacc_loop infrastructure and attribute/dimension processing.  */
  if (is_oacc_kernels && is_oacc_kernels_parallelized)
    {
      /* Parallelized OpenACC kernels constructs use gang parallelism.  See
	 also tree-parloops.c:create_parallel_loop.  */
      used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
    }

  int dims[GOMP_DIM_MAX];
  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);

  if (dump_file)
    {
      const char *comma = "Compute dimensions [";
      for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
	fprintf (dump_file, "%s%d", comma, dims[ix]);
      fprintf (dump_file, "]\n");
    }

  oacc_loop_process (loops);
  if (dump_file)
    {
      fprintf (dump_file, "OpenACC loops\n");
      dump_oacc_loop (dump_file, loops, 0);
      fprintf (dump_file, "\n");
    }
  if (dump_enabled_p ())
    {
      oacc_loop *l = loops;
      /* OpenACC kernels constructs are special: they currently don't use the
	 generic oacc_loop infrastructure.  */
      if (is_oacc_kernels)
	{
	  /* Create a fake oacc_loop for diagnostic purposes.  */
	  l = new_oacc_loop_raw (NULL,
				 DECL_SOURCE_LOCATION (current_function_decl));
	  l->mask = used_mask;
	}
      else
	{
	  /* Skip the outermost, dummy OpenACC loop.  */
	  l = l->child;
	}

      if (l)
	inform_oacc_loop (l);
      if (is_oacc_kernels)
	free_oacc_loop (l);
    }

  /* Offloaded targets may introduce new basic blocks, which require
     dominance information to update SSA.  */
  calculate_dominance_info (CDI_DOMINATORS);

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan.  The expanders may replace CALL
	   with a sequence we want to iterate over again.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_TILE:
	    oacc_xform_tile (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  break;

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* Level -1 means an unused axis; otherwise let the
		     target decide whether to keep the call.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  remove = true;
		  break;
		}
	      break;
	    }
	  }

	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass, false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  free_oacc_loop (loops);

  return 0;
}
1947 /* Default launch dimension validator. Force everything to 1. A
1948 backend that wants to provide larger dimensions must override this
1949 hook. */
1951 bool
1952 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1953 int ARG_UNUSED (fn_level),
1954 unsigned ARG_UNUSED (used))
1956 bool changed = false;
1958 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1960 if (dims[ix] != 1)
1962 dims[ix] = 1;
1963 changed = true;
1967 return changed;
/* Default dimension bound is unknown on accelerator and 1 on host.
   Returning 0 means "no static limit known".  */

int
default_goacc_dim_limit (int ARG_UNUSED (axis))
{
#ifdef ACCEL_COMPILER
  return 0;
#else
  return 1;
#endif
}
namespace {

/* Pass descriptor for the OpenACC device lowering pass.  */

const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_device_lower ();
    }

}; // class pass_oacc_device_lower

} // anon namespace

/* Factory function used by the pass manager.  */

gimple_opt_pass *
make_pass_oacc_device_lower (gcc::context *ctxt)
{
  return new pass_oacc_device_lower (ctxt);
}
/* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
   GOMP_SIMT_ENTER call identifying the privatized variables, which are
   turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
   Set *REGIMPLIFY to true, except if no privatized variables were seen.  */

static void
ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
{
  gimple *alloc_stmt = gsi_stmt (*gsi);
  tree simtrec = gimple_call_lhs (alloc_stmt);
  tree simduid = gimple_call_arg (alloc_stmt, 0);
  gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
  gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
  /* Build an artificial record type to hold the privatized copies.  */
  tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
  TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
  TREE_ADDRESSABLE (rectype) = 1;
  TREE_TYPE (simtrec) = build_pointer_type (rectype);
  /* Arguments 1.. of the ENTER call are &VAR addresses of the
     privatized variables (or null placeholders).  */
  for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
    {
      tree *argp = gimple_call_arg_ptr (enter_stmt, i);
      if (*argp == null_pointer_node)
	continue;
      gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
		  && VAR_P (TREE_OPERAND (*argp, 0)));
      tree var = TREE_OPERAND (*argp, 0);

      /* Mirror VAR as a field of the record, preserving alignment
	 and volatility.  */
      tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
			       DECL_NAME (var), TREE_TYPE (var));
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);

      insert_field_into_struct (rectype, field);

      /* Redirect all uses of VAR to SIMTREC->FIELD via a value-expr.  */
      tree t = build_simple_mem_ref (simtrec);
      t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
      TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
      SET_DECL_VALUE_EXPR (var, t);
      DECL_HAS_VALUE_EXPR_P (var) = 1;
      *regimplify = true;
    }
  layout_type (rectype);
  tree size = TYPE_SIZE_UNIT (rectype);
  tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));

  /* Replace the ALLOC call with one carrying the now-known size and
     alignment of the record.  */
  alloc_stmt
    = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
  gimple_call_set_lhs (alloc_stmt, simtrec);
  gsi_replace (gsi, alloc_stmt, false);
  /* Reduce the ENTER call to a plain copy of its simduid argument.  */
  gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
  enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
  gsi_replace (&enter_gsi, enter_stmt, false);

  use_operand_p use;
  gimple *exit_stmt;
  if (single_imm_use (simtrec, &use, &exit_stmt))
    {
      /* Clobber the record before the matching EXIT so its lifetime
	 ends there.  */
      gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
      gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
      tree clobber = build_clobber (rectype);
      exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
      gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
    }
  else
    gcc_checking_assert (has_zero_uses (simtrec));
}
2090 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
2092 static tree
2093 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2095 tree t = *tp;
2097 if (VAR_P (t)
2098 && DECL_HAS_VALUE_EXPR_P (t)
2099 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
2101 *walk_subtrees = 0;
2102 return t;
2104 return NULL_TREE;
/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
   LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
   internal functions on non-SIMT targets, and likewise some SIMD internal
   functions on SIMT targets.  */

static unsigned int
execute_omp_device_lower ()
{
  /* VF == 1 means "this target has no SIMT execution".  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  bool calls_declare_variant_alt
    = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  continue;
	if (!gimple_call_internal_p (stmt))
	  {
	    /* Non-internal calls may still need 'declare variant'
	       resolution now that the device is known.  */
	    if (calls_declare_variant_alt)
	      if (tree fndecl = gimple_call_fndecl (stmt))
		{
		  tree new_fndecl = omp_resolve_declare_variant (fndecl);
		  if (new_fndecl != fndecl)
		    {
		      gimple_call_set_fndecl (stmt, new_fndecl);
		      update_stmt (stmt);
		    }
		}
	    continue;
	  }
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	/* Each case selects the replacement RHS; NULL_TREE keeps the
	   call for later RTL expansion.  */
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    if (vf != 1)
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    if (vf != 1)
	      continue;
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  /* Variables given a DECL_VALUE_EXPR above need their uses
     regimplified; clobbers of the old variables are dropped.  */
  if (regimplify)
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
namespace {

/* Pass descriptor for the OpenMP device lowering pass.  */

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *fun)
    {
      /* Run if lowering hasn't happened yet, or if 'declare variant'
	 calls still need resolving under -fopenmp.  */
      return (!(fun->curr_properties & PROP_gimple_lomp_dev)
	      || (flag_openmp
		  && cgraph_node::get (fun->decl)->calls_declare_variant_alt));
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }

}; // class pass_omp_device_lower

} // anon namespace

/* Factory function used by the pass manager.  */

gimple_opt_pass *
make_pass_omp_device_lower (gcc::context *ctxt)
{
  return new pass_omp_device_lower (ctxt);
}
/* "omp declare target link" handling pass. */
2256 namespace {
2258 const pass_data pass_data_omp_target_link =
2260 GIMPLE_PASS, /* type */
2261 "omptargetlink", /* name */
2262 OPTGROUP_OMP, /* optinfo_flags */
2263 TV_NONE, /* tv_id */
2264 PROP_ssa, /* properties_required */
2265 0, /* properties_provided */
2266 0, /* properties_destroyed */
2267 0, /* todo_flags_start */
2268 TODO_update_ssa, /* todo_flags_finish */
2271 class pass_omp_target_link : public gimple_opt_pass
2273 public:
2274 pass_omp_target_link (gcc::context *ctxt)
2275 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
2278 /* opt_pass methods: */
2279 virtual bool gate (function *fun)
2281 #ifdef ACCEL_COMPILER
2282 return offloading_function_p (fun->decl);
2283 #else
2284 (void) fun;
2285 return false;
2286 #endif
2289 virtual unsigned execute (function *);
2292 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2294 static tree
2295 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2297 tree t = *tp;
2299 if (VAR_P (t)
2300 && DECL_HAS_VALUE_EXPR_P (t)
2301 && is_global_var (t)
2302 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2304 *walk_subtrees = 0;
2305 return t;
2308 return NULL_TREE;
2311 unsigned
2312 pass_omp_target_link::execute (function *fun)
2314 basic_block bb;
2315 FOR_EACH_BB_FN (bb, fun)
2317 gimple_stmt_iterator gsi;
2318 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2319 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2320 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2323 return 0;
2326 } // anon namespace
2328 gimple_opt_pass *
2329 make_pass_omp_target_link (gcc::context *ctxt)
2331 return new pass_omp_target_link (ctxt);