/* Scraped web-viewer header (not part of the source proper):
   re PR fortran/88376 (ICE in is_illegal_recursion, at fortran/resolve.c:1689)
   [official-gcc.git] / gcc / omp-offload.c
   blob 9cac5655c63e9f08b95d786e0255907608e5dfdc  */
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
4 Copyright (C) 2005-2019 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
/* Describe the OpenACC looping structure of a function.  The entire
   function is held in a 'NULL' loop.  */

struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  /* One head/tail marker per partitioning level (gang/worker/vector).  */
  gcall *heads[GOMP_DIM_MAX]; /* Head marker functions.  */
  gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions.  */

  tree routine; /* Pseudo-loop enclosing a routine.  */

  unsigned mask;   /* Partitioning mask.  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner;  /* Partitioning of inner loops.  */
  unsigned flags;  /* Partitioning flags.  */
  vec<gcall *> ifns;  /* Contained loop abstraction functions.  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};

/* Holds offload tables with decls.  */
vec<tree, va_gc> *offload_funcs, *offload_vars;
88 /* Return level at which oacc routine may spawn a partitioned loop, or
89 -1 if it is not a routine (i.e. is an offload fn). */
91 int
92 oacc_fn_attrib_level (tree attr)
94 tree pos = TREE_VALUE (attr);
96 if (!TREE_PURPOSE (pos))
97 return -1;
99 int ix = 0;
100 for (ix = 0; ix != GOMP_DIM_MAX;
101 ix++, pos = TREE_CHAIN (pos))
102 if (!integer_zerop (TREE_PURPOSE (pos)))
103 break;
105 return ix;
/* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
   adds their addresses and sizes to constructor-vector V_CTOR.  */

static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  /* On the device side a link var has been given a value expr
	     pointing at its table slot.  */
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));

      /* Only variables contribute a size entry; functions contribute
	 just their address.  */
      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
/* Create new symbols containing (address, size) pairs for global variables,
   marked with "omp declare target" attribute, as well as addresses for the
   functions, which are outlined offloading regions.  */
void
omp_finish_file (void)
{
  unsigned num_funcs = vec_safe_length (offload_funcs);
  unsigned num_vars = vec_safe_length (offload_vars);

  if (num_funcs == 0 && num_vars == 0)
    return;

  if (targetm_common.have_named_sections)
    {
      /* Variables contribute two entries each (address, size); functions
	 one (address).  */
      vec<constructor_elt, va_gc> *v_f, *v_v;
      vec_alloc (v_f, num_funcs);
      vec_alloc (v_v, num_vars * 2);

      add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
      add_decls_addresses_to_decl_constructor (offload_vars, v_v);

      tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
						    num_vars * 2);
      tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
						     num_funcs);
      SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      tree ctor_v = build_constructor (vars_decl_type, v_v);
      tree ctor_f = build_constructor (funcs_decl_type, v_f);
      TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
      TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
      tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				    get_identifier (".offload_func_table"),
				    funcs_decl_type);
      tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				   get_identifier (".offload_var_table"),
				   vars_decl_type);
      TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
      /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
	 otherwise a joint table in a binary will contain padding between
	 tables from multiple object files.  */
      DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
      SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
      SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
      DECL_INITIAL (funcs_decl) = ctor_f;
      DECL_INITIAL (vars_decl) = ctor_v;
      set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
      set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);

      varpool_node::finalize_decl (vars_decl);
      varpool_node::finalize_decl (funcs_decl);
    }
  else
    {
      /* No named sections: hand each symbol to the target hook
	 individually.  */
      for (unsigned i = 0; i < num_funcs; i++)
	{
	  tree it = (*offload_funcs)[i];
	  targetm.record_offload_symbol (it);
	}
      for (unsigned i = 0; i < num_vars; i++)
	{
	  tree it = (*offload_vars)[i];
	  targetm.record_offload_symbol (it);
	}
    }
}
228 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
229 axis DIM. Return a tmp var holding the result. */
231 static tree
232 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
234 tree arg = build_int_cst (unsigned_type_node, dim);
235 tree size = create_tmp_var (integer_type_node);
236 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
237 gimple *call = gimple_build_call_internal (fn, 1, arg);
239 gimple_call_set_lhs (call, size);
240 gimple_seq_add_stmt (seq, call);
242 return size;
/* Find the number of threads (POS = false), or thread number (POS =
   true) for an OpenACC region partitioned as MASK.  Setup code
   required for the calculation is added to SEQ.  */

static tree
oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
{
  /* For a size query start the product at 1; for a position query
     start empty and accumulate outer-to-inner.  */
  tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
  unsigned ix;

  /* Start at gang level, and examine relevant dimension indices.  */
  for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
    if (GOMP_DIM_MASK (ix) & mask)
      {
	if (res)
	  {
	    /* We had an outer index, so scale that by the size of
	       this dimension.  */
	    tree n = oacc_dim_call (false, ix, seq);
	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
	  }
	if (pos)
	  {
	    /* Determine index in this dimension.  */
	    tree id = oacc_dim_call (true, ix, seq);
	    if (res)
	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
	    else
	      res = id;
	  }
      }

  /* MASK was empty and POS was requested: the thread number is 0.  */
  if (res == NULL_TREE)
    res = integer_zero_node;

  return res;
}
/* Transform IFN_GOACC_LOOP calls to actual code.  See
   expand_oacc_for for where these are generated.  At the vector
   level, we stride loops, such that each member of a warp will
   operate on adjacent iterations.  At the worker and gang level,
   each gang/warp executes a set of contiguous iterations.  Chunking
   can override this such that each iteration engine executes a
   contiguous chunk, and then moves on to stride to the next chunk.  */

static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (call, 1);
  tree range = gimple_call_arg (call, 2);
  tree step = gimple_call_arg (call, 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

#ifdef ACCEL_COMPILER
  /* Only the device compiler looks at the requested chunking; the
     host keeps the striding/no-chunking defaults set above.  */
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (false, mask, &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (false, volume, &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      /* Enable vectorization on non-SIMT targets.  */
      if (!targetm.simt.vf
	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
	     the loop.  */
	  && (flag_tree_loop_vectorize
	      || !global_options_set.x_flag_tree_loop_vectorize))
	{
	  basic_block bb = gsi_bb (gsi);
	  struct loop *parent = bb->loop_father;
	  struct loop *body = parent->inner;

	  parent->force_vectorize = true;
	  parent->safelen = INT_MAX;

	  /* "Chunking loops" may have inner loops.  */
	  if (parent->inner)
	    {
	      body->force_vectorize = true;
	      body->safelen = INT_MAX;
	    }

	  cfun->has_force_vectorize_loops = true;
	}
      if (striding)
	{
	  r = oacc_thread_numbers (true, mask, &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = (range + per - dir) / per, i.e. one chunk
		 covering the whole range per thread volume.  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (true, outer_mask, &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Advance by the chunk number (arg 6) times the volume
		 covered by one chunk.  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      if (striding)
	r = range;
      else
	{
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  /* The bound is the chunk offset (arg 6) plus one chunk's
	     extent, clamped to the loop range.  */
	  tree offset = gimple_call_arg (call, 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
/* Transform a GOACC_TILE call.  Determines the element loop span for
   the specified loop of the nest.  This is 1 if we're not tiling.

   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */

static void
oacc_xform_tile (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
  /* Inner loops have higher loop_nos.  */
  unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
  tree tile_size = gimple_call_arg (call, 2);
  unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  gimple_seq seq = NULL;
  tree span = build_int_cst (type, 1);

  /* Element loops may only be worker and/or vector partitioned.  */
  gcc_assert (!(e_mask
		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
  push_gimplify_context (!seen_error ());

#ifndef ACCEL_COMPILER
  /* Partitioning disabled on host compilers.  */
  e_mask = 0;
#endif
  if (!e_mask)
    /* Not partitioning.  */
    span = integer_one_node;
  else if (!integer_zerop (tile_size))
    /* User explicitly specified size.  */
    span = tile_size;
  else
    {
      /* Pick a size based on the partitioning of the element loop and
	 the number of loop nests.  */
      tree first_size = NULL_TREE;
      tree second_size = NULL_TREE;

      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);

      if (!first_size)
	{
	  first_size = second_size;
	  second_size = NULL_TREE;
	}

      if (loop_no + 1 == collapse)
	{
	  /* Innermost loop: use the first (vector) size, scaled by the
	     second when this is also the outermost loop.  */
	  span = first_size;
	  if (!loop_no && second_size)
	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
				span, second_size);
	}
      else if (loop_no + 2 == collapse)
	span = second_size;
      else
	span = NULL_TREE;

      if (!span)
	/* There's no obvious element size for this loop.  Options
	   are 1, first_size or some non-unity constant (32 is my
	   favourite).  We should gather some statistics.  */
	span = first_size;
    }

  span = fold_convert (type, span);
  gimplify_assign (lhs, span, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
/* Default partitioned and minimum partitioned dimensions.  Filled in
   by oacc_parse_default_dims and read by oacc_validate_dims.  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
577 oacc_get_default_dim (int dim)
579 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
580 return oacc_default_dims[dim];
584 oacc_get_min_dim (int dim)
586 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
587 return oacc_min_dims[dim];
/* Parse the default dimension parameter.  This is a set of
   :-separated optional compute dimensions.  Each specified dimension
   is a positive integer.  When device type support is added, it is
   planned to be a comma separated list of such compute dimensions,
   with all but the first prefixed by the colon-terminated device
   type.  */

static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  /* Reset: -1 means "unspecified", minimum defaults to 1.  */
  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }
#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  /* Subsequent dimensions must be preceded by ':'.  */
	  if (ix)
	    {
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An immediately following ':' (or end) leaves this
	     dimension unspecified.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
	      /* Reject parse errors, non-positive values and values
		 that do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "-fopenacc-dim operand is malformed at '%s'", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
}
/* Validate and update the dimensions for offloaded FN.  ATTRS is the
   raw attribute.  DIMS is an array of dimensions, which is filled in.
   LEVEL is the partitioning level of a routine, or -1 for an offload
   region itself.  USED is the mask of partitioned execution in the
   function.  */

static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
  tree purpose[GOMP_DIM_MAX];
  unsigned ix;
  tree pos = TREE_VALUE (attrs);

  /* Make sure the attribute creator attached the dimension
     information.  */
  gcc_assert (pos);

  /* Extract the per-dimension (purpose, value) pairs; a NULL value
     means the dimension is unspecified (-1).  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    {
      purpose[ix] = TREE_PURPOSE (pos);
      tree val = TREE_VALUE (pos);
      dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
      pos = TREE_CHAIN (pos);
    }

  bool changed = targetm.goacc.validate_dims (fn, dims, level);

  /* Default anything left to 1 or a partitioned default.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    if (dims[ix] < 0)
      {
	/* The OpenACC spec says 'If the [num_gangs] clause is not
	   specified, an implementation-defined default will be used;
	   the default may depend on the code within the construct.'
	   (2.5.6).  Thus an implementation is free to choose
	   non-unity default for a parallel region that doesn't have
	   any gang-partitioned loops.  However, it appears that there
	   is a sufficient body of user code that expects non-gang
	   partitioned regions to not execute in gang-redundant mode.
	   So we (a) don't warn about the non-portability and (b) pick
	   the minimum permissible dimension size when there is no
	   partitioned execution.  Otherwise we pick the global
	   default for the dimension, which the user can control.  The
	   same wording and logic applies to num_workers and
	   vector_length, however the worker- or vector- single
	   execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partitioning is not 1,
	   the target is probably too confusing.)  */
	dims[ix] = (used & GOMP_DIM_MASK (ix)
		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
	changed = true;
      }

  if (changed)
    {
      /* Replace the attribute with new values.  */
      pos = NULL_TREE;
      for (ix = GOMP_DIM_MAX; ix--;)
	pos = tree_cons (purpose[ix],
			 build_int_cst (integer_type_node, dims[ix]), pos);
      oacc_replace_fn_attrib (fn, pos);
    }
}
715 /* Create an empty OpenACC loop structure at LOC. */
717 static oacc_loop *
718 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
720 oacc_loop *loop = XCNEW (oacc_loop);
722 loop->parent = parent;
724 if (parent)
726 loop->sibling = parent->child;
727 parent->child = loop;
730 loop->loc = loc;
731 return loop;
734 /* Create an outermost, dummy OpenACC loop for offloaded function
735 DECL. */
737 static oacc_loop *
738 new_oacc_loop_outer (tree decl)
740 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
743 /* Start a new OpenACC loop structure beginning at head marker HEAD.
744 Link into PARENT loop. Return the new loop. */
746 static oacc_loop *
747 new_oacc_loop (oacc_loop *parent, gcall *marker)
749 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
751 loop->marker = marker;
753 /* TODO: This is where device_type flattening would occur for the loop
754 flags. */
756 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
758 tree chunk_size = integer_zero_node;
759 if (loop->flags & OLF_GANG_STATIC)
760 chunk_size = gimple_call_arg (marker, 4);
761 loop->chunk_size = chunk_size;
763 return loop;
/* Create a dummy loop encompassing a call to a openACC routine.
   Extract the routine's partitioning requirements.  */

static void
new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
{
  oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
  int level = oacc_fn_attrib_level (attrs);

  /* Callers only reach here for decls with an oacc function attribute,
     so this must be a routine, not an offload region.  */
  gcc_assert (level >= 0);

  loop->marker = call;
  loop->routine = decl;
  /* Set the mask bits for LEVEL and all higher (inner) dimensions:
     bits [level, GOMP_DIM_MAX).  */
  loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
		^ (GOMP_DIM_MASK (level) - 1));
}
783 /* Finish off the current OpenACC loop ending at tail marker TAIL.
784 Return the parent loop. */
786 static oacc_loop *
787 finish_oacc_loop (oacc_loop *loop)
789 /* If the loop has been collapsed, don't partition it. */
790 if (loop->ifns.is_empty ())
791 loop->mask = loop->flags = 0;
792 return loop->parent;
795 /* Free all OpenACC loop structures within LOOP (inclusive). */
797 static void
798 free_oacc_loop (oacc_loop *loop)
800 if (loop->sibling)
801 free_oacc_loop (loop->sibling);
802 if (loop->child)
803 free_oacc_loop (loop->child);
805 loop->ifns.release ();
806 free (loop);
/* Dump out the OpenACC loop head or tail beginning at FROM.  */

static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  /* Stop at the next marker of the same kind -- it terminates
	     this head/tail sequence.  */
	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2);

      /* The sequence may span basic blocks; advance through
	 single-successor blocks until the next statement.  */
      gsi_next (&gsi);
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
/* Dump OpenACC loop LOOP, its children, and its siblings.  */

static void
dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
{
  int ix;

  fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
	   loop->flags, loop->mask,
	   LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));

  if (loop->marker)
    print_gimple_stmt (file, loop->marker, depth * 2);

  if (loop->routine)
    fprintf (file, "%*sRoutine %s:%u:%s\n",
	     depth * 2, "", DECL_SOURCE_FILE (loop->routine),
	     DECL_SOURCE_LINE (loop->routine),
	     IDENTIFIER_POINTER (DECL_NAME (loop->routine)));

  /* Heads are dumped outermost-first, tails innermost-first, matching
     their nesting in the IL.  */
  for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
    if (loop->heads[ix])
      dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
  for (ix = GOMP_DIM_MAX; ix--;)
    if (loop->tails[ix])
      dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);

  if (loop->child)
    dump_oacc_loop (file, loop->child, depth + 1);
  if (loop->sibling)
    dump_oacc_loop (file, loop->sibling, depth);
}
/* Forward declaration so the definition below is reachable from a
   debugger.  */
void debug_oacc_loop (oacc_loop *);

/* Dump loops to stderr.  */

DEBUG_FUNCTION void
debug_oacc_loop (oacc_loop *loop)
{
  dump_oacc_loop (stderr, loop, 0);
}
883 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
884 siblings. */
886 static void
887 inform_oacc_loop (const oacc_loop *loop)
889 const char *gang
890 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
891 const char *worker
892 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
893 const char *vector
894 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
895 const char *seq = loop->mask == 0 ? " seq" : "";
896 const dump_user_location_t loc
897 = dump_user_location_t::from_location_t (loop->loc);
898 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
899 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
900 vector, seq);
902 if (loop->child)
903 inform_oacc_loop (loop->child);
904 if (loop->sibling)
905 inform_oacc_loop (loop->sibling);
/* DFS walk of basic blocks BB onwards, creating OpenACC loop
   structures as we go.  By construction these loops are properly
   nested.  */

static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  /* MARKER counts the head/tail marker calls seen in the current
     sequence; REMAINING counts how many are still expected.  */
  int marker = 0;
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      /* A two-argument marker terminates a head or tail
		 sequence; longer ones carry the remaining count in
		 arg 2.  */
	      if (gimple_call_num_args (call) == 2)
		{
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      /* First marker of a sequence: a head starts a
			 new loop.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads fill outermost-first, tails
			 innermost-first.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  if (remaining || marker)
    {
      /* Mid-sequence: the markers must continue in the single
	 fallthrough successor block.  */
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
1013 /* LOOP is the first sibling. Reverse the order in place and return
1014 the new first sibling. Recurse to child loops. */
1016 static oacc_loop *
1017 oacc_loop_sibling_nreverse (oacc_loop *loop)
1019 oacc_loop *last = NULL;
1022 if (loop->child)
1023 loop->child = oacc_loop_sibling_nreverse (loop->child);
1025 oacc_loop *next = loop->sibling;
1026 loop->sibling = last;
1027 last = loop;
1028 loop = next;
1030 while (loop);
1032 return last;
1035 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1036 the current function. */
1038 static oacc_loop *
1039 oacc_loop_discovery ()
1041 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1042 in the following. */
1043 clear_bb_flags ();
1045 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1046 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1048 /* The siblings were constructed in reverse order, reverse them so
1049 that diagnostics come out in an unsurprising order. */
1050 top = oacc_loop_sibling_nreverse (top);
1052 return top;
/* Transform the abstract internal function markers starting at FROM
   to be for partitioning level LEVEL.  Stop when we meet another HEAD
   or TAIL marker.  */

static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* FORK/JOIN markers take the level in arg 2; a repeat of
	     our own marker kind ends the sequence.  */
	  if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  else if (k == kind && stmt != from)
	    break;
	}
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	/* Reductions take the level in arg 3.  */
	*gimple_call_arg_ptr (stmt, 3) = replacement;

      /* The sequence may span basic blocks; advance through
	 single-successor blocks.  */
      gsi_next (&gsi);
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
/* Process the discovered OpenACC loops, setting the correct
   partitioning level etc.  */

static void
oacc_loop_process (oacc_loop *loop)
{
  if (loop->child)
    oacc_loop_process (loop->child);

  /* Routines already carry their mask; only genuine partitioned loops
     need their abstraction calls and markers rewritten.  */
  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_GOACC_LOOP:
	    {
	      /* An arg-5 of -1 flags an element loop (tiling); it gets
		 the element mask rather than the loop mask.  */
	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
	      gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
	      if (!is_e)
		gimple_call_set_arg (call, 4, chunk_arg);
	    }
	    break;

	  case IFN_GOACC_TILE:
	    gimple_call_set_arg (call, 3, mask_arg);
	    gimple_call_set_arg (call, 4, e_mask_arg);
	    break;

	  default:
	    gcc_unreachable ();
	  }

      /* Assign the combined partitioning dimensions, outermost first,
	 to the head and tail sequences.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling);
}
1146 /* Walk the OpenACC loop heirarchy checking and assigning the
1147 programmer-specified partitionings. OUTER_MASK is the partitioning
1148 this loop is contained within. Return mask of partitioning
1149 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1150 bit. */
1152 static unsigned
1153 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1155 unsigned this_mask = loop->mask;
1156 unsigned mask_all = 0;
1157 bool noisy = true;
1159 #ifdef ACCEL_COMPILER
1160 /* When device_type is supported, we want the device compiler to be
1161 noisy, if the loop parameters are device_type-specific. */
1162 noisy = false;
1163 #endif
1165 if (!loop->routine)
1167 bool auto_par = (loop->flags & OLF_AUTO) != 0;
1168 bool seq_par = (loop->flags & OLF_SEQ) != 0;
1169 bool tiling = (loop->flags & OLF_TILE) != 0;
1171 this_mask = ((loop->flags >> OLF_DIM_BASE)
1172 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1174 /* Apply auto partitioning if this is a non-partitioned regular
1175 loop, or (no more than) single axis tiled loop. */
1176 bool maybe_auto
1177 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
1179 if ((this_mask != 0) + auto_par + seq_par > 1)
1181 if (noisy)
1182 error_at (loop->loc,
1183 seq_par
1184 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1185 : G_("%<auto%> conflicts with other OpenACC loop "
1186 "specifiers"));
1187 maybe_auto = false;
1188 loop->flags &= ~OLF_AUTO;
1189 if (seq_par)
1191 loop->flags
1192 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1193 this_mask = 0;
1197 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1199 loop->flags |= OLF_AUTO;
1200 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
1204 if (this_mask & outer_mask)
1206 const oacc_loop *outer;
1207 for (outer = loop->parent; outer; outer = outer->parent)
1208 if ((outer->mask | outer->e_mask) & this_mask)
1209 break;
1211 if (noisy)
1213 if (outer)
1215 error_at (loop->loc,
1216 loop->routine
1217 ? G_("routine call uses same OpenACC parallelism"
1218 " as containing loop")
1219 : G_("inner loop uses same OpenACC parallelism"
1220 " as containing loop"));
1221 inform (outer->loc, "containing loop here");
1223 else
1224 error_at (loop->loc,
1225 loop->routine
1226 ? G_("routine call uses OpenACC parallelism disallowed"
1227 " by containing routine")
1228 : G_("loop uses OpenACC parallelism disallowed"
1229 " by containing routine"));
1231 if (loop->routine)
1232 inform (DECL_SOURCE_LOCATION (loop->routine),
1233 "routine %qD declared here", loop->routine);
1235 this_mask &= ~outer_mask;
1237 else
1239 unsigned outermost = least_bit_hwi (this_mask);
1241 if (outermost && outermost <= outer_mask)
1243 if (noisy)
1245 error_at (loop->loc,
1246 "incorrectly nested OpenACC loop parallelism");
1248 const oacc_loop *outer;
1249 for (outer = loop->parent;
1250 outer->flags && outer->flags < outermost;
1251 outer = outer->parent)
1252 continue;
1253 inform (outer->loc, "containing loop here");
1256 this_mask &= ~outermost;
1260 mask_all |= this_mask;
1262 if (loop->flags & OLF_TILE)
1264 /* When tiling, vector goes to the element loop, and failing
1265 that we put worker there. The std doesn't contemplate
1266 specifying all three. We choose to put worker and vector on
1267 the element loops in that case. */
1268 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1269 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1270 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1272 loop->e_mask = this_e_mask;
1273 this_mask ^= this_e_mask;
1276 loop->mask = this_mask;
1278 if (dump_file)
1279 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1280 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1281 loop->mask, loop->e_mask);
1283 if (loop->child)
1285 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1286 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1287 mask_all |= loop->inner;
1290 if (loop->sibling)
1291 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1293 return mask_all;
1296 /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
1297 OUTER_MASK is the partitioning this loop is contained within.
1298 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1299 Return the cumulative partitioning used by this loop, siblings and
1300 children. */
1302 static unsigned
1303 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1304 bool outer_assign)
1306 bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1307 bool noisy = true;
1308 bool tiling = loop->flags & OLF_TILE;
1310 #ifdef ACCEL_COMPILER
1311 /* When device_type is supported, we want the device compiler to be
1312 noisy, if the loop parameters are device_type-specific. */
1313 noisy = false;
1314 #endif
1316 if (assign && (!outer_assign || loop->inner))
1318 /* Allocate outermost and non-innermost loops at the outermost
1319 non-innermost available level. */
1320 unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1322 /* Find the first outermost available partition. */
1323 while (this_mask <= outer_mask)
1324 this_mask <<= 1;
1326 /* Grab two axes if tiling, and we've not assigned anything */
1327 if (tiling && !(loop->mask | loop->e_mask))
1328 this_mask |= this_mask << 1;
1330 /* Prohibit the innermost partitioning at the moment. */
1331 this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1333 /* Don't use any dimension explicitly claimed by an inner loop. */
1334 this_mask &= ~loop->inner;
1336 if (tiling && !loop->e_mask)
1338 /* If we got two axes, allocate the inner one to the element
1339 loop. */
1340 loop->e_mask = this_mask & (this_mask << 1);
1341 this_mask ^= loop->e_mask;
1344 loop->mask |= this_mask;
1347 if (loop->child)
1349 unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1350 loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1351 outer_assign | assign);
1354 if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1356 /* Allocate the loop at the innermost available level. Note
1357 that we do this even if we already assigned this loop the
1358 outermost available level above. That way we'll partition
1359 this along 2 axes, if they are available. */
1360 unsigned this_mask = 0;
1362 /* Determine the outermost partitioning used within this loop. */
1363 this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1364 this_mask = least_bit_hwi (this_mask);
1366 /* Pick the partitioning just inside that one. */
1367 this_mask >>= 1;
1369 /* And avoid picking one use by an outer loop. */
1370 this_mask &= ~outer_mask;
1372 /* If tiling and we failed completely above, grab the next one
1373 too. Making sure it doesn't hit an outer loop. */
1374 if (tiling)
1376 this_mask &= ~(loop->e_mask | loop->mask);
1377 unsigned tile_mask = ((this_mask >> 1)
1378 & ~(outer_mask | loop->e_mask | loop->mask));
1380 if (tile_mask || loop->mask)
1382 loop->e_mask |= this_mask;
1383 this_mask = tile_mask;
1385 if (!loop->e_mask && noisy)
1386 warning_at (loop->loc, 0,
1387 "insufficient partitioning available"
1388 " to parallelize element loop");
1391 loop->mask |= this_mask;
1392 if (!loop->mask && noisy)
1393 warning_at (loop->loc, 0,
1394 tiling
1395 ? G_("insufficient partitioning available"
1396 " to parallelize tile loop")
1397 : G_("insufficient partitioning available"
1398 " to parallelize loop"));
1401 if (assign && dump_file)
1402 fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1403 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1404 loop->mask, loop->e_mask);
1406 unsigned inner_mask = 0;
1408 if (loop->sibling)
1409 inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1410 outer_mask, outer_assign);
1412 inner_mask |= loop->inner | loop->mask | loop->e_mask;
1414 return inner_mask;
1417 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1418 axes. Return mask of partitioning. */
1420 static unsigned
1421 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1423 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1425 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1427 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1428 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1430 return mask_all;
1433 /* Default fork/join early expander. Delete the function calls if
1434 there is no RTL expander. */
1436 bool
1437 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1438 const int *ARG_UNUSED (dims), bool is_fork)
1440 if (is_fork)
1441 return targetm.have_oacc_fork ();
1442 else
1443 return targetm.have_oacc_join ();
1446 /* Default goacc.reduction early expander.
1448 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1449 If RES_PTR is not integer-zerop:
1450 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1451 TEARDOWN - emit '*RES_PTR = VAR'
1452 If LHS is not NULL
1453 emit 'LHS = VAR' */
1455 void
1456 default_goacc_reduction (gcall *call)
1458 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1459 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1460 tree lhs = gimple_call_lhs (call);
1461 tree var = gimple_call_arg (call, 2);
1462 gimple_seq seq = NULL;
1464 if (code == IFN_GOACC_REDUCTION_SETUP
1465 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1467 /* Setup and Teardown need to copy from/to the receiver object,
1468 if there is one. */
1469 tree ref_to_res = gimple_call_arg (call, 1);
1471 if (!integer_zerop (ref_to_res))
1473 tree dst = build_simple_mem_ref (ref_to_res);
1474 tree src = var;
1476 if (code == IFN_GOACC_REDUCTION_SETUP)
1478 src = dst;
1479 dst = lhs;
1480 lhs = NULL;
1482 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1486 /* Copy VAR to LHS, if there is an LHS. */
1487 if (lhs)
1488 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1490 gsi_replace_with_seq (&gsi, seq, true);
1493 /* Main entry point for oacc transformations which run on the device
1494 compiler after LTO, so we know what the target device is at this
1495 point (including the host fallback). */
1497 static unsigned int
1498 execute_oacc_device_lower ()
1500 tree attrs = oacc_get_fn_attrib (current_function_decl);
1502 if (!attrs)
1503 /* Not an offloaded function. */
1504 return 0;
1506 /* Parse the default dim argument exactly once. */
1507 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1509 oacc_parse_default_dims (flag_openacc_dims);
1510 flag_openacc_dims = (char *)&flag_openacc_dims;
1513 bool is_oacc_kernels
1514 = (lookup_attribute ("oacc kernels",
1515 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1516 bool is_oacc_kernels_parallelized
1517 = (lookup_attribute ("oacc kernels parallelized",
1518 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1520 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
1521 kernels, so remove the parallelism dimensions function attributes
1522 potentially set earlier on. */
1523 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
1525 oacc_set_fn_attrib (current_function_decl, NULL, NULL);
1526 attrs = oacc_get_fn_attrib (current_function_decl);
1529 /* Discover, partition and process the loops. */
1530 oacc_loop *loops = oacc_loop_discovery ();
1531 int fn_level = oacc_fn_attrib_level (attrs);
1533 if (dump_file)
1535 if (fn_level >= 0)
1536 fprintf (dump_file, "Function is OpenACC routine level %d\n",
1537 fn_level);
1538 else if (is_oacc_kernels)
1539 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1540 (is_oacc_kernels_parallelized
1541 ? "parallelized" : "unparallelized"));
1542 else
1543 fprintf (dump_file, "Function is OpenACC parallel offload\n");
1546 unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
1547 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
1548 /* OpenACC kernels constructs are special: they currently don't use the
1549 generic oacc_loop infrastructure and attribute/dimension processing. */
1550 if (is_oacc_kernels && is_oacc_kernels_parallelized)
1552 /* Parallelized OpenACC kernels constructs use gang parallelism. See
1553 also tree-parloops.c:create_parallel_loop. */
1554 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
1557 int dims[GOMP_DIM_MAX];
1558 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
1560 if (dump_file)
1562 const char *comma = "Compute dimensions [";
1563 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
1564 fprintf (dump_file, "%s%d", comma, dims[ix]);
1565 fprintf (dump_file, "]\n");
1568 oacc_loop_process (loops);
1569 if (dump_file)
1571 fprintf (dump_file, "OpenACC loops\n");
1572 dump_oacc_loop (dump_file, loops, 0);
1573 fprintf (dump_file, "\n");
1575 if (dump_enabled_p ())
1577 oacc_loop *l = loops;
1578 /* OpenACC kernels constructs are special: they currently don't use the
1579 generic oacc_loop infrastructure. */
1580 if (is_oacc_kernels)
1582 /* Create a fake oacc_loop for diagnostic purposes. */
1583 l = new_oacc_loop_raw (NULL,
1584 DECL_SOURCE_LOCATION (current_function_decl));
1585 l->mask = used_mask;
1587 else
1589 /* Skip the outermost, dummy OpenACC loop */
1590 l = l->child;
1592 if (l)
1593 inform_oacc_loop (l);
1594 if (is_oacc_kernels)
1595 free_oacc_loop (l);
1598 /* Offloaded targets may introduce new basic blocks, which require
1599 dominance information to update SSA. */
1600 calculate_dominance_info (CDI_DOMINATORS);
1602 /* Now lower internal loop functions to target-specific code
1603 sequences. */
1604 basic_block bb;
1605 FOR_ALL_BB_FN (bb, cfun)
1606 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1608 gimple *stmt = gsi_stmt (gsi);
1609 if (!is_gimple_call (stmt))
1611 gsi_next (&gsi);
1612 continue;
1615 gcall *call = as_a <gcall *> (stmt);
1616 if (!gimple_call_internal_p (call))
1618 gsi_next (&gsi);
1619 continue;
1622 /* Rewind to allow rescan. */
1623 gsi_prev (&gsi);
1624 bool rescan = false, remove = false;
1625 enum internal_fn ifn_code = gimple_call_internal_fn (call);
1627 switch (ifn_code)
1629 default: break;
1631 case IFN_GOACC_TILE:
1632 oacc_xform_tile (call);
1633 rescan = true;
1634 break;
1636 case IFN_GOACC_LOOP:
1637 oacc_xform_loop (call);
1638 rescan = true;
1639 break;
1641 case IFN_GOACC_REDUCTION:
1642 /* Mark the function for SSA renaming. */
1643 mark_virtual_operands_for_renaming (cfun);
1645 /* If the level is -1, this ended up being an unused
1646 axis. Handle as a default. */
1647 if (integer_minus_onep (gimple_call_arg (call, 3)))
1648 default_goacc_reduction (call);
1649 else
1650 targetm.goacc.reduction (call);
1651 rescan = true;
1652 break;
1654 case IFN_UNIQUE:
1656 enum ifn_unique_kind kind
1657 = ((enum ifn_unique_kind)
1658 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1660 switch (kind)
1662 default:
1663 break;
1665 case IFN_UNIQUE_OACC_FORK:
1666 case IFN_UNIQUE_OACC_JOIN:
1667 if (integer_minus_onep (gimple_call_arg (call, 2)))
1668 remove = true;
1669 else if (!targetm.goacc.fork_join
1670 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
1671 remove = true;
1672 break;
1674 case IFN_UNIQUE_OACC_HEAD_MARK:
1675 case IFN_UNIQUE_OACC_TAIL_MARK:
1676 remove = true;
1677 break;
1679 break;
1683 if (gsi_end_p (gsi))
1684 /* We rewound past the beginning of the BB. */
1685 gsi = gsi_start_bb (bb);
1686 else
1687 /* Undo the rewind. */
1688 gsi_next (&gsi);
1690 if (remove)
1692 if (gimple_vdef (call))
1693 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
1694 if (gimple_call_lhs (call))
1696 /* Propagate the data dependency var. */
1697 gimple *ass = gimple_build_assign (gimple_call_lhs (call),
1698 gimple_call_arg (call, 1));
1699 gsi_replace (&gsi, ass, false);
1701 else
1702 gsi_remove (&gsi, true);
1704 else if (!rescan)
1705 /* If not rescanning, advance over the call. */
1706 gsi_next (&gsi);
1709 free_oacc_loop (loops);
1711 return 0;
1714 /* Default launch dimension validator. Force everything to 1. A
1715 backend that wants to provide larger dimensions must override this
1716 hook. */
1718 bool
1719 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1720 int ARG_UNUSED (fn_level))
1722 bool changed = false;
1724 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1726 if (dims[ix] != 1)
1728 dims[ix] = 1;
1729 changed = true;
1733 return changed;
/* Default dimension bound is unknown on accelerator and 1 on host.
   Returns 0 (unknown) when built as an accelerator compiler, else 1.  */

int
default_goacc_dim_limit (int ARG_UNUSED (axis))
{
#ifdef ACCEL_COMPILER
  return 0;
#else
  return 1;
#endif
}
1748 namespace {
1750 const pass_data pass_data_oacc_device_lower =
1752 GIMPLE_PASS, /* type */
1753 "oaccdevlow", /* name */
1754 OPTGROUP_OMP, /* optinfo_flags */
1755 TV_NONE, /* tv_id */
1756 PROP_cfg, /* properties_required */
1757 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
1758 0, /* properties_destroyed */
1759 0, /* todo_flags_start */
1760 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
1763 class pass_oacc_device_lower : public gimple_opt_pass
1765 public:
1766 pass_oacc_device_lower (gcc::context *ctxt)
1767 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
1770 /* opt_pass methods: */
1771 virtual bool gate (function *) { return flag_openacc; };
1773 virtual unsigned int execute (function *)
1775 return execute_oacc_device_lower ();
1778 }; // class pass_oacc_device_lower
1780 } // anon namespace
1782 gimple_opt_pass *
1783 make_pass_oacc_device_lower (gcc::context *ctxt)
1785 return new pass_oacc_device_lower (ctxt);
1789 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
1790 GOMP_SIMT_ENTER call identifying the privatized variables, which are
1791 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
1792 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
1794 static void
1795 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
1797 gimple *alloc_stmt = gsi_stmt (*gsi);
1798 tree simtrec = gimple_call_lhs (alloc_stmt);
1799 tree simduid = gimple_call_arg (alloc_stmt, 0);
1800 gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
1801 gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
1802 tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
1803 TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
1804 TREE_ADDRESSABLE (rectype) = 1;
1805 TREE_TYPE (simtrec) = build_pointer_type (rectype);
1806 for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
1808 tree *argp = gimple_call_arg_ptr (enter_stmt, i);
1809 if (*argp == null_pointer_node)
1810 continue;
1811 gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
1812 && VAR_P (TREE_OPERAND (*argp, 0)));
1813 tree var = TREE_OPERAND (*argp, 0);
1815 tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
1816 DECL_NAME (var), TREE_TYPE (var));
1817 SET_DECL_ALIGN (field, DECL_ALIGN (var));
1818 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
1819 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
1821 insert_field_into_struct (rectype, field);
1823 tree t = build_simple_mem_ref (simtrec);
1824 t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
1825 TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
1826 SET_DECL_VALUE_EXPR (var, t);
1827 DECL_HAS_VALUE_EXPR_P (var) = 1;
1828 *regimplify = true;
1830 layout_type (rectype);
1831 tree size = TYPE_SIZE_UNIT (rectype);
1832 tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
1834 alloc_stmt
1835 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
1836 gimple_call_set_lhs (alloc_stmt, simtrec);
1837 gsi_replace (gsi, alloc_stmt, false);
1838 gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
1839 enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
1840 gsi_replace (&enter_gsi, enter_stmt, false);
1842 use_operand_p use;
1843 gimple *exit_stmt;
1844 if (single_imm_use (simtrec, &use, &exit_stmt))
1846 gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
1847 gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
1848 tree clobber = build_constructor (rectype, NULL);
1849 TREE_THIS_VOLATILE (clobber) = 1;
1850 exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
1851 gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
1853 else
1854 gcc_checking_assert (has_zero_uses (simtrec));
1857 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
1859 static tree
1860 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
1862 tree t = *tp;
1864 if (VAR_P (t)
1865 && DECL_HAS_VALUE_EXPR_P (t)
1866 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
1868 *walk_subtrees = 0;
1869 return t;
1871 return NULL_TREE;
1874 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
1875 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
1876 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
1877 internal functions on non-SIMT targets, and likewise some SIMD internal
1878 functions on SIMT targets. */
1880 static unsigned int
1881 execute_omp_device_lower ()
1883 int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
1884 bool regimplify = false;
1885 basic_block bb;
1886 gimple_stmt_iterator gsi;
1887 FOR_EACH_BB_FN (bb, cfun)
1888 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1890 gimple *stmt = gsi_stmt (gsi);
1891 if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
1892 continue;
1893 tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
1894 tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
1895 switch (gimple_call_internal_fn (stmt))
1897 case IFN_GOMP_USE_SIMT:
1898 rhs = vf == 1 ? integer_zero_node : integer_one_node;
1899 break;
1900 case IFN_GOMP_SIMT_ENTER:
1901 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1902 goto simtreg_enter_exit;
1903 case IFN_GOMP_SIMT_ENTER_ALLOC:
1904 if (vf != 1)
1905 ompdevlow_adjust_simt_enter (&gsi, &regimplify);
1906 rhs = vf == 1 ? null_pointer_node : NULL_TREE;
1907 goto simtreg_enter_exit;
1908 case IFN_GOMP_SIMT_EXIT:
1909 simtreg_enter_exit:
1910 if (vf != 1)
1911 continue;
1912 unlink_stmt_vdef (stmt);
1913 break;
1914 case IFN_GOMP_SIMT_LANE:
1915 case IFN_GOMP_SIMT_LAST_LANE:
1916 rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
1917 break;
1918 case IFN_GOMP_SIMT_VF:
1919 rhs = build_int_cst (type, vf);
1920 break;
1921 case IFN_GOMP_SIMT_ORDERED_PRED:
1922 rhs = vf == 1 ? integer_zero_node : NULL_TREE;
1923 if (rhs || !lhs)
1924 unlink_stmt_vdef (stmt);
1925 break;
1926 case IFN_GOMP_SIMT_VOTE_ANY:
1927 case IFN_GOMP_SIMT_XCHG_BFLY:
1928 case IFN_GOMP_SIMT_XCHG_IDX:
1929 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1930 break;
1931 case IFN_GOMP_SIMD_LANE:
1932 case IFN_GOMP_SIMD_LAST_LANE:
1933 rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
1934 break;
1935 case IFN_GOMP_SIMD_VF:
1936 rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
1937 break;
1938 default:
1939 continue;
1941 if (lhs && !rhs)
1942 continue;
1943 stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
1944 gsi_replace (&gsi, stmt, false);
1946 if (regimplify)
1947 FOR_EACH_BB_REVERSE_FN (bb, cfun)
1948 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1949 if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
1951 if (gimple_clobber_p (gsi_stmt (gsi)))
1952 gsi_remove (&gsi, true);
1953 else
1954 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
1956 if (vf != 1)
1957 cfun->has_force_vectorize_loops = false;
1958 return 0;
1961 namespace {
1963 const pass_data pass_data_omp_device_lower =
1965 GIMPLE_PASS, /* type */
1966 "ompdevlow", /* name */
1967 OPTGROUP_OMP, /* optinfo_flags */
1968 TV_NONE, /* tv_id */
1969 PROP_cfg, /* properties_required */
1970 PROP_gimple_lomp_dev, /* properties_provided */
1971 0, /* properties_destroyed */
1972 0, /* todo_flags_start */
1973 TODO_update_ssa, /* todo_flags_finish */
1976 class pass_omp_device_lower : public gimple_opt_pass
1978 public:
1979 pass_omp_device_lower (gcc::context *ctxt)
1980 : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
1983 /* opt_pass methods: */
1984 virtual bool gate (function *fun)
1986 return !(fun->curr_properties & PROP_gimple_lomp_dev);
1988 virtual unsigned int execute (function *)
1990 return execute_omp_device_lower ();
1993 }; // class pass_expand_omp_ssa
1995 } // anon namespace
1997 gimple_opt_pass *
1998 make_pass_omp_device_lower (gcc::context *ctxt)
2000 return new pass_omp_device_lower (ctxt);
2003 /* "omp declare target link" handling pass. */
2005 namespace {
2007 const pass_data pass_data_omp_target_link =
2009 GIMPLE_PASS, /* type */
2010 "omptargetlink", /* name */
2011 OPTGROUP_OMP, /* optinfo_flags */
2012 TV_NONE, /* tv_id */
2013 PROP_ssa, /* properties_required */
2014 0, /* properties_provided */
2015 0, /* properties_destroyed */
2016 0, /* todo_flags_start */
2017 TODO_update_ssa, /* todo_flags_finish */
2020 class pass_omp_target_link : public gimple_opt_pass
2022 public:
2023 pass_omp_target_link (gcc::context *ctxt)
2024 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
2027 /* opt_pass methods: */
2028 virtual bool gate (function *fun)
2030 #ifdef ACCEL_COMPILER
2031 return offloading_function_p (fun->decl);
2032 #else
2033 (void) fun;
2034 return false;
2035 #endif
2038 virtual unsigned execute (function *);
2041 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2043 static tree
2044 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2046 tree t = *tp;
2048 if (VAR_P (t)
2049 && DECL_HAS_VALUE_EXPR_P (t)
2050 && is_global_var (t)
2051 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2053 *walk_subtrees = 0;
2054 return t;
2057 return NULL_TREE;
2060 unsigned
2061 pass_omp_target_link::execute (function *fun)
2063 basic_block bb;
2064 FOR_EACH_BB_FN (bb, fun)
2066 gimple_stmt_iterator gsi;
2067 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2068 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2069 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2072 return 0;
2075 } // anon namespace
2077 gimple_opt_pass *
2078 make_pass_omp_target_link (gcc::context *ctxt)
2080 return new pass_omp_target_link (ctxt);