gcc/omp-offload.c (official-gcc.git)
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
4 Copyright (C) 2005-2020 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
56 /* Describe the OpenACC looping structure of a function. The entire
57 function is held in a 'NULL' loop. */
59 struct oacc_loop
61 oacc_loop *parent; /* Containing loop. */
63 oacc_loop *child; /* First inner loop. */
65 oacc_loop *sibling; /* Next loop within same parent. */
67 location_t loc; /* Location of the loop start. */
69 gcall *marker; /* Initial head marker. */
71 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
72 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
74 tree routine; /* Pseudo-loop enclosing a routine. */
76 unsigned mask; /* Partitioning mask. */
77 unsigned e_mask; /* Partitioning of element loops (when tiling). */
78 unsigned inner; /* Partitioning of inner loops. */
79 unsigned flags; /* Partitioning flags. */
80 vec<gcall *> ifns; /* Contained loop abstraction functions. */
81 tree chunk_size; /* Chunk size. */
82 gcall *head_end; /* Final marker of head sequence. */
85 /* Holds offload tables with decls. */
86 vec<tree, va_gc> *offload_funcs, *offload_vars;
88 /* Return the level at which an OpenACC routine may spawn a partitioned loop, or
89 -1 if it is not a routine (i.e. is an offload fn). */
91 int
92 oacc_fn_attrib_level (tree attr)
94 tree pos = TREE_VALUE (attr);
96 if (!TREE_PURPOSE (pos))
97 return -1;
99 int ix = 0;
100 for (ix = 0; ix != GOMP_DIM_MAX;
101 ix++, pos = TREE_CHAIN (pos))
102 if (!integer_zerop (TREE_PURPOSE (pos)))
103 break;
105 return ix;
108 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
109 adds their addresses and sizes to constructor-vector V_CTOR. */
111 static void
112 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
113 vec<constructor_elt, va_gc> *v_ctor)
115 unsigned len = vec_safe_length (v_decls);
116 for (unsigned i = 0; i < len; i++)
118 tree it = (*v_decls)[i];
119 bool is_var = VAR_P (it);
120 bool is_link_var
121 = is_var
122 #ifdef ACCEL_COMPILER
123 && DECL_HAS_VALUE_EXPR_P (it)
124 #endif
125 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
127 tree size = NULL_TREE;
128 if (is_var)
129 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
131 tree addr;
132 if (!is_link_var)
133 addr = build_fold_addr_expr (it);
134 else
136 #ifdef ACCEL_COMPILER
137 /* For "omp declare target link" vars add address of the pointer to
138 the target table, instead of address of the var. */
139 tree value_expr = DECL_VALUE_EXPR (it);
140 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
141 varpool_node::finalize_decl (link_ptr_decl);
142 addr = build_fold_addr_expr (link_ptr_decl);
143 #else
144 addr = build_fold_addr_expr (it);
145 #endif
147 /* Most significant bit of the size marks "omp declare target link"
148 vars in host and target tables. */
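/* Illustrative note (not in the original source): on a typical 64-bit
   target const_ptr_type_node is 8 bytes, so the marker is bit 63; a link
   variable of size 16 is therefore recorded as 16 | (1ULL << 63).  */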
149 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
150 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
151 * BITS_PER_UNIT - 1);
152 size = wide_int_to_tree (const_ptr_type_node, isize);
155 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
156 if (is_var)
157 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
161 /* Create new symbols containing (address, size) pairs for global variables
162 marked with the "omp declare target" attribute, as well as addresses of
163 functions that are outlined offloading regions. */
164 void
165 omp_finish_file (void)
167 unsigned num_funcs = vec_safe_length (offload_funcs);
168 unsigned num_vars = vec_safe_length (offload_vars);
170 if (num_funcs == 0 && num_vars == 0)
171 return;
173 if (targetm_common.have_named_sections)
175 vec<constructor_elt, va_gc> *v_f, *v_v;
176 vec_alloc (v_f, num_funcs);
177 vec_alloc (v_v, num_vars * 2);
179 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
180 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
182 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
183 num_vars * 2);
184 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
185 num_funcs);
186 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
187 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
188 tree ctor_v = build_constructor (vars_decl_type, v_v);
189 tree ctor_f = build_constructor (funcs_decl_type, v_f);
190 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
191 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
192 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
193 get_identifier (".offload_func_table"),
194 funcs_decl_type);
195 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
196 get_identifier (".offload_var_table"),
197 vars_decl_type);
198 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
199 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
200 otherwise a joint table in a binary will contain padding between
201 tables from multiple object files. */
202 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
203 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
204 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
205 DECL_INITIAL (funcs_decl) = ctor_f;
206 DECL_INITIAL (vars_decl) = ctor_v;
207 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
208 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
210 varpool_node::finalize_decl (vars_decl);
211 varpool_node::finalize_decl (funcs_decl);
213 else
215 for (unsigned i = 0; i < num_funcs; i++)
217 tree it = (*offload_funcs)[i];
218 targetm.record_offload_symbol (it);
220 for (unsigned i = 0; i < num_vars; i++)
222 tree it = (*offload_vars)[i];
223 #ifdef ACCEL_COMPILER
224 if (DECL_HAS_VALUE_EXPR_P (it)
225 && lookup_attribute ("omp declare target link",
226 DECL_ATTRIBUTES (it)))
228 tree value_expr = DECL_VALUE_EXPR (it);
229 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
230 targetm.record_offload_symbol (link_ptr_decl);
231 varpool_node::finalize_decl (link_ptr_decl);
233 else
234 #endif
235 targetm.record_offload_symbol (it);
240 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
241 axis DIM. Return a tmp var holding the result. */
243 static tree
244 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
246 tree arg = build_int_cst (unsigned_type_node, dim);
247 tree size = create_tmp_var (integer_type_node);
248 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
249 gimple *call = gimple_build_call_internal (fn, 1, arg);
251 gimple_call_set_lhs (call, size);
252 gimple_seq_add_stmt (seq, call);
254 return size;
257 /* Find the number of threads (POS = false), or thread number (POS =
258 true) for an OpenACC region partitioned as MASK. Setup code
259 required for the calculation is added to SEQ. */
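/* Worked example (illustrative, not from the original source): for a MASK
   covering the gang and vector axes, POS=false yields
   num_gangs * vector_length, while POS=true yields
   gang_pos * vector_length + vector_pos, i.e. the thread's linearised
   position across the active dimensions.  */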
261 static tree
262 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
264 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
265 unsigned ix;
267 /* Start at gang level, and examine relevant dimension indices. */
268 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
269 if (GOMP_DIM_MASK (ix) & mask)
271 if (res)
273 /* We had an outer index, so scale that by the size of
274 this dimension. */
275 tree n = oacc_dim_call (false, ix, seq);
276 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
278 if (pos)
280 /* Determine index in this dimension. */
281 tree id = oacc_dim_call (true, ix, seq);
282 if (res)
283 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
284 else
285 res = id;
289 if (res == NULL_TREE)
290 res = integer_zero_node;
292 return res;
295 /* Transform IFN_GOACC_LOOP calls to actual code. See
296 expand_oacc_for for where these are generated. At the vector
297 level, we stride loops, such that each member of a warp will
298 operate on adjacent iterations. At the worker and gang level,
299 each gang/warp executes a set of contiguous iterations. Chunking
300 can override this such that each iteration engine executes a
301 contiguous chunk, and then moves on to stride to the next chunk. */
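/* Illustration (not part of the original source): with a vector length of 4
   and no chunking, lane 0 executes iterations 0, 4, 8, ... and lane 1
   executes 1, 5, 9, ... (striding).  With a chunk size of 2, lane 0 instead
   executes 0, 1, 8, 9, ... - a contiguous chunk, then a stride to the next
   chunk.  */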
303 static void
304 oacc_xform_loop (gcall *call)
306 gimple_stmt_iterator gsi = gsi_for_stmt (call);
307 enum ifn_goacc_loop_kind code
308 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
309 tree dir = gimple_call_arg (call, 1);
310 tree range = gimple_call_arg (call, 2);
311 tree step = gimple_call_arg (call, 3);
312 tree chunk_size = NULL_TREE;
313 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
314 tree lhs = gimple_call_lhs (call);
315 tree type = NULL_TREE;
316 tree diff_type = TREE_TYPE (range);
317 tree r = NULL_TREE;
318 gimple_seq seq = NULL;
319 bool chunking = false, striding = true;
320 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
321 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
323 /* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
324 if (!lhs)
326 gsi_replace_with_seq (&gsi, seq, true);
327 return;
330 type = TREE_TYPE (lhs);
332 #ifdef ACCEL_COMPILER
333 chunk_size = gimple_call_arg (call, 4);
334 if (integer_minus_onep (chunk_size) /* Force static allocation. */
335 || integer_zerop (chunk_size)) /* Default (also static). */
337 /* If we're at the gang level, we want each to execute a
338 contiguous run of iterations. Otherwise we want each element
339 to stride. */
340 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
341 chunking = false;
343 else
345 /* Chunk of size 1 is striding. */
346 striding = integer_onep (chunk_size);
347 chunking = !striding;
349 #endif
351 /* striding=true, chunking=true
352 -> invalid.
353 striding=true, chunking=false
354 -> chunks=1
355 striding=false,chunking=true
356 -> chunks=ceil (range/(chunksize*threads*step))
357 striding=false,chunking=false
358 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
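/* Numeric illustration (assumed values): for an upward loop of 100
   iterations with step 1, 8 threads and a chunk size of 2, the CHUNKS case
   below computes (100 - 1 + 16) / 16 = 7 chunks, i.e.
   ceil (range / (chunk_size * threads * step)).  */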
359 push_gimplify_context (true);
361 switch (code)
363 default: gcc_unreachable ();
365 case IFN_GOACC_LOOP_CHUNKS:
366 if (!chunking)
367 r = build_int_cst (type, 1);
368 else
370 /* chunk_max
371 = (range - dir) / (chunks * step * num_threads) + dir */
372 tree per = oacc_thread_numbers (false, mask, &seq);
373 per = fold_convert (type, per);
374 chunk_size = fold_convert (type, chunk_size);
375 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
376 per = fold_build2 (MULT_EXPR, type, per, step);
377 r = build2 (MINUS_EXPR, type, range, dir);
378 r = build2 (PLUS_EXPR, type, r, per);
379 r = build2 (TRUNC_DIV_EXPR, type, r, per);
381 break;
383 case IFN_GOACC_LOOP_STEP:
385 /* If striding, step by the entire compute volume, otherwise
386 step by the inner volume. */
387 unsigned volume = striding ? mask : inner_mask;
389 r = oacc_thread_numbers (false, volume, &seq);
390 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
392 break;
394 case IFN_GOACC_LOOP_OFFSET:
395 /* Enable vectorization on non-SIMT targets. */
396 if (!targetm.simt.vf
397 && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
398 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
399 the loop. */
400 && (flag_tree_loop_vectorize
401 || !global_options_set.x_flag_tree_loop_vectorize))
403 basic_block bb = gsi_bb (gsi);
404 class loop *parent = bb->loop_father;
405 class loop *body = parent->inner;
407 parent->force_vectorize = true;
408 parent->safelen = INT_MAX;
410 /* "Chunking loops" may have inner loops. */
411 if (parent->inner)
413 body->force_vectorize = true;
414 body->safelen = INT_MAX;
417 cfun->has_force_vectorize_loops = true;
419 if (striding)
421 r = oacc_thread_numbers (true, mask, &seq);
422 r = fold_convert (diff_type, r);
424 else
426 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
427 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
428 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
429 inner_size, outer_size);
431 volume = fold_convert (diff_type, volume);
432 if (chunking)
433 chunk_size = fold_convert (diff_type, chunk_size);
434 else
436 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
438 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
439 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
440 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
443 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
444 fold_convert (diff_type, inner_size));
445 r = oacc_thread_numbers (true, outer_mask, &seq);
446 r = fold_convert (diff_type, r);
447 r = build2 (MULT_EXPR, diff_type, r, span);
449 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
450 inner = fold_convert (diff_type, inner);
451 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
453 if (chunking)
455 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
456 tree per
457 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
458 per = build2 (MULT_EXPR, diff_type, per, chunk);
460 r = build2 (PLUS_EXPR, diff_type, r, per);
463 r = fold_build2 (MULT_EXPR, diff_type, r, step);
464 if (type != diff_type)
465 r = fold_convert (type, r);
466 break;
468 case IFN_GOACC_LOOP_BOUND:
469 if (striding)
470 r = range;
471 else
473 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
474 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
475 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
476 inner_size, outer_size);
478 volume = fold_convert (diff_type, volume);
479 if (chunking)
480 chunk_size = fold_convert (diff_type, chunk_size);
481 else
483 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
485 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
486 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
487 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
490 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
491 fold_convert (diff_type, inner_size));
493 r = fold_build2 (MULT_EXPR, diff_type, span, step);
495 tree offset = gimple_call_arg (call, 6);
496 r = build2 (PLUS_EXPR, diff_type, r,
497 fold_convert (diff_type, offset));
498 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
499 diff_type, r, range);
501 if (diff_type != type)
502 r = fold_convert (type, r);
503 break;
506 gimplify_assign (lhs, r, &seq);
508 pop_gimplify_context (NULL);
510 gsi_replace_with_seq (&gsi, seq, true);
513 /* Transform a GOACC_TILE call. Determines the element loop span for
514 the specified loop of the nest. This is 1 if we're not tiling.
516 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
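/* Example (illustrative): for a vector-partitioned element loop with an
   automatic tile size (tile_arg of 0), the innermost tiled loop
   (loop_no == collapse_count - 1) gets a span equal to the vector length,
   so each tile covers one vector's worth of iterations.  */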
518 static void
519 oacc_xform_tile (gcall *call)
521 gimple_stmt_iterator gsi = gsi_for_stmt (call);
522 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
523 /* Inner loops have higher loop_nos. */
524 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
525 tree tile_size = gimple_call_arg (call, 2);
526 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
527 tree lhs = gimple_call_lhs (call);
528 tree type = TREE_TYPE (lhs);
529 gimple_seq seq = NULL;
530 tree span = build_int_cst (type, 1);
532 gcc_assert (!(e_mask
533 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
534 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
535 push_gimplify_context (!seen_error ());
537 #ifndef ACCEL_COMPILER
538 /* Partitioning disabled on host compilers. */
539 e_mask = 0;
540 #endif
541 if (!e_mask)
542 /* Not partitioning. */
543 span = integer_one_node;
544 else if (!integer_zerop (tile_size))
545 /* User explicitly specified size. */
546 span = tile_size;
547 else
549 /* Pick a size based on the partitioning of the element loop and
550 the number of loop nests. */
551 tree first_size = NULL_TREE;
552 tree second_size = NULL_TREE;
554 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
555 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
556 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
557 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
559 if (!first_size)
561 first_size = second_size;
562 second_size = NULL_TREE;
565 if (loop_no + 1 == collapse)
567 span = first_size;
568 if (!loop_no && second_size)
569 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
570 span, second_size);
572 else if (loop_no + 2 == collapse)
573 span = second_size;
574 else
575 span = NULL_TREE;
577 if (!span)
578 /* There's no obvious element size for this loop. Options
579 are 1, first_size or some non-unity constant (32 is my
580 favourite). We should gather some statistics. */
581 span = first_size;
584 span = fold_convert (type, span);
585 gimplify_assign (lhs, span, &seq);
587 pop_gimplify_context (NULL);
589 gsi_replace_with_seq (&gsi, seq, true);
592 /* Default partitioned and minimum partitioned dimensions. */
594 static int oacc_default_dims[GOMP_DIM_MAX];
595 static int oacc_min_dims[GOMP_DIM_MAX];
598 oacc_get_default_dim (int dim)
600 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
601 return oacc_default_dims[dim];
605 oacc_get_min_dim (int dim)
607 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
608 return oacc_min_dims[dim];
611 /* Parse the default dimension parameter. This is a set of
612 :-separated optional compute dimensions. Each specified dimension
613 is a positive integer. When device type support is added, it is
614 planned to be a comma separated list of such compute dimensions,
615 with all but the first prefixed by the colon-terminated device
616 type. */
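/* For example (illustrative values): -fopenacc-dim=32:4:128 requests 32
   gangs, 4 workers and a vector length of 128, while -fopenacc-dim=::128
   overrides only the vector length and leaves the other two dimensions at
   their target defaults.  */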
618 static void
619 oacc_parse_default_dims (const char *dims)
621 int ix;
623 for (ix = GOMP_DIM_MAX; ix--;)
625 oacc_default_dims[ix] = -1;
626 oacc_min_dims[ix] = 1;
629 #ifndef ACCEL_COMPILER
630 /* Cannot be overridden on the host. */
631 dims = NULL;
632 #endif
633 if (dims)
635 const char *pos = dims;
637 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
639 if (ix)
641 if (*pos != ':')
642 goto malformed;
643 pos++;
646 if (*pos != ':')
648 long val;
649 const char *eptr;
651 errno = 0;
652 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
653 if (errno || val <= 0 || (int) val != val)
654 goto malformed;
655 pos = eptr;
656 oacc_default_dims[ix] = (int) val;
659 if (*pos)
661 malformed:
662 error_at (UNKNOWN_LOCATION,
663 "%<-fopenacc-dim%> operand is malformed at %qs", pos);
667 /* Allow the backend to validate the dimensions. */
668 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
669 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
672 /* Validate and update the dimensions for offloaded FN. ATTRS is the
673 raw attribute. DIMS is an array of dimensions, which is filled in.
674 LEVEL is the partitioning level of a routine, or -1 for an offload
675 region itself. USED is the mask of partitioned execution in the
676 function. */
678 static void
679 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
681 tree purpose[GOMP_DIM_MAX];
682 unsigned ix;
683 tree pos = TREE_VALUE (attrs);
685 /* Make sure the attribute creator attached the dimension
686 information. */
687 gcc_assert (pos);
689 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
691 purpose[ix] = TREE_PURPOSE (pos);
692 tree val = TREE_VALUE (pos);
693 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
694 pos = TREE_CHAIN (pos);
697 bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
699 /* Default anything left to 1 or a partitioned default. */
700 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
701 if (dims[ix] < 0)
703 /* The OpenACC spec says 'If the [num_gangs] clause is not
704 specified, an implementation-defined default will be used;
705 the default may depend on the code within the construct.'
706 (2.5.6). Thus an implementation is free to choose
707 non-unity default for a parallel region that doesn't have
708 any gang-partitioned loops. However, it appears that there
709 is a sufficient body of user code that expects non-gang
710 partitioned regions to not execute in gang-redundant mode.
711 So we (a) don't warn about the non-portability and (b) pick
712 the minimum permissible dimension size when there is no
713 partitioned execution. Otherwise we pick the global
714 default for the dimension, which the user can control. The
715 same wording and logic applies to num_workers and
716 vector_length, however the worker- or vector- single
717 execution doesn't have the same impact as gang-redundant
718 execution. (If the minimum gang-level partitioning is not 1,
719 the target is probably too confusing.) */
720 dims[ix] = (used & GOMP_DIM_MASK (ix)
721 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
722 changed = true;
725 if (changed)
727 /* Replace the attribute with new values. */
728 pos = NULL_TREE;
729 for (ix = GOMP_DIM_MAX; ix--;)
730 pos = tree_cons (purpose[ix],
731 build_int_cst (integer_type_node, dims[ix]), pos);
732 oacc_replace_fn_attrib (fn, pos);
736 /* Create an empty OpenACC loop structure at LOC. */
738 static oacc_loop *
739 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
741 oacc_loop *loop = XCNEW (oacc_loop);
743 loop->parent = parent;
745 if (parent)
747 loop->sibling = parent->child;
748 parent->child = loop;
751 loop->loc = loc;
752 return loop;
755 /* Create an outermost, dummy OpenACC loop for offloaded function
756 DECL. */
758 static oacc_loop *
759 new_oacc_loop_outer (tree decl)
761 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
764 /* Start a new OpenACC loop structure beginning at head marker MARKER.
765 Link into PARENT loop. Return the new loop. */
767 static oacc_loop *
768 new_oacc_loop (oacc_loop *parent, gcall *marker)
770 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
772 loop->marker = marker;
774 /* TODO: This is where device_type flattening would occur for the loop
775 flags. */
777 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
779 tree chunk_size = integer_zero_node;
780 if (loop->flags & OLF_GANG_STATIC)
781 chunk_size = gimple_call_arg (marker, 4);
782 loop->chunk_size = chunk_size;
784 return loop;
787 /* Create a dummy loop encompassing a call to an OpenACC routine.
788 Extract the routine's partitioning requirements. */
790 static void
791 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
793 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
794 int level = oacc_fn_attrib_level (attrs);
796 gcc_assert (level >= 0);
798 loop->marker = call;
799 loop->routine = decl;
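/* The routine may use every dimension from LEVEL inwards, so mark all of
   those as consumed by this pseudo-loop.  */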
800 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
801 ^ (GOMP_DIM_MASK (level) - 1));
804 /* Finish off the current OpenACC loop ending at tail marker TAIL.
805 Return the parent loop. */
807 static oacc_loop *
808 finish_oacc_loop (oacc_loop *loop)
810 /* If the loop has been collapsed, don't partition it. */
811 if (loop->ifns.is_empty ())
812 loop->mask = loop->flags = 0;
813 return loop->parent;
816 /* Free all OpenACC loop structures within LOOP (inclusive). */
818 static void
819 free_oacc_loop (oacc_loop *loop)
821 if (loop->sibling)
822 free_oacc_loop (loop->sibling);
823 if (loop->child)
824 free_oacc_loop (loop->child);
826 loop->ifns.release ();
827 free (loop);
830 /* Dump out the OpenACC loop head or tail beginning at FROM. */
832 static void
833 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
834 const char *title, int level)
836 enum ifn_unique_kind kind
837 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
839 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
840 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
842 gimple *stmt = gsi_stmt (gsi);
844 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
846 enum ifn_unique_kind k
847 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
848 (gimple_call_arg (stmt, 0)));
850 if (k == kind && stmt != from)
851 break;
853 print_gimple_stmt (file, stmt, depth * 2 + 2);
855 gsi_next (&gsi);
856 while (gsi_end_p (gsi))
857 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
861 /* Dump OpenACC loop LOOP, its children, and its siblings. */
863 static void
864 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
866 int ix;
868 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
869 loop->flags, loop->mask,
870 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
872 if (loop->marker)
873 print_gimple_stmt (file, loop->marker, depth * 2);
875 if (loop->routine)
876 fprintf (file, "%*sRoutine %s:%u:%s\n",
877 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
878 DECL_SOURCE_LINE (loop->routine),
879 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
881 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
882 if (loop->heads[ix])
883 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
884 for (ix = GOMP_DIM_MAX; ix--;)
885 if (loop->tails[ix])
886 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
888 if (loop->child)
889 dump_oacc_loop (file, loop->child, depth + 1);
890 if (loop->sibling)
891 dump_oacc_loop (file, loop->sibling, depth);
894 void debug_oacc_loop (oacc_loop *);
896 /* Dump loops to stderr. */
898 DEBUG_FUNCTION void
899 debug_oacc_loop (oacc_loop *loop)
901 dump_oacc_loop (stderr, loop, 0);
904 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
905 siblings. */
907 static void
908 inform_oacc_loop (const oacc_loop *loop)
910 const char *gang
911 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
912 const char *worker
913 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
914 const char *vector
915 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
916 const char *seq = loop->mask == 0 ? " seq" : "";
917 const dump_user_location_t loc
918 = dump_user_location_t::from_location_t (loop->loc);
919 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
920 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
921 vector, seq);
923 if (loop->child)
924 inform_oacc_loop (loop->child);
925 if (loop->sibling)
926 inform_oacc_loop (loop->sibling);
929 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
930 structures as we go. By construction these loops are properly
931 nested. */
933 static void
934 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
936 int marker = 0;
937 int remaining = 0;
939 if (bb->flags & BB_VISITED)
940 return;
942 follow:
943 bb->flags |= BB_VISITED;
945 /* Scan for loop markers. */
946 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
947 gsi_next (&gsi))
949 gimple *stmt = gsi_stmt (gsi);
951 if (!is_gimple_call (stmt))
952 continue;
954 gcall *call = as_a <gcall *> (stmt);
956 /* If this is a routine, make a dummy loop for it. */
957 if (tree decl = gimple_call_fndecl (call))
958 if (tree attrs = oacc_get_fn_attrib (decl))
960 gcc_assert (!marker);
961 new_oacc_loop_routine (loop, call, decl, attrs);
964 if (!gimple_call_internal_p (call))
965 continue;
967 switch (gimple_call_internal_fn (call))
969 default:
970 break;
972 case IFN_GOACC_LOOP:
973 case IFN_GOACC_TILE:
974 /* Record the abstraction function, so we can manipulate it
975 later. */
976 loop->ifns.safe_push (call);
977 break;
979 case IFN_UNIQUE:
980 enum ifn_unique_kind kind
981 = (enum ifn_unique_kind) (TREE_INT_CST_LOW
982 (gimple_call_arg (call, 0)));
983 if (kind == IFN_UNIQUE_OACC_HEAD_MARK
984 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
986 if (gimple_call_num_args (call) == 2)
988 gcc_assert (marker && !remaining);
989 marker = 0;
990 if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
991 loop = finish_oacc_loop (loop);
992 else
993 loop->head_end = call;
995 else
997 int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
999 if (!marker)
1001 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1002 loop = new_oacc_loop (loop, call);
1003 remaining = count;
1005 gcc_assert (count == remaining);
1006 if (remaining)
1008 remaining--;
1009 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1010 loop->heads[marker] = call;
1011 else
1012 loop->tails[remaining] = call;
1014 marker++;
1019 if (remaining || marker)
1021 bb = single_succ (bb);
1022 gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
1023 goto follow;
1026 /* Walk successor blocks. */
1027 edge e;
1028 edge_iterator ei;
1030 FOR_EACH_EDGE (e, ei, bb->succs)
1031 oacc_loop_discover_walk (loop, e->dest);
1034 /* LOOP is the first sibling. Reverse the order in place and return
1035 the new first sibling. Recurse to child loops. */
1037 static oacc_loop *
1038 oacc_loop_sibling_nreverse (oacc_loop *loop)
1040 oacc_loop *last = NULL;
1043 if (loop->child)
1044 loop->child = oacc_loop_sibling_nreverse (loop->child);
1046 oacc_loop *next = loop->sibling;
1047 loop->sibling = last;
1048 last = loop;
1049 loop = next;
1051 while (loop);
1053 return last;
1056 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1057 the current function. */
1059 static oacc_loop *
1060 oacc_loop_discovery ()
1062 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1063 in the following. */
1064 clear_bb_flags ();
1066 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1067 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1069 /* The siblings were constructed in reverse order, reverse them so
1070 that diagnostics come out in an unsurprising order. */
1071 top = oacc_loop_sibling_nreverse (top);
1073 return top;
1076 /* Transform the abstract internal function markers starting at FROM
1077 to be for partitioning level LEVEL. Stop when we meet another HEAD
1078 or TAIL marker. */
1080 static void
1081 oacc_loop_xform_head_tail (gcall *from, int level)
1083 enum ifn_unique_kind kind
1084 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1085 tree replacement = build_int_cst (unsigned_type_node, level);
1087 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1089 gimple *stmt = gsi_stmt (gsi);
1091 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1093 enum ifn_unique_kind k
1094 = ((enum ifn_unique_kind)
1095 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1097 if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
1098 *gimple_call_arg_ptr (stmt, 2) = replacement;
1099 else if (k == kind && stmt != from)
1100 break;
1102 else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1103 *gimple_call_arg_ptr (stmt, 3) = replacement;
1105 gsi_next (&gsi);
1106 while (gsi_end_p (gsi))
1107 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1111 /* Process the discovered OpenACC loops, setting the correct
1112 partitioning level etc. */
1114 static void
1115 oacc_loop_process (oacc_loop *loop)
1117 if (loop->child)
1118 oacc_loop_process (loop->child);
1120 if (loop->mask && !loop->routine)
1122 int ix;
1123 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1124 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1125 tree chunk_arg = loop->chunk_size;
1126 gcall *call;
1128 for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1129 switch (gimple_call_internal_fn (call))
1131 case IFN_GOACC_LOOP:
1133 bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1134 gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1135 if (!is_e)
1136 gimple_call_set_arg (call, 4, chunk_arg);
1138 break;
1140 case IFN_GOACC_TILE:
1141 gimple_call_set_arg (call, 3, mask_arg);
1142 gimple_call_set_arg (call, 4, e_mask_arg);
1143 break;
1145 default:
1146 gcc_unreachable ();
1149 unsigned dim = GOMP_DIM_GANG;
1150 unsigned mask = loop->mask | loop->e_mask;
1151 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1153 while (!(GOMP_DIM_MASK (dim) & mask))
1154 dim++;
1156 oacc_loop_xform_head_tail (loop->heads[ix], dim);
1157 oacc_loop_xform_head_tail (loop->tails[ix], dim);
1159 mask ^= GOMP_DIM_MASK (dim);
1163 if (loop->sibling)
1164 oacc_loop_process (loop->sibling);
1167 /* Walk the OpenACC loop hierarchy checking and assigning the
1168 programmer-specified partitionings. OUTER_MASK is the partitioning
1169 this loop is contained within. Return mask of partitioning
1170 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1171 bit. */
1173 static unsigned
1174 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1176 unsigned this_mask = loop->mask;
1177 unsigned mask_all = 0;
1178 bool noisy = true;
1180 #ifdef ACCEL_COMPILER
1181 /* When device_type is supported, we want the device compiler to be
1182 noisy, if the loop parameters are device_type-specific. */
1183 noisy = false;
1184 #endif
1186 if (!loop->routine)
1188 bool auto_par = (loop->flags & OLF_AUTO) != 0;
1189 bool seq_par = (loop->flags & OLF_SEQ) != 0;
1190 bool tiling = (loop->flags & OLF_TILE) != 0;
1192 this_mask = ((loop->flags >> OLF_DIM_BASE)
1193 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1195 /* Apply auto partitioning if this is a non-partitioned regular
1196 loop, or (no more than) single axis tiled loop. */
1197 bool maybe_auto
1198 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
1200 if ((this_mask != 0) + auto_par + seq_par > 1)
1202 if (noisy)
1203 error_at (loop->loc,
1204 seq_par
1205 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1206 : G_("%<auto%> conflicts with other OpenACC loop "
1207 "specifiers"));
1208 maybe_auto = false;
1209 loop->flags &= ~OLF_AUTO;
1210 if (seq_par)
1212 loop->flags
1213 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1214 this_mask = 0;
1218 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1220 loop->flags |= OLF_AUTO;
1221 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
1225 if (this_mask & outer_mask)
1227 const oacc_loop *outer;
1228 for (outer = loop->parent; outer; outer = outer->parent)
1229 if ((outer->mask | outer->e_mask) & this_mask)
1230 break;
1232 if (noisy)
1234 if (outer)
1236 error_at (loop->loc,
1237 loop->routine
1238 ? G_("routine call uses same OpenACC parallelism"
1239 " as containing loop")
1240 : G_("inner loop uses same OpenACC parallelism"
1241 " as containing loop"));
1242 inform (outer->loc, "containing loop here");
1244 else
1245 error_at (loop->loc,
1246 loop->routine
1247 ? G_("routine call uses OpenACC parallelism disallowed"
1248 " by containing routine")
1249 : G_("loop uses OpenACC parallelism disallowed"
1250 " by containing routine"));
1252 if (loop->routine)
1253 inform (DECL_SOURCE_LOCATION (loop->routine),
1254 "routine %qD declared here", loop->routine);
1256 this_mask &= ~outer_mask;
1258 else
1260 unsigned outermost = least_bit_hwi (this_mask);
1262 if (outermost && outermost <= outer_mask)
1264 if (noisy)
1266 error_at (loop->loc,
1267 "incorrectly nested OpenACC loop parallelism");
1269 const oacc_loop *outer;
1270 for (outer = loop->parent;
1271 outer->flags && outer->flags < outermost;
1272 outer = outer->parent)
1273 continue;
1274 inform (outer->loc, "containing loop here");
1277 this_mask &= ~outermost;
1281 mask_all |= this_mask;
1283 if (loop->flags & OLF_TILE)
1285 /* When tiling, vector goes to the element loop, and failing
1286 that we put worker there. The std doesn't contemplate
1287 specifying all three. We choose to put worker and vector on
1288 the element loops in that case. */
1289 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1290 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1291 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1293 loop->e_mask = this_e_mask;
1294 this_mask ^= this_e_mask;
1297 loop->mask = this_mask;
1299 if (dump_file)
1300 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1301 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1302 loop->mask, loop->e_mask);
1304 if (loop->child)
1306 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1307 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1308 mask_all |= loop->inner;
1311 if (loop->sibling)
1312 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1314 return mask_all;
1317 /* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
1318 OUTER_MASK is the partitioning this loop is contained within.
1319 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1320 Return the cumulative partitioning used by this loop, siblings and
1321 children. */
1323 static unsigned
1324 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1325 bool outer_assign)
1327 bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1328 bool noisy = true;
1329 bool tiling = loop->flags & OLF_TILE;
1331 #ifdef ACCEL_COMPILER
1332 /* When device_type is supported, we want the device compiler to be
1333 noisy, if the loop parameters are device_type-specific. */
1334 noisy = false;
1335 #endif
1337 if (assign && (!outer_assign || loop->inner))
1339 /* Allocate outermost and non-innermost loops at the outermost
1340 non-innermost available level. */
1341 unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1343 /* Find the first outermost available partition. */
1344 while (this_mask <= outer_mask)
1345 this_mask <<= 1;
1347 /* Grab two axes if tiling, and we've not assigned anything. */
1348 if (tiling && !(loop->mask | loop->e_mask))
1349 this_mask |= this_mask << 1;
1351 /* Prohibit the innermost partitioning at the moment. */
1352 this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1354 /* Don't use any dimension explicitly claimed by an inner loop. */
1355 this_mask &= ~loop->inner;
1357 if (tiling && !loop->e_mask)
1359 /* If we got two axes, allocate the inner one to the element
1360 loop. */
1361 loop->e_mask = this_mask & (this_mask << 1);
1362 this_mask ^= loop->e_mask;
1365 loop->mask |= this_mask;
1368 if (loop->child)
1370 unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1371 loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1372 outer_assign | assign);
1375 if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1377 /* Allocate the loop at the innermost available level. Note
1378 that we do this even if we already assigned this loop the
1379 outermost available level above. That way we'll partition
1380 this along 2 axes, if they are available. */
1381 unsigned this_mask = 0;
1383 /* Determine the outermost partitioning used within this loop. */
1384 this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1385 this_mask = least_bit_hwi (this_mask);
1387 /* Pick the partitioning just inside that one. */
1388 this_mask >>= 1;
1390 /* And avoid picking one use by an outer loop. */
1391 this_mask &= ~outer_mask;
1393 /* If tiling and we failed completely above, grab the next one
1394 too. Making sure it doesn't hit an outer loop. */
1395 if (tiling)
1397 this_mask &= ~(loop->e_mask | loop->mask);
1398 unsigned tile_mask = ((this_mask >> 1)
1399 & ~(outer_mask | loop->e_mask | loop->mask));
1401 if (tile_mask || loop->mask)
1403 loop->e_mask |= this_mask;
1404 this_mask = tile_mask;
1406 if (!loop->e_mask && noisy)
1407 warning_at (loop->loc, 0,
1408 "insufficient partitioning available"
1409 " to parallelize element loop");
1412 loop->mask |= this_mask;
1413 if (!loop->mask && noisy)
1414 warning_at (loop->loc, 0,
1415 tiling
1416 ? G_("insufficient partitioning available"
1417 " to parallelize tile loop")
1418 : G_("insufficient partitioning available"
1419 " to parallelize loop"));
1422 if (assign && dump_file)
1423 fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1424 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1425 loop->mask, loop->e_mask);
1427 unsigned inner_mask = 0;
1429 if (loop->sibling)
1430 inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1431 outer_mask, outer_assign);
1433 inner_mask |= loop->inner | loop->mask | loop->e_mask;
1435 return inner_mask;
1438 /* Walk the OpenACC loop hierarchy to check and assign partitioning
1439 axes. Return mask of partitioning. */
1441 static unsigned
1442 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1444 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1446 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1448 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1449 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1451 return mask_all;
1454 /* Default fork/join early expander. Delete the function calls if
1455 there is no RTL expander. */
1457 bool
1458 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1459 const int *ARG_UNUSED (dims), bool is_fork)
1461 if (is_fork)
1462 return targetm.have_oacc_fork ();
1463 else
1464 return targetm.have_oacc_join ();
1467 /* Default goacc.reduction early expander.
1469 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1470 If RES_PTR is not integer-zerop:
1471 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1472 TEARDOWN - emit '*RES_PTR = VAR'
1473 If LHS is not NULL
1474 emit 'LHS = VAR' */
1476 void
1477 default_goacc_reduction (gcall *call)
1479 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1480 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1481 tree lhs = gimple_call_lhs (call);
1482 tree var = gimple_call_arg (call, 2);
1483 gimple_seq seq = NULL;
1485 if (code == IFN_GOACC_REDUCTION_SETUP
1486 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1488 /* Setup and Teardown need to copy from/to the receiver object,
1489 if there is one. */
1490 tree ref_to_res = gimple_call_arg (call, 1);
1492 if (!integer_zerop (ref_to_res))
1494 tree dst = build_simple_mem_ref (ref_to_res);
1495 tree src = var;
1497 if (code == IFN_GOACC_REDUCTION_SETUP)
1499 src = dst;
1500 dst = lhs;
1501 lhs = NULL;
1503 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1507 /* Copy VAR to LHS, if there is an LHS. */
1508 if (lhs)
1509 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1511 gsi_replace_with_seq (&gsi, seq, true);
1514 /* Main entry point for oacc transformations which run on the device
1515 compiler after LTO, so we know what the target device is at this
1516 point (including the host fallback). */
1518 static unsigned int
1519 execute_oacc_device_lower ()
1521 tree attrs = oacc_get_fn_attrib (current_function_decl);
1523 if (!attrs)
1524 /* Not an offloaded function. */
1525 return 0;
1527 /* Parse the default dim argument exactly once. */
1528 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1530 oacc_parse_default_dims (flag_openacc_dims);
1531 flag_openacc_dims = (char *)&flag_openacc_dims;
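/* flag_openacc_dims now points at itself, serving as a sentinel so the
   dimensions string is only parsed on the first invocation.  */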
1534 bool is_oacc_kernels
1535 = (lookup_attribute ("oacc kernels",
1536 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1537 bool is_oacc_kernels_parallelized
1538 = (lookup_attribute ("oacc kernels parallelized",
1539 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1541 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
1542 kernels, so remove the parallelism dimensions function attributes
1543 potentially set earlier on. */
1544 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
1546 oacc_set_fn_attrib (current_function_decl, NULL, NULL);
1547 attrs = oacc_get_fn_attrib (current_function_decl);
1550 /* Discover, partition and process the loops. */
1551 oacc_loop *loops = oacc_loop_discovery ();
1552 int fn_level = oacc_fn_attrib_level (attrs);
1554 if (dump_file)
1556 if (fn_level >= 0)
1557 fprintf (dump_file, "Function is OpenACC routine level %d\n",
1558 fn_level);
1559 else if (is_oacc_kernels)
1560 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1561 (is_oacc_kernels_parallelized
1562 ? "parallelized" : "unparallelized"));
1563 else
1564 fprintf (dump_file, "Function is OpenACC parallel offload\n");
1567 unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
1568 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
1569 /* OpenACC kernels constructs are special: they currently don't use the
1570 generic oacc_loop infrastructure and attribute/dimension processing. */
1571 if (is_oacc_kernels && is_oacc_kernels_parallelized)
1573 /* Parallelized OpenACC kernels constructs use gang parallelism. See
1574 also tree-parloops.c:create_parallel_loop. */
1575 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
1578 int dims[GOMP_DIM_MAX];
1579 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
1581 if (dump_file)
1583 const char *comma = "Compute dimensions [";
1584 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
1585 fprintf (dump_file, "%s%d", comma, dims[ix]);
1586 fprintf (dump_file, "]\n");
1589 oacc_loop_process (loops);
1590 if (dump_file)
1592 fprintf (dump_file, "OpenACC loops\n");
1593 dump_oacc_loop (dump_file, loops, 0);
1594 fprintf (dump_file, "\n");
1596 if (dump_enabled_p ())
1598 oacc_loop *l = loops;
1599 /* OpenACC kernels constructs are special: they currently don't use the
1600 generic oacc_loop infrastructure. */
1601 if (is_oacc_kernels)
1603 /* Create a fake oacc_loop for diagnostic purposes. */
1604 l = new_oacc_loop_raw (NULL,
1605 DECL_SOURCE_LOCATION (current_function_decl));
1606 l->mask = used_mask;
1608 else
1610 /* Skip the outermost, dummy OpenACC loop. */
1611 l = l->child;
1613 if (l)
1614 inform_oacc_loop (l);
1615 if (is_oacc_kernels)
1616 free_oacc_loop (l);
1619 /* Offloaded targets may introduce new basic blocks, which require
1620 dominance information to update SSA. */
1621 calculate_dominance_info (CDI_DOMINATORS);
1623 /* Now lower internal loop functions to target-specific code
1624 sequences. */
1625 basic_block bb;
1626 FOR_ALL_BB_FN (bb, cfun)
1627 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1629 gimple *stmt = gsi_stmt (gsi);
1630 if (!is_gimple_call (stmt))
1632 gsi_next (&gsi);
1633 continue;
1636 gcall *call = as_a <gcall *> (stmt);
1637 if (!gimple_call_internal_p (call))
1639 gsi_next (&gsi);
1640 continue;
1643 /* Rewind to allow rescan. */
1644 gsi_prev (&gsi);
1645 bool rescan = false, remove = false;
1646 enum internal_fn ifn_code = gimple_call_internal_fn (call);
1648 switch (ifn_code)
1650 default: break;
1652 case IFN_GOACC_TILE:
1653 oacc_xform_tile (call);
1654 rescan = true;
1655 break;
1657 case IFN_GOACC_LOOP:
1658 oacc_xform_loop (call);
1659 rescan = true;
1660 break;
1662 case IFN_GOACC_REDUCTION:
1663 /* Mark the function for SSA renaming. */
1664 mark_virtual_operands_for_renaming (cfun);
1666 /* If the level is -1, this ended up being an unused
1667 axis. Handle as a default. */
1668 if (integer_minus_onep (gimple_call_arg (call, 3)))
1669 default_goacc_reduction (call);
1670 else
1671 targetm.goacc.reduction (call);
1672 rescan = true;
1673 break;
1675 case IFN_UNIQUE:
1677 enum ifn_unique_kind kind
1678 = ((enum ifn_unique_kind)
1679 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1681 switch (kind)
1683 default:
1684 break;
1686 case IFN_UNIQUE_OACC_FORK:
1687 case IFN_UNIQUE_OACC_JOIN:
1688 if (integer_minus_onep (gimple_call_arg (call, 2)))
1689 remove = true;
1690 else if (!targetm.goacc.fork_join
1691 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
1692 remove = true;
1693 break;
1695 case IFN_UNIQUE_OACC_HEAD_MARK:
1696 case IFN_UNIQUE_OACC_TAIL_MARK:
1697 remove = true;
1698 break;
1700 break;
1704 if (gsi_end_p (gsi))
1705 /* We rewound past the beginning of the BB. */
1706 gsi = gsi_start_bb (bb);
1707 else
1708 /* Undo the rewind. */
1709 gsi_next (&gsi);
1711 if (remove)
1713 if (gimple_vdef (call))
1714 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
1715 if (gimple_call_lhs (call))
1717 /* Propagate the data dependency var. */
1718 gimple *ass = gimple_build_assign (gimple_call_lhs (call),
1719 gimple_call_arg (call, 1));
1720 gsi_replace (&gsi, ass, false);
1722 else
1723 gsi_remove (&gsi, true);
1725 else if (!rescan)
1726 /* If not rescanning, advance over the call. */
1727 gsi_next (&gsi);
1730 free_oacc_loop (loops);
1732 return 0;
1735 /* Default launch dimension validator. Force everything to 1. A
1736 backend that wants to provide larger dimensions must override this
1737 hook. */
1739 bool
1740 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1741 int ARG_UNUSED (fn_level),
1742 unsigned ARG_UNUSED (used))
1744 bool changed = false;
1746 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1748 if (dims[ix] != 1)
1750 dims[ix] = 1;
1751 changed = true;
1755 return changed;
1758 /* Default dimension bound is unknown on accelerator and 1 on host. */
1761 default_goacc_dim_limit (int ARG_UNUSED (axis))
1763 #ifdef ACCEL_COMPILER
1764 return 0;
1765 #else
1766 return 1;
1767 #endif
1770 namespace {
1772 const pass_data pass_data_oacc_device_lower =
1774 GIMPLE_PASS, /* type */
1775 "oaccdevlow", /* name */
1776 OPTGROUP_OMP, /* optinfo_flags */
1777 TV_NONE, /* tv_id */
1778 PROP_cfg, /* properties_required */
1779 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
1780 0, /* properties_destroyed */
1781 0, /* todo_flags_start */
1782 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
1785 class pass_oacc_device_lower : public gimple_opt_pass
1787 public:
1788 pass_oacc_device_lower (gcc::context *ctxt)
1789 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
1792 /* opt_pass methods: */
1793 virtual bool gate (function *) { return flag_openacc; };
1795 virtual unsigned int execute (function *)
1797 return execute_oacc_device_lower ();
1800 }; // class pass_oacc_device_lower
1802 } // anon namespace
1804 gimple_opt_pass *
1805 make_pass_oacc_device_lower (gcc::context *ctxt)
1807 return new pass_oacc_device_lower (ctxt);
1811 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
1812 GOMP_SIMT_ENTER call identifying the privatized variables, which are
1813 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
1814 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
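/* Sketch of the effect (illustrative): each privatized variable becomes a
   field of a new RECORD_TYPE, its DECL_VALUE_EXPR is rewritten to
   (*SIMTREC).field, and the GOMP_SIMT_ENTER_ALLOC call is re-emitted with
   the record's size and alignment as arguments.  */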
1816 static void
1817 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
1819 gimple *alloc_stmt = gsi_stmt (*gsi);
1820 tree simtrec = gimple_call_lhs (alloc_stmt);
1821 tree simduid = gimple_call_arg (alloc_stmt, 0);
1822 gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
1823 gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
1824 tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
1825 TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
1826 TREE_ADDRESSABLE (rectype) = 1;
1827 TREE_TYPE (simtrec) = build_pointer_type (rectype);
1828 for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
1830 tree *argp = gimple_call_arg_ptr (enter_stmt, i);
1831 if (*argp == null_pointer_node)
1832 continue;
1833 gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
1834 && VAR_P (TREE_OPERAND (*argp, 0)));
1835 tree var = TREE_OPERAND (*argp, 0);
1837 tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
1838 DECL_NAME (var), TREE_TYPE (var));
1839 SET_DECL_ALIGN (field, DECL_ALIGN (var));
1840 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
1841 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
1843 insert_field_into_struct (rectype, field);
1845 tree t = build_simple_mem_ref (simtrec);
1846 t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
1847 TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
1848 SET_DECL_VALUE_EXPR (var, t);
1849 DECL_HAS_VALUE_EXPR_P (var) = 1;
1850 *regimplify = true;
1852 layout_type (rectype);
1853 tree size = TYPE_SIZE_UNIT (rectype);
1854 tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
1856 alloc_stmt
1857 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
1858 gimple_call_set_lhs (alloc_stmt, simtrec);
1859 gsi_replace (gsi, alloc_stmt, false);
1860 gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
1861 enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
1862 gsi_replace (&enter_gsi, enter_stmt, false);
1864 use_operand_p use;
1865 gimple *exit_stmt;
1866 if (single_imm_use (simtrec, &use, &exit_stmt))
1868 gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
1869 gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
1870 tree clobber = build_clobber (rectype);
1871 exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
1872 gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
1874 else
1875 gcc_checking_assert (has_zero_uses (simtrec));
1878 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
1880 static tree
1881 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
1883 tree t = *tp;
1885 if (VAR_P (t)
1886 && DECL_HAS_VALUE_EXPR_P (t)
1887 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
1889 *walk_subtrees = 0;
1890 return t;
1892 return NULL_TREE;
1895 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
1896 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
1897 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
1898 internal functions on non-SIMT targets, and likewise some SIMD internal
1899 functions on SIMT targets. */
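/* For instance (illustrative): on a non-SIMT target (VF == 1),
   GOMP_SIMT_VF folds to 1, GOMP_SIMT_LANE to 0 and GOMP_USE_SIMT to 0;
   on a SIMT target, GOMP_SIMD_VF folds to 1 and GOMP_SIMD_LANE to 0,
   while the SIMT lane functions are kept for RTL expansion.  */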
1901 static unsigned int
1902 execute_omp_device_lower ()
1904 int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
1905 bool regimplify = false;
1906 basic_block bb;
1907 gimple_stmt_iterator gsi;
1908 FOR_EACH_BB_FN (bb, cfun)
1909 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1911 gimple *stmt = gsi_stmt (gsi);
1912 if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
1913 continue;
1914 tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
1915 tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
1916 switch (gimple_call_internal_fn (stmt))
1918 case IFN_GOMP_USE_SIMT:
1919 rhs = vf == 1 ? integer_zero_node : integer_one_node;
1920 break;
1921 case IFN_GOMP_SIMT_ENTER:
1922 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1923 goto simtreg_enter_exit;
1924 case IFN_GOMP_SIMT_ENTER_ALLOC:
1925 if (vf != 1)
1926 ompdevlow_adjust_simt_enter (&gsi, &regimplify);
1927 rhs = vf == 1 ? null_pointer_node : NULL_TREE;
1928 goto simtreg_enter_exit;
1929 case IFN_GOMP_SIMT_EXIT:
1930 simtreg_enter_exit:
1931 if (vf != 1)
1932 continue;
1933 unlink_stmt_vdef (stmt);
1934 break;
1935 case IFN_GOMP_SIMT_LANE:
1936 case IFN_GOMP_SIMT_LAST_LANE:
1937 rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
1938 break;
1939 case IFN_GOMP_SIMT_VF:
1940 rhs = build_int_cst (type, vf);
1941 break;
1942 case IFN_GOMP_SIMT_ORDERED_PRED:
1943 rhs = vf == 1 ? integer_zero_node : NULL_TREE;
1944 if (rhs || !lhs)
1945 unlink_stmt_vdef (stmt);
1946 break;
1947 case IFN_GOMP_SIMT_VOTE_ANY:
1948 case IFN_GOMP_SIMT_XCHG_BFLY:
1949 case IFN_GOMP_SIMT_XCHG_IDX:
1950 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1951 break;
1952 case IFN_GOMP_SIMD_LANE:
1953 case IFN_GOMP_SIMD_LAST_LANE:
1954 rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
1955 break;
1956 case IFN_GOMP_SIMD_VF:
1957 rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
1958 break;
1959 default:
1960 continue;
1962 if (lhs && !rhs)
1963 continue;
1964 stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
1965 gsi_replace (&gsi, stmt, false);
1967 if (regimplify)
1968 FOR_EACH_BB_REVERSE_FN (bb, cfun)
1969 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1970 if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
1972 if (gimple_clobber_p (gsi_stmt (gsi)))
1973 gsi_remove (&gsi, true);
1974 else
1975 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
1977 if (vf != 1)
1978 cfun->has_force_vectorize_loops = false;
1979 return 0;
1982 namespace {
1984 const pass_data pass_data_omp_device_lower =
1986 GIMPLE_PASS, /* type */
1987 "ompdevlow", /* name */
1988 OPTGROUP_OMP, /* optinfo_flags */
1989 TV_NONE, /* tv_id */
1990 PROP_cfg, /* properties_required */
1991 PROP_gimple_lomp_dev, /* properties_provided */
1992 0, /* properties_destroyed */
1993 0, /* todo_flags_start */
1994 TODO_update_ssa, /* todo_flags_finish */
1997 class pass_omp_device_lower : public gimple_opt_pass
1999 public:
2000 pass_omp_device_lower (gcc::context *ctxt)
2001 : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
2004 /* opt_pass methods: */
2005 virtual bool gate (function *fun)
2007 return !(fun->curr_properties & PROP_gimple_lomp_dev);
2009 virtual unsigned int execute (function *)
2011 return execute_omp_device_lower ();
2014 }; // class pass_omp_device_lower
2016 } // anon namespace
2018 gimple_opt_pass *
2019 make_pass_omp_device_lower (gcc::context *ctxt)
2021 return new pass_omp_device_lower (ctxt);
2024 /* "omp declare target link" handling pass. */
2026 namespace {
2028 const pass_data pass_data_omp_target_link =
2030 GIMPLE_PASS, /* type */
2031 "omptargetlink", /* name */
2032 OPTGROUP_OMP, /* optinfo_flags */
2033 TV_NONE, /* tv_id */
2034 PROP_ssa, /* properties_required */
2035 0, /* properties_provided */
2036 0, /* properties_destroyed */
2037 0, /* todo_flags_start */
2038 TODO_update_ssa, /* todo_flags_finish */
2041 class pass_omp_target_link : public gimple_opt_pass
2043 public:
2044 pass_omp_target_link (gcc::context *ctxt)
2045 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
2048 /* opt_pass methods: */
2049 virtual bool gate (function *fun)
2051 #ifdef ACCEL_COMPILER
2052 return offloading_function_p (fun->decl);
2053 #else
2054 (void) fun;
2055 return false;
2056 #endif
2059 virtual unsigned execute (function *);
2062 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2064 static tree
2065 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2067 tree t = *tp;
2069 if (VAR_P (t)
2070 && DECL_HAS_VALUE_EXPR_P (t)
2071 && is_global_var (t)
2072 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2074 *walk_subtrees = 0;
2075 return t;
2078 return NULL_TREE;
2081 unsigned
2082 pass_omp_target_link::execute (function *fun)
2084 basic_block bb;
2085 FOR_EACH_BB_FN (bb, fun)
2087 gimple_stmt_iterator gsi;
2088 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2089 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2090 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2093 return 0;
2096 } // anon namespace
2098 gimple_opt_pass *
2099 make_pass_omp_target_link (gcc::context *ctxt)
2101 return new pass_omp_target_link (ctxt);