* tree-ssa-loop-ivopts.c (ivopts_estimate_reg_pressure): New
[official-gcc.git] / gcc / omp-offload.c
blob54a4e90f70ca2853181699a75aa0a34e6246d994
/* Bits of OpenMP and OpenACC handling that is specific to device offloading
   and a lowering pass for OpenACC device directives.

   Copyright (C) 2005-2017 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
53 /* Describe the OpenACC looping structure of a function. The entire
54 function is held in a 'NULL' loop. */
56 struct oacc_loop
58 oacc_loop *parent; /* Containing loop. */
60 oacc_loop *child; /* First inner loop. */
62 oacc_loop *sibling; /* Next loop within same parent. */
64 location_t loc; /* Location of the loop start. */
66 gcall *marker; /* Initial head marker. */
68 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
69 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
71 tree routine; /* Pseudo-loop enclosing a routine. */
73 unsigned mask; /* Partitioning mask. */
74 unsigned e_mask; /* Partitioning of element loops (when tiling). */
75 unsigned inner; /* Partitioning of inner loops. */
76 unsigned flags; /* Partitioning flags. */
77 vec<gcall *> ifns; /* Contained loop abstraction functions. */
78 tree chunk_size; /* Chunk size. */
79 gcall *head_end; /* Final marker of head sequence. */
82 /* Holds offload tables with decls. */
83 vec<tree, va_gc> *offload_funcs, *offload_vars;
85 /* Return level at which oacc routine may spawn a partitioned loop, or
86 -1 if it is not a routine (i.e. is an offload fn). */
88 static int
89 oacc_fn_attrib_level (tree attr)
91 tree pos = TREE_VALUE (attr);
93 if (!TREE_PURPOSE (pos))
94 return -1;
96 int ix = 0;
97 for (ix = 0; ix != GOMP_DIM_MAX;
98 ix++, pos = TREE_CHAIN (pos))
99 if (!integer_zerop (TREE_PURPOSE (pos)))
100 break;
102 return ix;
105 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
106 adds their addresses and sizes to constructor-vector V_CTOR. */
108 static void
109 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
110 vec<constructor_elt, va_gc> *v_ctor)
112 unsigned len = vec_safe_length (v_decls);
113 for (unsigned i = 0; i < len; i++)
115 tree it = (*v_decls)[i];
116 bool is_var = VAR_P (it);
117 bool is_link_var
118 = is_var
119 #ifdef ACCEL_COMPILER
120 && DECL_HAS_VALUE_EXPR_P (it)
121 #endif
122 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
124 tree size = NULL_TREE;
125 if (is_var)
126 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
128 tree addr;
129 if (!is_link_var)
130 addr = build_fold_addr_expr (it);
131 else
133 #ifdef ACCEL_COMPILER
134 /* For "omp declare target link" vars add address of the pointer to
135 the target table, instead of address of the var. */
136 tree value_expr = DECL_VALUE_EXPR (it);
137 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
138 varpool_node::finalize_decl (link_ptr_decl);
139 addr = build_fold_addr_expr (link_ptr_decl);
140 #else
141 addr = build_fold_addr_expr (it);
142 #endif
144 /* Most significant bit of the size marks "omp declare target link"
145 vars in host and target tables. */
146 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
147 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
148 * BITS_PER_UNIT - 1);
149 size = wide_int_to_tree (const_ptr_type_node, isize);
152 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
153 if (is_var)
154 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
158 /* Create new symbols containing (address, size) pairs for global variables,
159 marked with "omp declare target" attribute, as well as addresses for the
160 functions, which are outlined offloading regions. */
161 void
162 omp_finish_file (void)
164 unsigned num_funcs = vec_safe_length (offload_funcs);
165 unsigned num_vars = vec_safe_length (offload_vars);
167 if (num_funcs == 0 && num_vars == 0)
168 return;
170 if (targetm_common.have_named_sections)
172 vec<constructor_elt, va_gc> *v_f, *v_v;
173 vec_alloc (v_f, num_funcs);
174 vec_alloc (v_v, num_vars * 2);
176 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
177 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
179 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
180 num_vars * 2);
181 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
182 num_funcs);
183 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
184 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
185 tree ctor_v = build_constructor (vars_decl_type, v_v);
186 tree ctor_f = build_constructor (funcs_decl_type, v_f);
187 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
188 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
189 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
190 get_identifier (".offload_func_table"),
191 funcs_decl_type);
192 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
193 get_identifier (".offload_var_table"),
194 vars_decl_type);
195 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
196 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
197 otherwise a joint table in a binary will contain padding between
198 tables from multiple object files. */
199 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
200 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
201 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
202 DECL_INITIAL (funcs_decl) = ctor_f;
203 DECL_INITIAL (vars_decl) = ctor_v;
204 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
205 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
207 varpool_node::finalize_decl (vars_decl);
208 varpool_node::finalize_decl (funcs_decl);
210 else
212 for (unsigned i = 0; i < num_funcs; i++)
214 tree it = (*offload_funcs)[i];
215 targetm.record_offload_symbol (it);
217 for (unsigned i = 0; i < num_vars; i++)
219 tree it = (*offload_vars)[i];
220 targetm.record_offload_symbol (it);
225 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
226 axis DIM. Return a tmp var holding the result. */
228 static tree
229 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
231 tree arg = build_int_cst (unsigned_type_node, dim);
232 tree size = create_tmp_var (integer_type_node);
233 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
234 gimple *call = gimple_build_call_internal (fn, 1, arg);
236 gimple_call_set_lhs (call, size);
237 gimple_seq_add_stmt (seq, call);
239 return size;
242 /* Find the number of threads (POS = false), or thread number (POS =
243 true) for an OpenACC region partitioned as MASK. Setup code
244 required for the calculation is added to SEQ. */
246 static tree
247 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
249 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
250 unsigned ix;
252 /* Start at gang level, and examine relevant dimension indices. */
253 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
254 if (GOMP_DIM_MASK (ix) & mask)
256 if (res)
258 /* We had an outer index, so scale that by the size of
259 this dimension. */
260 tree n = oacc_dim_call (false, ix, seq);
261 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
263 if (pos)
265 /* Determine index in this dimension. */
266 tree id = oacc_dim_call (true, ix, seq);
267 if (res)
268 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
269 else
270 res = id;
274 if (res == NULL_TREE)
275 res = integer_zero_node;
277 return res;
280 /* Transform IFN_GOACC_LOOP calls to actual code. See
281 expand_oacc_for for where these are generated. At the vector
282 level, we stride loops, such that each member of a warp will
283 operate on adjacent iterations. At the worker and gang level,
284 each gang/warp executes a set of contiguous iterations. Chunking
285 can override this such that each iteration engine executes a
286 contiguous chunk, and then moves on to stride to the next chunk. */
288 static void
289 oacc_xform_loop (gcall *call)
291 gimple_stmt_iterator gsi = gsi_for_stmt (call);
292 enum ifn_goacc_loop_kind code
293 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
294 tree dir = gimple_call_arg (call, 1);
295 tree range = gimple_call_arg (call, 2);
296 tree step = gimple_call_arg (call, 3);
297 tree chunk_size = NULL_TREE;
298 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
299 tree lhs = gimple_call_lhs (call);
300 tree type = TREE_TYPE (lhs);
301 tree diff_type = TREE_TYPE (range);
302 tree r = NULL_TREE;
303 gimple_seq seq = NULL;
304 bool chunking = false, striding = true;
305 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
306 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
308 #ifdef ACCEL_COMPILER
309 chunk_size = gimple_call_arg (call, 4);
310 if (integer_minus_onep (chunk_size) /* Force static allocation. */
311 || integer_zerop (chunk_size)) /* Default (also static). */
313 /* If we're at the gang level, we want each to execute a
314 contiguous run of iterations. Otherwise we want each element
315 to stride. */
316 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
317 chunking = false;
319 else
321 /* Chunk of size 1 is striding. */
322 striding = integer_onep (chunk_size);
323 chunking = !striding;
325 #endif
327 /* striding=true, chunking=true
328 -> invalid.
329 striding=true, chunking=false
330 -> chunks=1
331 striding=false,chunking=true
332 -> chunks=ceil (range/(chunksize*threads*step))
333 striding=false,chunking=false
334 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
335 push_gimplify_context (true);
337 switch (code)
339 default: gcc_unreachable ();
341 case IFN_GOACC_LOOP_CHUNKS:
342 if (!chunking)
343 r = build_int_cst (type, 1);
344 else
346 /* chunk_max
347 = (range - dir) / (chunks * step * num_threads) + dir */
348 tree per = oacc_thread_numbers (false, mask, &seq);
349 per = fold_convert (type, per);
350 chunk_size = fold_convert (type, chunk_size);
351 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
352 per = fold_build2 (MULT_EXPR, type, per, step);
353 r = build2 (MINUS_EXPR, type, range, dir);
354 r = build2 (PLUS_EXPR, type, r, per);
355 r = build2 (TRUNC_DIV_EXPR, type, r, per);
357 break;
359 case IFN_GOACC_LOOP_STEP:
361 /* If striding, step by the entire compute volume, otherwise
362 step by the inner volume. */
363 unsigned volume = striding ? mask : inner_mask;
365 r = oacc_thread_numbers (false, volume, &seq);
366 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
368 break;
370 case IFN_GOACC_LOOP_OFFSET:
371 if (striding)
373 r = oacc_thread_numbers (true, mask, &seq);
374 r = fold_convert (diff_type, r);
376 else
378 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
379 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
380 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
381 inner_size, outer_size);
383 volume = fold_convert (diff_type, volume);
384 if (chunking)
385 chunk_size = fold_convert (diff_type, chunk_size);
386 else
388 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
390 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
391 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
392 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
395 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
396 fold_convert (diff_type, inner_size));
397 r = oacc_thread_numbers (true, outer_mask, &seq);
398 r = fold_convert (diff_type, r);
399 r = build2 (MULT_EXPR, diff_type, r, span);
401 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
402 inner = fold_convert (diff_type, inner);
403 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
405 if (chunking)
407 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
408 tree per
409 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
410 per = build2 (MULT_EXPR, diff_type, per, chunk);
412 r = build2 (PLUS_EXPR, diff_type, r, per);
415 r = fold_build2 (MULT_EXPR, diff_type, r, step);
416 if (type != diff_type)
417 r = fold_convert (type, r);
418 break;
420 case IFN_GOACC_LOOP_BOUND:
421 if (striding)
422 r = range;
423 else
425 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
426 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
427 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
428 inner_size, outer_size);
430 volume = fold_convert (diff_type, volume);
431 if (chunking)
432 chunk_size = fold_convert (diff_type, chunk_size);
433 else
435 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
437 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
438 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
439 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
442 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
443 fold_convert (diff_type, inner_size));
445 r = fold_build2 (MULT_EXPR, diff_type, span, step);
447 tree offset = gimple_call_arg (call, 6);
448 r = build2 (PLUS_EXPR, diff_type, r,
449 fold_convert (diff_type, offset));
450 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
451 diff_type, r, range);
453 if (diff_type != type)
454 r = fold_convert (type, r);
455 break;
458 gimplify_assign (lhs, r, &seq);
460 pop_gimplify_context (NULL);
462 gsi_replace_with_seq (&gsi, seq, true);
465 /* Transform a GOACC_TILE call. Determines the element loop span for
466 the specified loop of the nest. This is 1 if we're not tiling.
468 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
470 static void
471 oacc_xform_tile (gcall *call)
473 gimple_stmt_iterator gsi = gsi_for_stmt (call);
474 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
475 /* Inner loops have higher loop_nos. */
476 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
477 tree tile_size = gimple_call_arg (call, 2);
478 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
479 tree lhs = gimple_call_lhs (call);
480 tree type = TREE_TYPE (lhs);
481 gimple_seq seq = NULL;
482 tree span = build_int_cst (type, 1);
484 gcc_assert (!(e_mask
485 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
486 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
487 push_gimplify_context (!seen_error ());
489 #ifndef ACCEL_COMPILER
490 /* Partitioning disabled on host compilers. */
491 e_mask = 0;
492 #endif
493 if (!e_mask)
494 /* Not paritioning. */
495 span = integer_one_node;
496 else if (!integer_zerop (tile_size))
497 /* User explicitly specified size. */
498 span = tile_size;
499 else
501 /* Pick a size based on the paritioning of the element loop and
502 the number of loop nests. */
503 tree first_size = NULL_TREE;
504 tree second_size = NULL_TREE;
506 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
507 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
508 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
509 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
511 if (!first_size)
513 first_size = second_size;
514 second_size = NULL_TREE;
517 if (loop_no + 1 == collapse)
519 span = first_size;
520 if (!loop_no && second_size)
521 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
522 span, second_size);
524 else if (loop_no + 2 == collapse)
525 span = second_size;
526 else
527 span = NULL_TREE;
529 if (!span)
530 /* There's no obvious element size for this loop. Options
531 are 1, first_size or some non-unity constant (32 is my
532 favourite). We should gather some statistics. */
533 span = first_size;
536 span = fold_convert (type, span);
537 gimplify_assign (lhs, span, &seq);
539 pop_gimplify_context (NULL);
541 gsi_replace_with_seq (&gsi, seq, true);
544 /* Default partitioned and minimum partitioned dimensions. */
546 static int oacc_default_dims[GOMP_DIM_MAX];
547 static int oacc_min_dims[GOMP_DIM_MAX];
549 /* Parse the default dimension parameter. This is a set of
550 :-separated optional compute dimensions. Each specified dimension
551 is a positive integer. When device type support is added, it is
552 planned to be a comma separated list of such compute dimensions,
553 with all but the first prefixed by the colon-terminated device
554 type. */
556 static void
557 oacc_parse_default_dims (const char *dims)
559 int ix;
561 for (ix = GOMP_DIM_MAX; ix--;)
563 oacc_default_dims[ix] = -1;
564 oacc_min_dims[ix] = 1;
567 #ifndef ACCEL_COMPILER
568 /* Cannot be overridden on the host. */
569 dims = NULL;
570 #endif
571 if (dims)
573 const char *pos = dims;
575 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
577 if (ix)
579 if (*pos != ':')
580 goto malformed;
581 pos++;
584 if (*pos != ':')
586 long val;
587 const char *eptr;
589 errno = 0;
590 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
591 if (errno || val <= 0 || (int) val != val)
592 goto malformed;
593 pos = eptr;
594 oacc_default_dims[ix] = (int) val;
597 if (*pos)
599 malformed:
600 error_at (UNKNOWN_LOCATION,
601 "-fopenacc-dim operand is malformed at '%s'", pos);
605 /* Allow the backend to validate the dimensions. */
606 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
607 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
610 /* Validate and update the dimensions for offloaded FN. ATTRS is the
611 raw attribute. DIMS is an array of dimensions, which is filled in.
612 LEVEL is the partitioning level of a routine, or -1 for an offload
613 region itself. USED is the mask of partitioned execution in the
614 function. */
616 static void
617 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
619 tree purpose[GOMP_DIM_MAX];
620 unsigned ix;
621 tree pos = TREE_VALUE (attrs);
623 /* Make sure the attribute creator attached the dimension
624 information. */
625 gcc_assert (pos);
627 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
629 purpose[ix] = TREE_PURPOSE (pos);
630 tree val = TREE_VALUE (pos);
631 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
632 pos = TREE_CHAIN (pos);
635 bool changed = targetm.goacc.validate_dims (fn, dims, level);
637 /* Default anything left to 1 or a partitioned default. */
638 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
639 if (dims[ix] < 0)
641 /* The OpenACC spec says 'If the [num_gangs] clause is not
642 specified, an implementation-defined default will be used;
643 the default may depend on the code within the construct.'
644 (2.5.6). Thus an implementation is free to choose
645 non-unity default for a parallel region that doesn't have
646 any gang-partitioned loops. However, it appears that there
647 is a sufficient body of user code that expects non-gang
648 partitioned regions to not execute in gang-redundant mode.
649 So we (a) don't warn about the non-portability and (b) pick
650 the minimum permissible dimension size when there is no
651 partitioned execution. Otherwise we pick the global
652 default for the dimension, which the user can control. The
653 same wording and logic applies to num_workers and
654 vector_length, however the worker- or vector- single
655 execution doesn't have the same impact as gang-redundant
656 execution. (If the minimum gang-level partioning is not 1,
657 the target is probably too confusing.) */
658 dims[ix] = (used & GOMP_DIM_MASK (ix)
659 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
660 changed = true;
663 if (changed)
665 /* Replace the attribute with new values. */
666 pos = NULL_TREE;
667 for (ix = GOMP_DIM_MAX; ix--;)
668 pos = tree_cons (purpose[ix],
669 build_int_cst (integer_type_node, dims[ix]), pos);
670 oacc_replace_fn_attrib (fn, pos);
674 /* Create an empty OpenACC loop structure at LOC. */
676 static oacc_loop *
677 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
679 oacc_loop *loop = XCNEW (oacc_loop);
681 loop->parent = parent;
683 if (parent)
685 loop->sibling = parent->child;
686 parent->child = loop;
689 loop->loc = loc;
690 return loop;
693 /* Create an outermost, dummy OpenACC loop for offloaded function
694 DECL. */
696 static oacc_loop *
697 new_oacc_loop_outer (tree decl)
699 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
702 /* Start a new OpenACC loop structure beginning at head marker HEAD.
703 Link into PARENT loop. Return the new loop. */
705 static oacc_loop *
706 new_oacc_loop (oacc_loop *parent, gcall *marker)
708 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
710 loop->marker = marker;
712 /* TODO: This is where device_type flattening would occur for the loop
713 flags. */
715 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
717 tree chunk_size = integer_zero_node;
718 if (loop->flags & OLF_GANG_STATIC)
719 chunk_size = gimple_call_arg (marker, 4);
720 loop->chunk_size = chunk_size;
722 return loop;
725 /* Create a dummy loop encompassing a call to a openACC routine.
726 Extract the routine's partitioning requirements. */
728 static void
729 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
731 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
732 int level = oacc_fn_attrib_level (attrs);
734 gcc_assert (level >= 0);
736 loop->marker = call;
737 loop->routine = decl;
738 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
739 ^ (GOMP_DIM_MASK (level) - 1));
742 /* Finish off the current OpenACC loop ending at tail marker TAIL.
743 Return the parent loop. */
745 static oacc_loop *
746 finish_oacc_loop (oacc_loop *loop)
748 /* If the loop has been collapsed, don't partition it. */
749 if (loop->ifns.is_empty ())
750 loop->mask = loop->flags = 0;
751 return loop->parent;
754 /* Free all OpenACC loop structures within LOOP (inclusive). */
756 static void
757 free_oacc_loop (oacc_loop *loop)
759 if (loop->sibling)
760 free_oacc_loop (loop->sibling);
761 if (loop->child)
762 free_oacc_loop (loop->child);
764 loop->ifns.release ();
765 free (loop);
768 /* Dump out the OpenACC loop head or tail beginning at FROM. */
770 static void
771 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
772 const char *title, int level)
774 enum ifn_unique_kind kind
775 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
777 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
778 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
780 gimple *stmt = gsi_stmt (gsi);
782 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
784 enum ifn_unique_kind k
785 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
786 (gimple_call_arg (stmt, 0)));
788 if (k == kind && stmt != from)
789 break;
791 print_gimple_stmt (file, stmt, depth * 2 + 2);
793 gsi_next (&gsi);
794 while (gsi_end_p (gsi))
795 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
799 /* Dump OpenACC loops LOOP, its siblings and its children. */
801 static void
802 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
804 int ix;
806 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
807 loop->flags, loop->mask,
808 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
810 if (loop->marker)
811 print_gimple_stmt (file, loop->marker, depth * 2);
813 if (loop->routine)
814 fprintf (file, "%*sRoutine %s:%u:%s\n",
815 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
816 DECL_SOURCE_LINE (loop->routine),
817 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
819 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
820 if (loop->heads[ix])
821 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
822 for (ix = GOMP_DIM_MAX; ix--;)
823 if (loop->tails[ix])
824 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
826 if (loop->child)
827 dump_oacc_loop (file, loop->child, depth + 1);
828 if (loop->sibling)
829 dump_oacc_loop (file, loop->sibling, depth);
832 void debug_oacc_loop (oacc_loop *);
834 /* Dump loops to stderr. */
836 DEBUG_FUNCTION void
837 debug_oacc_loop (oacc_loop *loop)
839 dump_oacc_loop (stderr, loop, 0);
842 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
843 structures as we go. By construction these loops are properly
844 nested. */
846 static void
847 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
849 int marker = 0;
850 int remaining = 0;
852 if (bb->flags & BB_VISITED)
853 return;
855 follow:
856 bb->flags |= BB_VISITED;
858 /* Scan for loop markers. */
859 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
860 gsi_next (&gsi))
862 gimple *stmt = gsi_stmt (gsi);
864 if (!is_gimple_call (stmt))
865 continue;
867 gcall *call = as_a <gcall *> (stmt);
869 /* If this is a routine, make a dummy loop for it. */
870 if (tree decl = gimple_call_fndecl (call))
871 if (tree attrs = oacc_get_fn_attrib (decl))
873 gcc_assert (!marker);
874 new_oacc_loop_routine (loop, call, decl, attrs);
877 if (!gimple_call_internal_p (call))
878 continue;
880 switch (gimple_call_internal_fn (call))
882 default:
883 break;
885 case IFN_GOACC_LOOP:
886 case IFN_GOACC_TILE:
887 /* Record the abstraction function, so we can manipulate it
888 later. */
889 loop->ifns.safe_push (call);
890 break;
892 case IFN_UNIQUE:
893 enum ifn_unique_kind kind
894 = (enum ifn_unique_kind) (TREE_INT_CST_LOW
895 (gimple_call_arg (call, 0)));
896 if (kind == IFN_UNIQUE_OACC_HEAD_MARK
897 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
899 if (gimple_call_num_args (call) == 2)
901 gcc_assert (marker && !remaining);
902 marker = 0;
903 if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
904 loop = finish_oacc_loop (loop);
905 else
906 loop->head_end = call;
908 else
910 int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
912 if (!marker)
914 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
915 loop = new_oacc_loop (loop, call);
916 remaining = count;
918 gcc_assert (count == remaining);
919 if (remaining)
921 remaining--;
922 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
923 loop->heads[marker] = call;
924 else
925 loop->tails[remaining] = call;
927 marker++;
932 if (remaining || marker)
934 bb = single_succ (bb);
935 gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
936 goto follow;
939 /* Walk successor blocks. */
940 edge e;
941 edge_iterator ei;
943 FOR_EACH_EDGE (e, ei, bb->succs)
944 oacc_loop_discover_walk (loop, e->dest);
947 /* LOOP is the first sibling. Reverse the order in place and return
948 the new first sibling. Recurse to child loops. */
950 static oacc_loop *
951 oacc_loop_sibling_nreverse (oacc_loop *loop)
953 oacc_loop *last = NULL;
956 if (loop->child)
957 loop->child = oacc_loop_sibling_nreverse (loop->child);
959 oacc_loop *next = loop->sibling;
960 loop->sibling = last;
961 last = loop;
962 loop = next;
964 while (loop);
966 return last;
969 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
970 the current function. */
972 static oacc_loop *
973 oacc_loop_discovery ()
975 /* Clear basic block flags, in particular BB_VISITED which we're going to use
976 in the following. */
977 clear_bb_flags ();
979 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
980 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
982 /* The siblings were constructed in reverse order, reverse them so
983 that diagnostics come out in an unsurprising order. */
984 top = oacc_loop_sibling_nreverse (top);
986 return top;
989 /* Transform the abstract internal function markers starting at FROM
990 to be for partitioning level LEVEL. Stop when we meet another HEAD
991 or TAIL marker. */
993 static void
994 oacc_loop_xform_head_tail (gcall *from, int level)
996 enum ifn_unique_kind kind
997 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
998 tree replacement = build_int_cst (unsigned_type_node, level);
1000 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1002 gimple *stmt = gsi_stmt (gsi);
1004 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1006 enum ifn_unique_kind k
1007 = ((enum ifn_unique_kind)
1008 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1010 if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
1011 *gimple_call_arg_ptr (stmt, 2) = replacement;
1012 else if (k == kind && stmt != from)
1013 break;
1015 else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1016 *gimple_call_arg_ptr (stmt, 3) = replacement;
1018 gsi_next (&gsi);
1019 while (gsi_end_p (gsi))
1020 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1024 /* Process the discovered OpenACC loops, setting the correct
1025 partitioning level etc. */
1027 static void
1028 oacc_loop_process (oacc_loop *loop)
1030 if (loop->child)
1031 oacc_loop_process (loop->child);
1033 if (loop->mask && !loop->routine)
1035 int ix;
1036 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1037 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1038 tree chunk_arg = loop->chunk_size;
1039 gcall *call;
1041 for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1042 switch (gimple_call_internal_fn (call))
1044 case IFN_GOACC_LOOP:
1046 bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1047 gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1048 if (!is_e)
1049 gimple_call_set_arg (call, 4, chunk_arg);
1051 break;
1053 case IFN_GOACC_TILE:
1054 gimple_call_set_arg (call, 3, mask_arg);
1055 gimple_call_set_arg (call, 4, e_mask_arg);
1056 break;
1058 default:
1059 gcc_unreachable ();
1062 unsigned dim = GOMP_DIM_GANG;
1063 unsigned mask = loop->mask | loop->e_mask;
1064 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1066 while (!(GOMP_DIM_MASK (dim) & mask))
1067 dim++;
1069 oacc_loop_xform_head_tail (loop->heads[ix], dim);
1070 oacc_loop_xform_head_tail (loop->tails[ix], dim);
1072 mask ^= GOMP_DIM_MASK (dim);
1076 if (loop->sibling)
1077 oacc_loop_process (loop->sibling);
1080 /* Walk the OpenACC loop hierarchy checking and assigning the
1081 programmer-specified partitionings. OUTER_MASK is the partitioning
1082 this loop is contained within. Return mask of partitioning
1083 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1084 bit. */
/* The walk recurses into LOOP->child (with OUTER_MASK widened by this
   loop's own axes) and into LOOP->sibling (with OUTER_MASK unchanged).
   On return LOOP->mask, LOOP->e_mask and LOOP->inner have been
   written, and LOOP->flags may have had its auto/axis bits adjusted.  */
1086 static unsigned
1087 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1089 unsigned this_mask = loop->mask;
1090 unsigned mask_all = 0;
1091 bool noisy = true;
1093 #ifdef ACCEL_COMPILER
1094 /* When device_type is supported, we want the device compiler to be
1095 noisy, if the loop parameters are device_type-specific. */
/* NOTE(review): the sentence above says "noisy", yet the code below
   silences diagnostics in the accelerator compiler -- confirm which
   is intended.  */
1096 noisy = false;
1097 #endif
1099 if (!loop->routine)
1101 bool auto_par = (loop->flags & OLF_AUTO) != 0;
1102 bool seq_par = (loop->flags & OLF_SEQ) != 0;
1103 bool tiling = (loop->flags & OLF_TILE) != 0;
/* Extract the axis bits that LOOP->flags holds from bit OLF_DIM_BASE
   upwards, one bit per dimension below GOMP_DIM_MAX.  */
1105 this_mask = ((loop->flags >> OLF_DIM_BASE)
1106 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1108 /* Apply auto partitioning if this is a non-partitioned regular
1109 loop, or (no more than) single axis tiled loop. */
1110 bool maybe_auto
1111 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
/* At most one of "explicit axes", "auto" and "seq" may be given.  */
1113 if ((this_mask != 0) + auto_par + seq_par > 1)
1115 if (noisy)
1116 error_at (loop->loc,
1117 seq_par
1118 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1119 : G_("%<auto%> conflicts with other OpenACC loop "
1120 "specifiers"));
1121 maybe_auto = false;
1122 loop->flags &= ~OLF_AUTO;
1123 if (seq_par)
/* seq wins: forget every explicitly requested axis.  */
1125 loop->flags
1126 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1127 this_mask = 0;
/* An independent loop that may be auto-partitioned is flagged for the
   later auto pass, and the GOMP_DIM_MAX bit tells the caller that such
   a loop exists.  */
1131 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1133 loop->flags |= OLF_AUTO;
1134 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
/* The loop (or routine call) asks for an axis that is already part of
   OUTER_MASK.  */
1138 if (this_mask & outer_mask)
/* Look for the nearest enclosing loop using one of the requested
   axes; OUTER ends up NULL when no enclosing loop does.  */
1140 const oacc_loop *outer;
1141 for (outer = loop->parent; outer; outer = outer->parent)
1142 if ((outer->mask | outer->e_mask) & this_mask)
1143 break;
1145 if (noisy)
1147 if (outer)
1149 error_at (loop->loc,
1150 loop->routine
1151 ? G_("routine call uses same OpenACC parallelism"
1152 " as containing loop")
1153 : G_("inner loop uses same OpenACC parallelism"
1154 " as containing loop"));
1155 inform (outer->loc, "containing loop here");
1157 else
1158 error_at (loop->loc,
1159 loop->routine
1160 ? G_("routine call uses OpenACC parallelism disallowed"
1161 " by containing routine")
1162 : G_("loop uses OpenACC parallelism disallowed"
1163 " by containing routine"));
1165 if (loop->routine)
1166 inform (DECL_SOURCE_LOCATION (loop->routine),
1167 "routine %qD declared here", loop->routine);
/* Drop the axes that clash with the containing partitioning.  */
1169 this_mask &= ~outer_mask;
1171 else
1173 unsigned outermost = least_bit_hwi (this_mask);
1175 if (outermost && outermost <= outer_mask)
1177 if (noisy)
1179 error_at (loop->loc,
1180 "incorrectly nested OpenACC loop parallelism");
/* NOTE(review): this walk compares OUTER->flags (not OUTER->mask) with
   the axis bit OUTERMOST and dereferences OUTER without a NULL check --
   confirm both are intended.  */
1182 const oacc_loop *outer;
1183 for (outer = loop->parent;
1184 outer->flags && outer->flags < outermost;
1185 outer = outer->parent)
1186 continue;
1187 inform (outer->loc, "containing loop here");
1190 this_mask &= ~outermost;
1194 mask_all |= this_mask;
1196 if (loop->flags & OLF_TILE)
1198 /* When tiling, vector goes to the element loop, and failing
1199 that we put worker there. The std doesn't contemplate
1200 specifying all three. We choose to put worker and vector on
1201 the element loops in that case. */
1202 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1203 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1204 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1206 loop->e_mask = this_e_mask;
/* The element-loop axes are removed from the tile loop's own mask.  */
1207 this_mask ^= this_e_mask;
1210 loop->mask = this_mask;
1212 if (dump_file)
1213 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1214 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1215 loop->mask, loop->e_mask);
1217 if (loop->child)
/* Children are contained in everything this loop is contained in plus
   this loop's own tile and element axes.  */
1219 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1220 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1221 mask_all |= loop->inner;
/* Siblings share this loop's containing partitioning.  */
1224 if (loop->sibling)
1225 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1227 return mask_all;
1230 /* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
1231 OUTER_MASK is the partitioning this loop is contained within.
1232 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1233 Return the cumulative partitioning used by this loop, siblings and
1234 children. */
/* Only loops flagged both OLF_AUTO and OLF_INDEPENDENT are assigned
   axes here; for every loop with a child, LOOP->inner is overwritten
   with the result of the recursive call on that child.  */
1236 static unsigned
1237 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1238 bool outer_assign)
1240 bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1241 bool noisy = true;
1242 bool tiling = loop->flags & OLF_TILE;
1244 #ifdef ACCEL_COMPILER
1245 /* When device_type is supported, we want the device compiler to be
1246 noisy, if the loop parameters are device_type-specific. */
/* NOTE(review): the sentence above says "noisy", yet the code below
   silences the warnings in the accelerator compiler -- confirm which
   is intended.  */
1247 noisy = false;
1248 #endif
/* First attempt, made before the children are visited.  */
1250 if (assign && (!outer_assign || loop->inner))
1252 /* Allocate outermost and non-innermost loops at the outermost
1253 non-innermost available level. */
1254 unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1256 /* Find the first outermost available partition. */
1257 while (this_mask <= outer_mask)
1258 this_mask <<= 1;
1260 /* Grab two axes if tiling, and we've not assigned anything */
1261 if (tiling && !(loop->mask | loop->e_mask))
1262 this_mask |= this_mask << 1;
1264 /* Prohibit the innermost partitioning at the moment. */
1265 this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1267 /* Don't use any dimension explicitly claimed by an inner loop. */
1268 this_mask &= ~loop->inner;
1270 if (tiling && !loop->e_mask)
1272 /* If we got two axes, allocate the inner one to the element
1273 loop. */
1274 loop->e_mask = this_mask & (this_mask << 1);
1275 this_mask ^= loop->e_mask;
1278 loop->mask |= this_mask;
1281 if (loop->child)
/* Children see this loop's axes as part of their containing
   partitioning, and learn whether any enclosing loop is being
   auto-partitioned.  */
1283 unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1284 loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1285 outer_assign | assign);
/* Second attempt, made once LOOP->inner reflects the children.  */
1288 if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1290 /* Allocate the loop at the innermost available level. Note
1291 that we do this even if we already assigned this loop the
1292 outermost available level above. That way we'll partition
1293 this along 2 axes, if they are available. */
1294 unsigned this_mask = 0;
1296 /* Determine the outermost partitioning used within this loop. */
/* The GOMP_DIM_MAX bit is OR-ed in so the value is never zero.  */
1297 this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1298 this_mask = least_bit_hwi (this_mask);
1300 /* Pick the partitioning just inside that one. */
1301 this_mask >>= 1;
1303 /* And avoid picking one used by an outer loop. */
1304 this_mask &= ~outer_mask;
1306 /* If tiling and we failed completely above, grab the next one
1307 too. Making sure it doesn't hit an outer loop. */
1308 if (tiling)
1310 this_mask &= ~(loop->e_mask | loop->mask);
1311 unsigned tile_mask = ((this_mask >> 1)
1312 & ~(outer_mask | loop->e_mask | loop->mask));
1314 if (tile_mask || loop->mask)
1316 loop->e_mask |= this_mask;
1317 this_mask = tile_mask;
1319 if (!loop->e_mask && noisy)
1320 warning_at (loop->loc, 0,
1321 "insufficient partitioning available"
1322 " to parallelize element loop");
1325 loop->mask |= this_mask;
1326 if (!loop->mask && noisy)
1327 warning_at (loop->loc, 0,
1328 tiling
1329 ? G_("insufficient partitioning available"
1330 " to parallelize tile loop")
1331 : G_("insufficient partitioning available"
1332 " to parallelize loop"));
1335 if (assign && dump_file)
1336 fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1337 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1338 loop->mask, loop->e_mask);
1340 unsigned inner_mask = 0;
/* Siblings are processed with the same OUTER_MASK and OUTER_ASSIGN
   as this loop.  */
1342 if (loop->sibling)
1343 inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1344 outer_mask, outer_assign);
1346 inner_mask |= loop->inner | loop->mask | loop->e_mask;
1348 return inner_mask;
1351 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1352 axes. Return mask of partitioning. */
1354 static unsigned
1355 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1357 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1359 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1361 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1362 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1364 return mask_all;
1367 /* Default fork/join early expander. Delete the function calls if
1368 there is no RTL expander. */
1370 bool
1371 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1372 const int *ARG_UNUSED (dims), bool is_fork)
1374 if (is_fork)
1375 return targetm.have_oacc_fork ();
1376 else
1377 return targetm.have_oacc_join ();
1380 /* Default goacc.reduction early expander.
1382 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1383 If RES_PTR is not integer-zerop:
1384 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1385 TEARDOWN - emit '*RES_PTR = VAR'
1386 If LHS is not NULL
1387 emit 'LHS = VAR' */
1389 void
1390 default_goacc_reduction (gcall *call)
1392 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1393 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1394 tree lhs = gimple_call_lhs (call);
1395 tree var = gimple_call_arg (call, 2);
1396 gimple_seq seq = NULL;
1398 if (code == IFN_GOACC_REDUCTION_SETUP
1399 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1401 /* Setup and Teardown need to copy from/to the receiver object,
1402 if there is one. */
1403 tree ref_to_res = gimple_call_arg (call, 1);
1405 if (!integer_zerop (ref_to_res))
1407 tree dst = build_simple_mem_ref (ref_to_res);
1408 tree src = var;
1410 if (code == IFN_GOACC_REDUCTION_SETUP)
1412 src = dst;
1413 dst = lhs;
1414 lhs = NULL;
1416 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1420 /* Copy VAR to LHS, if there is an LHS. */
1421 if (lhs)
1422 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1424 gsi_replace_with_seq (&gsi, seq, true);
1427 /* Main entry point for oacc transformations which run on the device
1428 compiler after LTO, so we know what the target device is at this
1429 point (including the host fallback). */
/* Works on current_function_decl / cfun.  Functions without the
   attribute returned by oacc_get_fn_attrib are left untouched.  The
   return value is always 0.  */
1431 static unsigned int
1432 execute_oacc_device_lower ()
1434 tree attrs = oacc_get_fn_attrib (current_function_decl);
1436 if (!attrs)
1437 /* Not an offloaded function. */
1438 return 0;
1440 /* Parse the default dim argument exactly once. */
/* After parsing, flag_openacc_dims is pointed at itself, so the
   comparison below fails on every later invocation.  */
1441 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1443 oacc_parse_default_dims (flag_openacc_dims);
1444 flag_openacc_dims = (char *)&flag_openacc_dims;
1447 bool is_oacc_kernels
1448 = (lookup_attribute ("oacc kernels",
1449 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1450 bool is_oacc_kernels_parallelized
1451 = (lookup_attribute ("oacc kernels parallelized",
1452 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1454 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
1455 kernels, so remove the parallelism dimensions function attributes
1456 potentially set earlier on. */
1457 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
1459 oacc_set_fn_attrib (current_function_decl, NULL, NULL);
/* Re-read the attribute that was just replaced.  */
1460 attrs = oacc_get_fn_attrib (current_function_decl);
1463 /* Discover, partition and process the loops. */
1464 oacc_loop *loops = oacc_loop_discovery ();
1465 int fn_level = oacc_fn_attrib_level (attrs);
1467 if (dump_file)
1469 if (fn_level >= 0)
1470 fprintf (dump_file, "Function is OpenACC routine level %d\n",
1471 fn_level);
1472 else if (is_oacc_kernels)
1473 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1474 (is_oacc_kernels_parallelized
1475 ? "parallelized" : "unparallelized"));
1476 else
1477 fprintf (dump_file, "Function is OpenACC parallel offload\n");
/* For a routine (FN_LEVEL >= 0) every axis below FN_LEVEL counts as
   already used by the containing partitioning.  */
1480 unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
1481 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
1482 /* OpenACC kernels constructs are special: they currently don't use the
1483 generic oacc_loop infrastructure and attribute/dimension processing. */
1484 if (is_oacc_kernels && is_oacc_kernels_parallelized)
1486 /* Parallelized OpenACC kernels constructs use gang parallelism. See
1487 also tree-parloops.c:create_parallel_loop. */
1488 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
/* DIMS is filled in by oacc_validate_dims; it is dumped below and
   handed to the fork/join hook later on.  */
1491 int dims[GOMP_DIM_MAX];
1492 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
1494 if (dump_file)
1496 const char *comma = "Compute dimensions [";
1497 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
1498 fprintf (dump_file, "%s%d", comma, dims[ix]);
1499 fprintf (dump_file, "]\n");
1502 oacc_loop_process (loops);
1503 if (dump_file)
1505 fprintf (dump_file, "OpenACC loops\n");
1506 dump_oacc_loop (dump_file, loops, 0);
1507 fprintf (dump_file, "\n");
1510 /* Offloaded targets may introduce new basic blocks, which require
1511 dominance information to update SSA. */
1512 calculate_dominance_info (CDI_DOMINATORS);
1514 /* Now lower internal loop functions to target-specific code
1515 sequences. */
1516 basic_block bb;
1517 FOR_ALL_BB_FN (bb, cfun)
/* The iterator is advanced explicitly inside the loop body.  */
1518 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1520 gimple *stmt = gsi_stmt (gsi);
1521 if (!is_gimple_call (stmt))
1523 gsi_next (&gsi);
1524 continue;
1527 gcall *call = as_a <gcall *> (stmt);
1528 if (!gimple_call_internal_p (call))
1530 gsi_next (&gsi);
1531 continue;
1534 /* Rewind to allow rescan. */
1535 gsi_prev (&gsi);
/* RESCAN: the call was expanded, so scanning resumes at the statement
   now in the call's position instead of skipping past it.  REMOVE:
   the call is to be deleted or turned into a plain copy.  */
1536 bool rescan = false, remove = false;
1537 enum internal_fn ifn_code = gimple_call_internal_fn (call);
1539 switch (ifn_code)
1541 default: break;
1543 case IFN_GOACC_TILE:
1544 oacc_xform_tile (call);
1545 rescan = true;
1546 break;
1548 case IFN_GOACC_LOOP:
1549 oacc_xform_loop (call);
1550 rescan = true;
1551 break;
1553 case IFN_GOACC_REDUCTION:
1554 /* Mark the function for SSA renaming. */
1555 mark_virtual_operands_for_renaming (cfun);
1557 /* If the level is -1, this ended up being an unused
1558 axis. Handle as a default. */
1559 if (integer_minus_onep (gimple_call_arg (call, 3)))
1560 default_goacc_reduction (call);
1561 else
1562 targetm.goacc.reduction (call);
1563 rescan = true;
1564 break;
1566 case IFN_UNIQUE:
1568 enum ifn_unique_kind kind
1569 = ((enum ifn_unique_kind)
1570 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1572 switch (kind)
1574 default:
1575 break;
1577 case IFN_UNIQUE_OACC_FORK:
1578 case IFN_UNIQUE_OACC_JOIN:
/* Drop the marker when its axis argument is -1 or when the target
   hook returns false.  */
1579 if (integer_minus_onep (gimple_call_arg (call, 2)))
1580 remove = true;
1581 else if (!targetm.goacc.fork_join
1582 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
1583 remove = true;
1584 break;
1586 case IFN_UNIQUE_OACC_HEAD_MARK:
1587 case IFN_UNIQUE_OACC_TAIL_MARK:
1588 remove = true;
1589 break;
1591 break;
1595 if (gsi_end_p (gsi))
1596 /* We rewound past the beginning of the BB. */
1597 gsi = gsi_start_bb (bb);
1598 else
1599 /* Undo the rewind. */
1600 gsi_next (&gsi);
1602 if (remove)
/* Route users of the call's virtual definition to its virtual use
   before the call goes away.  */
1604 if (gimple_vdef (call))
1605 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
1606 if (gimple_call_lhs (call))
1608 /* Propagate the data dependency var. */
1609 gimple *ass = gimple_build_assign (gimple_call_lhs (call),
1610 gimple_call_arg (call, 1));
1611 gsi_replace (&gsi, ass, false);
1613 else
1614 gsi_remove (&gsi, true);
1616 else if (!rescan)
1617 /* If not rescanning, advance over the call. */
1618 gsi_next (&gsi);
1621 free_oacc_loop (loops);
1623 return 0;
1626 /* Default launch dimension validator. Force everything to 1. A
1627 backend that wants to provide larger dimensions must override this
1628 hook. */
1630 bool
1631 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1632 int ARG_UNUSED (fn_level))
1634 bool changed = false;
1636 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1638 if (dims[ix] != 1)
1640 dims[ix] = 1;
1641 changed = true;
1645 return changed;
/* Default dimension bound is unknown on accelerator and 1 on host.
   The axis argument is ignored.  Return 0 (unknown) when built as an
   accelerator compiler (ACCEL_COMPILER defined) and 1 otherwise.  */

int
default_goacc_dim_limit (int /* axis */)
{
#ifdef ACCEL_COMPILER
  return 0;
#else
  return 1;
#endif
}
1660 namespace {
1662 const pass_data pass_data_oacc_device_lower =
1664 GIMPLE_PASS, /* type */
1665 "oaccdevlow", /* name */
1666 OPTGROUP_OMP, /* optinfo_flags */
1667 TV_NONE, /* tv_id */
1668 PROP_cfg, /* properties_required */
1669 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
1670 0, /* properties_destroyed */
1671 0, /* todo_flags_start */
1672 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
1675 class pass_oacc_device_lower : public gimple_opt_pass
1677 public:
1678 pass_oacc_device_lower (gcc::context *ctxt)
1679 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
1682 /* opt_pass methods: */
1683 virtual bool gate (function *) { return flag_openacc; };
1685 virtual unsigned int execute (function *)
1687 return execute_oacc_device_lower ();
1690 }; // class pass_oacc_device_lower
1692 } // anon namespace
1694 gimple_opt_pass *
1695 make_pass_oacc_device_lower (gcc::context *ctxt)
1697 return new pass_oacc_device_lower (ctxt);
1701 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
1702 GOMP_SIMT_ENTER call identifying the privatized variables, which are
1703 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
1704 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
1706 static void
1707 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
1709 gimple *alloc_stmt = gsi_stmt (*gsi);
1710 tree simtrec = gimple_call_lhs (alloc_stmt);
1711 tree simduid = gimple_call_arg (alloc_stmt, 0);
/* The first argument of the ALLOC call must be defined by the
   GOMP_SIMT_ENTER call.  */
1712 gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
1713 gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
/* Build an artificial, nameless, addressable record type and make
   SIMTREC a pointer to it.  */
1714 tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
1715 TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
1716 TREE_ADDRESSABLE (rectype) = 1;
1717 TREE_TYPE (simtrec) = build_pointer_type (rectype);
/* Arguments from index 1 onwards are either null_pointer_node, which
   is skipped, or the address of a variable.  */
1718 for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
1720 tree *argp = gimple_call_arg_ptr (enter_stmt, i);
1721 if (*argp == null_pointer_node)
1722 continue;
1723 gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
1724 && VAR_P (TREE_OPERAND (*argp, 0)));
1725 tree var = TREE_OPERAND (*argp, 0);
/* Create a field that mirrors VAR's name, type, alignment and
   volatility.  */
1727 tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
1728 DECL_NAME (var), TREE_TYPE (var));
1729 SET_DECL_ALIGN (field, DECL_ALIGN (var));
1730 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
1731 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
1733 insert_field_into_struct (rectype, field);
/* Give VAR the value expression '(*SIMTREC).FIELD'.  */
1735 tree t = build_simple_mem_ref (simtrec);
1736 t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
1737 TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
1738 SET_DECL_VALUE_EXPR (var, t);
1739 DECL_HAS_VALUE_EXPR_P (var) = 1;
1740 *regimplify = true;
/* With all fields inserted, lay the record out and read back its size
   and alignment in bytes.  */
1742 layout_type (rectype);
1743 tree size = TYPE_SIZE_UNIT (rectype);
1744 tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
/* Replace the ALLOC call with one that takes the record's size and
   alignment and still defines SIMTREC.  */
1746 alloc_stmt
1747 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
1748 gimple_call_set_lhs (alloc_stmt, simtrec);
1749 gsi_replace (gsi, alloc_stmt, false);
/* Replace the ENTER call with a plain copy of its first argument into
   SIMDUID.  */
1750 gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
1751 enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
1752 gsi_replace (&enter_gsi, enter_stmt, false);
1754 use_operand_p use;
1755 gimple *exit_stmt;
/* If SIMTREC has a single use it must be the GOMP_SIMT_EXIT call; a
   volatile empty-constructor store to '*SIMTREC' is inserted in front
   of it.  Otherwise SIMTREC is expected to have no uses at all.  */
1756 if (single_imm_use (simtrec, &use, &exit_stmt))
1758 gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
1759 gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
1760 tree clobber = build_constructor (rectype, NULL);
1761 TREE_THIS_VOLATILE (clobber) = 1;
1762 exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
1763 gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
1765 else
1766 gcc_checking_assert (has_zero_uses (simtrec));
1769 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
1771 static tree
1772 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
1774 tree t = *tp;
1776 if (VAR_P (t)
1777 && DECL_HAS_VALUE_EXPR_P (t)
1778 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
1780 *walk_subtrees = 0;
1781 return t;
1783 return NULL_TREE;
1786 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
1787 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
1788 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
1789 internal functions on non-SIMT targets, and likewise some SIMD internal
1790 functions on SIMT targets. */
/* Operates on cfun.  The return value is always 0.  */
1792 static unsigned int
1793 execute_omp_device_lower ()
/* VF comes from the targetm.simt.vf hook and is 1 when the target
   provides no such hook.  */
1795 int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
1796 bool regimplify = false;
1797 basic_block bb;
1798 gimple_stmt_iterator gsi;
1799 FOR_EACH_BB_FN (bb, cfun)
1800 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1802 gimple *stmt = gsi_stmt (gsi);
1803 if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
1804 continue;
/* RHS, when set by the switch, is the value that replaces the call's
   result.  TYPE is the type of the LHS, or integer_type_node when the
   call has no LHS.  */
1805 tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
1806 tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
1807 switch (gimple_call_internal_fn (stmt))
1809 case IFN_GOMP_USE_SIMT:
1810 rhs = vf == 1 ? integer_zero_node : integer_one_node;
1811 break;
1812 case IFN_GOMP_SIMT_ENTER:
1813 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1814 goto simtreg_enter_exit;
1815 case IFN_GOMP_SIMT_ENTER_ALLOC:
1816 if (vf != 1)
1817 ompdevlow_adjust_simt_enter (&gsi, &regimplify);
1818 rhs = vf == 1 ? null_pointer_node : NULL_TREE;
1819 goto simtreg_enter_exit;
1820 case IFN_GOMP_SIMT_EXIT:
/* Shared tail for ENTER, ENTER_ALLOC and EXIT: when VF is not 1 the
   statement is left as it is; when VF is 1 its virtual definition is
   unlinked before the replacement below.  */
1821 simtreg_enter_exit:
1822 if (vf != 1)
1823 continue;
1824 unlink_stmt_vdef (stmt);
1825 break;
1826 case IFN_GOMP_SIMT_LANE:
1827 case IFN_GOMP_SIMT_LAST_LANE:
1828 rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
1829 break;
1830 case IFN_GOMP_SIMT_VF:
1831 rhs = build_int_cst (type, vf);
1832 break;
1833 case IFN_GOMP_SIMT_ORDERED_PRED:
1834 rhs = vf == 1 ? integer_zero_node : NULL_TREE;
1835 if (rhs || !lhs)
1836 unlink_stmt_vdef (stmt);
1837 break;
1838 case IFN_GOMP_SIMT_VOTE_ANY:
1839 case IFN_GOMP_SIMT_XCHG_BFLY:
1840 case IFN_GOMP_SIMT_XCHG_IDX:
1841 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1842 break;
/* The SIMD placeholders are folded in the opposite case, i.e. only
   when VF is not 1.  */
1843 case IFN_GOMP_SIMD_LANE:
1844 case IFN_GOMP_SIMD_LAST_LANE:
1845 rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
1846 break;
1847 case IFN_GOMP_SIMD_VF:
1848 rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
1849 break;
1850 default:
1851 continue;
/* A call whose result is used but for which no replacement value was
   chosen stays in place.  */
1853 if (lhs && !rhs)
1854 continue;
/* Otherwise the call becomes 'LHS = RHS', or a nop when it has no
   LHS.  */
1855 stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
1856 gsi_replace (&gsi, stmt, false);
/* When privatized variables were given value expressions above, visit
   every statement, last to first within each block, that mentions such
   a variable: clobbers are removed, anything else has its operands
   regimplified.  */
1858 if (regimplify)
1859 FOR_EACH_BB_REVERSE_FN (bb, cfun)
1860 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1861 if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
1863 if (gimple_clobber_p (gsi_stmt (gsi)))
1864 gsi_remove (&gsi, true);
1865 else
1866 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
1868 if (vf != 1)
1869 cfun->has_force_vectorize_loops = false;
1870 return 0;
1873 namespace {
1875 const pass_data pass_data_omp_device_lower =
1877 GIMPLE_PASS, /* type */
1878 "ompdevlow", /* name */
1879 OPTGROUP_OMP, /* optinfo_flags */
1880 TV_NONE, /* tv_id */
1881 PROP_cfg, /* properties_required */
1882 PROP_gimple_lomp_dev, /* properties_provided */
1883 0, /* properties_destroyed */
1884 0, /* todo_flags_start */
1885 TODO_update_ssa, /* todo_flags_finish */
1888 class pass_omp_device_lower : public gimple_opt_pass
1890 public:
1891 pass_omp_device_lower (gcc::context *ctxt)
1892 : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
1895 /* opt_pass methods: */
1896 virtual bool gate (function *fun)
1898 return !(fun->curr_properties & PROP_gimple_lomp_dev);
1900 virtual unsigned int execute (function *)
1902 return execute_omp_device_lower ();
1905 }; // class pass_expand_omp_ssa
1907 } // anon namespace
1909 gimple_opt_pass *
1910 make_pass_omp_device_lower (gcc::context *ctxt)
1912 return new pass_omp_device_lower (ctxt);
1915 /* "omp declare target link" handling pass. */
1917 namespace {
1919 const pass_data pass_data_omp_target_link =
1921 GIMPLE_PASS, /* type */
1922 "omptargetlink", /* name */
1923 OPTGROUP_OMP, /* optinfo_flags */
1924 TV_NONE, /* tv_id */
1925 PROP_ssa, /* properties_required */
1926 0, /* properties_provided */
1927 0, /* properties_destroyed */
1928 0, /* todo_flags_start */
1929 TODO_update_ssa, /* todo_flags_finish */
1932 class pass_omp_target_link : public gimple_opt_pass
1934 public:
1935 pass_omp_target_link (gcc::context *ctxt)
1936 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
1939 /* opt_pass methods: */
1940 virtual bool gate (function *fun)
1942 #ifdef ACCEL_COMPILER
1943 tree attrs = DECL_ATTRIBUTES (fun->decl);
1944 return lookup_attribute ("omp declare target", attrs)
1945 || lookup_attribute ("omp target entrypoint", attrs);
1946 #else
1947 (void) fun;
1948 return false;
1949 #endif
1952 virtual unsigned execute (function *);
1955 /* Callback for walk_gimple_stmt used to scan for link var operands. */
1957 static tree
1958 find_link_var_op (tree *tp, int *walk_subtrees, void *)
1960 tree t = *tp;
1962 if (VAR_P (t)
1963 && DECL_HAS_VALUE_EXPR_P (t)
1964 && is_global_var (t)
1965 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
1967 *walk_subtrees = 0;
1968 return t;
1971 return NULL_TREE;
1974 unsigned
1975 pass_omp_target_link::execute (function *fun)
1977 basic_block bb;
1978 FOR_EACH_BB_FN (bb, fun)
1980 gimple_stmt_iterator gsi;
1981 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1982 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
1983 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
1986 return 0;
1989 } // anon namespace
1991 gimple_opt_pass *
1992 make_pass_omp_target_link (gcc::context *ctxt)
1994 return new pass_omp_target_link (ctxt);