[30/77] Use scalar_int_mode for doubleword splits
[official-gcc.git] / gcc / omp-offload.c
blob2d4fd41168054a5894fe346a5c8ea2d12bed9de6
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
4 Copyright (C) 2005-2017 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
55 /* Describe the OpenACC looping structure of a function. The entire
56 function is held in a 'NULL' loop. */
58 struct oacc_loop
60 oacc_loop *parent; /* Containing loop. */
62 oacc_loop *child; /* First inner loop. */
64 oacc_loop *sibling; /* Next loop within same parent. */
66 location_t loc; /* Location of the loop start. */
68 gcall *marker; /* Initial head marker. */
70 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
71 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
73 tree routine; /* Pseudo-loop enclosing a routine. */
75 unsigned mask; /* Partitioning mask. */
76 unsigned e_mask; /* Partitioning of element loops (when tiling). */
77 unsigned inner; /* Partitioning of inner loops. */
78 unsigned flags; /* Partitioning flags. */
79 vec<gcall *> ifns; /* Contained loop abstraction functions. */
80 tree chunk_size; /* Chunk size. */
81 gcall *head_end; /* Final marker of head sequence. */
84 /* Holds offload tables with decls. */
85 vec<tree, va_gc> *offload_funcs, *offload_vars;
87 /* Return level at which oacc routine may spawn a partitioned loop, or
88 -1 if it is not a routine (i.e. is an offload fn). */
90 static int
91 oacc_fn_attrib_level (tree attr)
93 tree pos = TREE_VALUE (attr);
95 if (!TREE_PURPOSE (pos))
96 return -1;
98 int ix = 0;
99 for (ix = 0; ix != GOMP_DIM_MAX;
100 ix++, pos = TREE_CHAIN (pos))
101 if (!integer_zerop (TREE_PURPOSE (pos)))
102 break;
104 return ix;
107 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
108 adds their addresses and sizes to constructor-vector V_CTOR. */
110 static void
111 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
112 vec<constructor_elt, va_gc> *v_ctor)
114 unsigned len = vec_safe_length (v_decls);
115 for (unsigned i = 0; i < len; i++)
117 tree it = (*v_decls)[i];
118 bool is_var = VAR_P (it);
119 bool is_link_var
120 = is_var
121 #ifdef ACCEL_COMPILER
122 && DECL_HAS_VALUE_EXPR_P (it)
123 #endif
124 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
126 tree size = NULL_TREE;
127 if (is_var)
128 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
130 tree addr;
131 if (!is_link_var)
132 addr = build_fold_addr_expr (it);
133 else
135 #ifdef ACCEL_COMPILER
136 /* For "omp declare target link" vars add address of the pointer to
137 the target table, instead of address of the var. */
138 tree value_expr = DECL_VALUE_EXPR (it);
139 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
140 varpool_node::finalize_decl (link_ptr_decl);
141 addr = build_fold_addr_expr (link_ptr_decl);
142 #else
143 addr = build_fold_addr_expr (it);
144 #endif
146 /* Most significant bit of the size marks "omp declare target link"
147 vars in host and target tables. */
148 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
149 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
150 * BITS_PER_UNIT - 1);
151 size = wide_int_to_tree (const_ptr_type_node, isize);
154 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
155 if (is_var)
156 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
160 /* Create new symbols containing (address, size) pairs for global variables,
161 marked with "omp declare target" attribute, as well as addresses for the
162 functions, which are outlined offloading regions. */
163 void
164 omp_finish_file (void)
166 unsigned num_funcs = vec_safe_length (offload_funcs);
167 unsigned num_vars = vec_safe_length (offload_vars);
169 if (num_funcs == 0 && num_vars == 0)
170 return;
172 if (targetm_common.have_named_sections)
174 vec<constructor_elt, va_gc> *v_f, *v_v;
175 vec_alloc (v_f, num_funcs);
176 vec_alloc (v_v, num_vars * 2);
178 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
179 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
181 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
182 num_vars * 2);
183 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
184 num_funcs);
185 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
186 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
187 tree ctor_v = build_constructor (vars_decl_type, v_v);
188 tree ctor_f = build_constructor (funcs_decl_type, v_f);
189 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
190 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
191 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
192 get_identifier (".offload_func_table"),
193 funcs_decl_type);
194 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
195 get_identifier (".offload_var_table"),
196 vars_decl_type);
197 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
198 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
199 otherwise a joint table in a binary will contain padding between
200 tables from multiple object files. */
201 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
202 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
203 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
204 DECL_INITIAL (funcs_decl) = ctor_f;
205 DECL_INITIAL (vars_decl) = ctor_v;
206 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
207 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
209 varpool_node::finalize_decl (vars_decl);
210 varpool_node::finalize_decl (funcs_decl);
212 else
214 for (unsigned i = 0; i < num_funcs; i++)
216 tree it = (*offload_funcs)[i];
217 targetm.record_offload_symbol (it);
219 for (unsigned i = 0; i < num_vars; i++)
221 tree it = (*offload_vars)[i];
222 targetm.record_offload_symbol (it);
227 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
228 axis DIM. Return a tmp var holding the result. */
230 static tree
231 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
233 tree arg = build_int_cst (unsigned_type_node, dim);
234 tree size = create_tmp_var (integer_type_node);
235 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
236 gimple *call = gimple_build_call_internal (fn, 1, arg);
238 gimple_call_set_lhs (call, size);
239 gimple_seq_add_stmt (seq, call);
241 return size;
244 /* Find the number of threads (POS = false), or thread number (POS =
245 true) for an OpenACC region partitioned as MASK. Setup code
246 required for the calculation is added to SEQ. */
248 static tree
249 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
251 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
252 unsigned ix;
254 /* Start at gang level, and examine relevant dimension indices. */
255 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
256 if (GOMP_DIM_MASK (ix) & mask)
258 if (res)
260 /* We had an outer index, so scale that by the size of
261 this dimension. */
262 tree n = oacc_dim_call (false, ix, seq);
263 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
265 if (pos)
267 /* Determine index in this dimension. */
268 tree id = oacc_dim_call (true, ix, seq);
269 if (res)
270 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
271 else
272 res = id;
276 if (res == NULL_TREE)
277 res = integer_zero_node;
279 return res;
282 /* Transform IFN_GOACC_LOOP calls to actual code. See
283 expand_oacc_for for where these are generated. At the vector
284 level, we stride loops, such that each member of a warp will
285 operate on adjacent iterations. At the worker and gang level,
286 each gang/warp executes a set of contiguous iterations. Chunking
287 can override this such that each iteration engine executes a
288 contiguous chunk, and then moves on to stride to the next chunk. */
290 static void
291 oacc_xform_loop (gcall *call)
293 gimple_stmt_iterator gsi = gsi_for_stmt (call);
294 enum ifn_goacc_loop_kind code
295 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
296 tree dir = gimple_call_arg (call, 1);
297 tree range = gimple_call_arg (call, 2);
298 tree step = gimple_call_arg (call, 3);
299 tree chunk_size = NULL_TREE;
300 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
301 tree lhs = gimple_call_lhs (call);
302 tree type = TREE_TYPE (lhs);
303 tree diff_type = TREE_TYPE (range);
304 tree r = NULL_TREE;
305 gimple_seq seq = NULL;
306 bool chunking = false, striding = true;
307 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
308 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
310 #ifdef ACCEL_COMPILER
311 chunk_size = gimple_call_arg (call, 4);
312 if (integer_minus_onep (chunk_size) /* Force static allocation. */
313 || integer_zerop (chunk_size)) /* Default (also static). */
315 /* If we're at the gang level, we want each to execute a
316 contiguous run of iterations. Otherwise we want each element
317 to stride. */
318 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
319 chunking = false;
321 else
323 /* Chunk of size 1 is striding. */
324 striding = integer_onep (chunk_size);
325 chunking = !striding;
327 #endif
329 /* striding=true, chunking=true
330 -> invalid.
331 striding=true, chunking=false
332 -> chunks=1
333 striding=false,chunking=true
334 -> chunks=ceil (range/(chunksize*threads*step))
335 striding=false,chunking=false
336 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
337 push_gimplify_context (true);
339 switch (code)
341 default: gcc_unreachable ();
343 case IFN_GOACC_LOOP_CHUNKS:
344 if (!chunking)
345 r = build_int_cst (type, 1);
346 else
348 /* chunk_max
349 = (range - dir) / (chunks * step * num_threads) + dir */
350 tree per = oacc_thread_numbers (false, mask, &seq);
351 per = fold_convert (type, per);
352 chunk_size = fold_convert (type, chunk_size);
353 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
354 per = fold_build2 (MULT_EXPR, type, per, step);
355 r = build2 (MINUS_EXPR, type, range, dir);
356 r = build2 (PLUS_EXPR, type, r, per);
357 r = build2 (TRUNC_DIV_EXPR, type, r, per);
359 break;
361 case IFN_GOACC_LOOP_STEP:
363 /* If striding, step by the entire compute volume, otherwise
364 step by the inner volume. */
365 unsigned volume = striding ? mask : inner_mask;
367 r = oacc_thread_numbers (false, volume, &seq);
368 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
370 break;
372 case IFN_GOACC_LOOP_OFFSET:
373 if (striding)
375 r = oacc_thread_numbers (true, mask, &seq);
376 r = fold_convert (diff_type, r);
378 else
380 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
381 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
382 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
383 inner_size, outer_size);
385 volume = fold_convert (diff_type, volume);
386 if (chunking)
387 chunk_size = fold_convert (diff_type, chunk_size);
388 else
390 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
392 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
393 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
394 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
397 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
398 fold_convert (diff_type, inner_size));
399 r = oacc_thread_numbers (true, outer_mask, &seq);
400 r = fold_convert (diff_type, r);
401 r = build2 (MULT_EXPR, diff_type, r, span);
403 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
404 inner = fold_convert (diff_type, inner);
405 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
407 if (chunking)
409 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
410 tree per
411 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
412 per = build2 (MULT_EXPR, diff_type, per, chunk);
414 r = build2 (PLUS_EXPR, diff_type, r, per);
417 r = fold_build2 (MULT_EXPR, diff_type, r, step);
418 if (type != diff_type)
419 r = fold_convert (type, r);
420 break;
422 case IFN_GOACC_LOOP_BOUND:
423 if (striding)
424 r = range;
425 else
427 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
428 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
429 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
430 inner_size, outer_size);
432 volume = fold_convert (diff_type, volume);
433 if (chunking)
434 chunk_size = fold_convert (diff_type, chunk_size);
435 else
437 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
439 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
440 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
441 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
444 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
445 fold_convert (diff_type, inner_size));
447 r = fold_build2 (MULT_EXPR, diff_type, span, step);
449 tree offset = gimple_call_arg (call, 6);
450 r = build2 (PLUS_EXPR, diff_type, r,
451 fold_convert (diff_type, offset));
452 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
453 diff_type, r, range);
455 if (diff_type != type)
456 r = fold_convert (type, r);
457 break;
460 gimplify_assign (lhs, r, &seq);
462 pop_gimplify_context (NULL);
464 gsi_replace_with_seq (&gsi, seq, true);
467 /* Transform a GOACC_TILE call. Determines the element loop span for
468 the specified loop of the nest. This is 1 if we're not tiling.
470 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
472 static void
473 oacc_xform_tile (gcall *call)
475 gimple_stmt_iterator gsi = gsi_for_stmt (call);
476 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
477 /* Inner loops have higher loop_nos. */
478 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
479 tree tile_size = gimple_call_arg (call, 2);
480 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
481 tree lhs = gimple_call_lhs (call);
482 tree type = TREE_TYPE (lhs);
483 gimple_seq seq = NULL;
484 tree span = build_int_cst (type, 1);
486 gcc_assert (!(e_mask
487 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
488 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
489 push_gimplify_context (!seen_error ());
491 #ifndef ACCEL_COMPILER
492 /* Partitioning disabled on host compilers. */
493 e_mask = 0;
494 #endif
495 if (!e_mask)
496 /* Not paritioning. */
497 span = integer_one_node;
498 else if (!integer_zerop (tile_size))
499 /* User explicitly specified size. */
500 span = tile_size;
501 else
503 /* Pick a size based on the paritioning of the element loop and
504 the number of loop nests. */
505 tree first_size = NULL_TREE;
506 tree second_size = NULL_TREE;
508 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
509 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
510 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
511 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
513 if (!first_size)
515 first_size = second_size;
516 second_size = NULL_TREE;
519 if (loop_no + 1 == collapse)
521 span = first_size;
522 if (!loop_no && second_size)
523 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
524 span, second_size);
526 else if (loop_no + 2 == collapse)
527 span = second_size;
528 else
529 span = NULL_TREE;
531 if (!span)
532 /* There's no obvious element size for this loop. Options
533 are 1, first_size or some non-unity constant (32 is my
534 favourite). We should gather some statistics. */
535 span = first_size;
538 span = fold_convert (type, span);
539 gimplify_assign (lhs, span, &seq);
541 pop_gimplify_context (NULL);
543 gsi_replace_with_seq (&gsi, seq, true);
546 /* Default partitioned and minimum partitioned dimensions. */
548 static int oacc_default_dims[GOMP_DIM_MAX];
549 static int oacc_min_dims[GOMP_DIM_MAX];
551 /* Parse the default dimension parameter. This is a set of
552 :-separated optional compute dimensions. Each specified dimension
553 is a positive integer. When device type support is added, it is
554 planned to be a comma separated list of such compute dimensions,
555 with all but the first prefixed by the colon-terminated device
556 type. */
558 static void
559 oacc_parse_default_dims (const char *dims)
561 int ix;
563 for (ix = GOMP_DIM_MAX; ix--;)
565 oacc_default_dims[ix] = -1;
566 oacc_min_dims[ix] = 1;
569 #ifndef ACCEL_COMPILER
570 /* Cannot be overridden on the host. */
571 dims = NULL;
572 #endif
573 if (dims)
575 const char *pos = dims;
577 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
579 if (ix)
581 if (*pos != ':')
582 goto malformed;
583 pos++;
586 if (*pos != ':')
588 long val;
589 const char *eptr;
591 errno = 0;
592 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
593 if (errno || val <= 0 || (int) val != val)
594 goto malformed;
595 pos = eptr;
596 oacc_default_dims[ix] = (int) val;
599 if (*pos)
601 malformed:
602 error_at (UNKNOWN_LOCATION,
603 "-fopenacc-dim operand is malformed at '%s'", pos);
607 /* Allow the backend to validate the dimensions. */
608 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
609 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
612 /* Validate and update the dimensions for offloaded FN. ATTRS is the
613 raw attribute. DIMS is an array of dimensions, which is filled in.
614 LEVEL is the partitioning level of a routine, or -1 for an offload
615 region itself. USED is the mask of partitioned execution in the
616 function. */
618 static void
619 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
621 tree purpose[GOMP_DIM_MAX];
622 unsigned ix;
623 tree pos = TREE_VALUE (attrs);
625 /* Make sure the attribute creator attached the dimension
626 information. */
627 gcc_assert (pos);
629 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
631 purpose[ix] = TREE_PURPOSE (pos);
632 tree val = TREE_VALUE (pos);
633 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
634 pos = TREE_CHAIN (pos);
637 bool changed = targetm.goacc.validate_dims (fn, dims, level);
639 /* Default anything left to 1 or a partitioned default. */
640 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
641 if (dims[ix] < 0)
643 /* The OpenACC spec says 'If the [num_gangs] clause is not
644 specified, an implementation-defined default will be used;
645 the default may depend on the code within the construct.'
646 (2.5.6). Thus an implementation is free to choose
647 non-unity default for a parallel region that doesn't have
648 any gang-partitioned loops. However, it appears that there
649 is a sufficient body of user code that expects non-gang
650 partitioned regions to not execute in gang-redundant mode.
651 So we (a) don't warn about the non-portability and (b) pick
652 the minimum permissible dimension size when there is no
653 partitioned execution. Otherwise we pick the global
654 default for the dimension, which the user can control. The
655 same wording and logic applies to num_workers and
656 vector_length, however the worker- or vector- single
657 execution doesn't have the same impact as gang-redundant
658 execution. (If the minimum gang-level partioning is not 1,
659 the target is probably too confusing.) */
660 dims[ix] = (used & GOMP_DIM_MASK (ix)
661 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
662 changed = true;
665 if (changed)
667 /* Replace the attribute with new values. */
668 pos = NULL_TREE;
669 for (ix = GOMP_DIM_MAX; ix--;)
670 pos = tree_cons (purpose[ix],
671 build_int_cst (integer_type_node, dims[ix]), pos);
672 oacc_replace_fn_attrib (fn, pos);
676 /* Create an empty OpenACC loop structure at LOC. */
678 static oacc_loop *
679 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
681 oacc_loop *loop = XCNEW (oacc_loop);
683 loop->parent = parent;
685 if (parent)
687 loop->sibling = parent->child;
688 parent->child = loop;
691 loop->loc = loc;
692 return loop;
695 /* Create an outermost, dummy OpenACC loop for offloaded function
696 DECL. */
698 static oacc_loop *
699 new_oacc_loop_outer (tree decl)
701 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
704 /* Start a new OpenACC loop structure beginning at head marker HEAD.
705 Link into PARENT loop. Return the new loop. */
707 static oacc_loop *
708 new_oacc_loop (oacc_loop *parent, gcall *marker)
710 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
712 loop->marker = marker;
714 /* TODO: This is where device_type flattening would occur for the loop
715 flags. */
717 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
719 tree chunk_size = integer_zero_node;
720 if (loop->flags & OLF_GANG_STATIC)
721 chunk_size = gimple_call_arg (marker, 4);
722 loop->chunk_size = chunk_size;
724 return loop;
727 /* Create a dummy loop encompassing a call to a openACC routine.
728 Extract the routine's partitioning requirements. */
730 static void
731 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
733 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
734 int level = oacc_fn_attrib_level (attrs);
736 gcc_assert (level >= 0);
738 loop->marker = call;
739 loop->routine = decl;
740 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
741 ^ (GOMP_DIM_MASK (level) - 1));
744 /* Finish off the current OpenACC loop ending at tail marker TAIL.
745 Return the parent loop. */
747 static oacc_loop *
748 finish_oacc_loop (oacc_loop *loop)
750 /* If the loop has been collapsed, don't partition it. */
751 if (loop->ifns.is_empty ())
752 loop->mask = loop->flags = 0;
753 return loop->parent;
756 /* Free all OpenACC loop structures within LOOP (inclusive). */
758 static void
759 free_oacc_loop (oacc_loop *loop)
761 if (loop->sibling)
762 free_oacc_loop (loop->sibling);
763 if (loop->child)
764 free_oacc_loop (loop->child);
766 loop->ifns.release ();
767 free (loop);
770 /* Dump out the OpenACC loop head or tail beginning at FROM. */
772 static void
773 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
774 const char *title, int level)
776 enum ifn_unique_kind kind
777 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
779 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
780 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
782 gimple *stmt = gsi_stmt (gsi);
784 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
786 enum ifn_unique_kind k
787 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
788 (gimple_call_arg (stmt, 0)));
790 if (k == kind && stmt != from)
791 break;
793 print_gimple_stmt (file, stmt, depth * 2 + 2);
795 gsi_next (&gsi);
796 while (gsi_end_p (gsi))
797 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
801 /* Dump OpenACC loops LOOP, its siblings and its children. */
803 static void
804 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
806 int ix;
808 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
809 loop->flags, loop->mask,
810 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
812 if (loop->marker)
813 print_gimple_stmt (file, loop->marker, depth * 2);
815 if (loop->routine)
816 fprintf (file, "%*sRoutine %s:%u:%s\n",
817 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
818 DECL_SOURCE_LINE (loop->routine),
819 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
821 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
822 if (loop->heads[ix])
823 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
824 for (ix = GOMP_DIM_MAX; ix--;)
825 if (loop->tails[ix])
826 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
828 if (loop->child)
829 dump_oacc_loop (file, loop->child, depth + 1);
830 if (loop->sibling)
831 dump_oacc_loop (file, loop->sibling, depth);
834 void debug_oacc_loop (oacc_loop *);
836 /* Dump loops to stderr. */
838 DEBUG_FUNCTION void
839 debug_oacc_loop (oacc_loop *loop)
841 dump_oacc_loop (stderr, loop, 0);
844 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
845 structures as we go. By construction these loops are properly
846 nested. */
848 static void
849 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
851 int marker = 0;
852 int remaining = 0;
854 if (bb->flags & BB_VISITED)
855 return;
857 follow:
858 bb->flags |= BB_VISITED;
860 /* Scan for loop markers. */
861 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
862 gsi_next (&gsi))
864 gimple *stmt = gsi_stmt (gsi);
866 if (!is_gimple_call (stmt))
867 continue;
869 gcall *call = as_a <gcall *> (stmt);
871 /* If this is a routine, make a dummy loop for it. */
872 if (tree decl = gimple_call_fndecl (call))
873 if (tree attrs = oacc_get_fn_attrib (decl))
875 gcc_assert (!marker);
876 new_oacc_loop_routine (loop, call, decl, attrs);
879 if (!gimple_call_internal_p (call))
880 continue;
882 switch (gimple_call_internal_fn (call))
884 default:
885 break;
887 case IFN_GOACC_LOOP:
888 case IFN_GOACC_TILE:
889 /* Record the abstraction function, so we can manipulate it
890 later. */
891 loop->ifns.safe_push (call);
892 break;
894 case IFN_UNIQUE:
895 enum ifn_unique_kind kind
896 = (enum ifn_unique_kind) (TREE_INT_CST_LOW
897 (gimple_call_arg (call, 0)));
898 if (kind == IFN_UNIQUE_OACC_HEAD_MARK
899 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
901 if (gimple_call_num_args (call) == 2)
903 gcc_assert (marker && !remaining);
904 marker = 0;
905 if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
906 loop = finish_oacc_loop (loop);
907 else
908 loop->head_end = call;
910 else
912 int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
914 if (!marker)
916 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
917 loop = new_oacc_loop (loop, call);
918 remaining = count;
920 gcc_assert (count == remaining);
921 if (remaining)
923 remaining--;
924 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
925 loop->heads[marker] = call;
926 else
927 loop->tails[remaining] = call;
929 marker++;
934 if (remaining || marker)
936 bb = single_succ (bb);
937 gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
938 goto follow;
941 /* Walk successor blocks. */
942 edge e;
943 edge_iterator ei;
945 FOR_EACH_EDGE (e, ei, bb->succs)
946 oacc_loop_discover_walk (loop, e->dest);
949 /* LOOP is the first sibling. Reverse the order in place and return
950 the new first sibling. Recurse to child loops. */
952 static oacc_loop *
953 oacc_loop_sibling_nreverse (oacc_loop *loop)
955 oacc_loop *last = NULL;
958 if (loop->child)
959 loop->child = oacc_loop_sibling_nreverse (loop->child);
961 oacc_loop *next = loop->sibling;
962 loop->sibling = last;
963 last = loop;
964 loop = next;
966 while (loop);
968 return last;
971 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
972 the current function. */
974 static oacc_loop *
975 oacc_loop_discovery ()
977 /* Clear basic block flags, in particular BB_VISITED which we're going to use
978 in the following. */
979 clear_bb_flags ();
981 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
982 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
984 /* The siblings were constructed in reverse order, reverse them so
985 that diagnostics come out in an unsurprising order. */
986 top = oacc_loop_sibling_nreverse (top);
988 return top;
991 /* Transform the abstract internal function markers starting at FROM
992 to be for partitioning level LEVEL. Stop when we meet another HEAD
993 or TAIL marker. */
995 static void
996 oacc_loop_xform_head_tail (gcall *from, int level)
998 enum ifn_unique_kind kind
999 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1000 tree replacement = build_int_cst (unsigned_type_node, level);
1002 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1004 gimple *stmt = gsi_stmt (gsi);
1006 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1008 enum ifn_unique_kind k
1009 = ((enum ifn_unique_kind)
1010 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1012 if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
1013 *gimple_call_arg_ptr (stmt, 2) = replacement;
1014 else if (k == kind && stmt != from)
1015 break;
1017 else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1018 *gimple_call_arg_ptr (stmt, 3) = replacement;
1020 gsi_next (&gsi);
1021 while (gsi_end_p (gsi))
1022 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1026 /* Process the discovered OpenACC loops, setting the correct
1027 partitioning level etc. */
1029 static void
1030 oacc_loop_process (oacc_loop *loop)
1032 if (loop->child)
1033 oacc_loop_process (loop->child);
1035 if (loop->mask && !loop->routine)
1037 int ix;
1038 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1039 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1040 tree chunk_arg = loop->chunk_size;
1041 gcall *call;
1043 for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1044 switch (gimple_call_internal_fn (call))
1046 case IFN_GOACC_LOOP:
1048 bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1049 gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1050 if (!is_e)
1051 gimple_call_set_arg (call, 4, chunk_arg);
1053 break;
1055 case IFN_GOACC_TILE:
1056 gimple_call_set_arg (call, 3, mask_arg);
1057 gimple_call_set_arg (call, 4, e_mask_arg);
1058 break;
1060 default:
1061 gcc_unreachable ();
1064 unsigned dim = GOMP_DIM_GANG;
1065 unsigned mask = loop->mask | loop->e_mask;
1066 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1068 while (!(GOMP_DIM_MASK (dim) & mask))
1069 dim++;
1071 oacc_loop_xform_head_tail (loop->heads[ix], dim);
1072 oacc_loop_xform_head_tail (loop->tails[ix], dim);
1074 mask ^= GOMP_DIM_MASK (dim);
1078 if (loop->sibling)
1079 oacc_loop_process (loop->sibling);
1082 /* Walk the OpenACC loop hierarchy checking and assigning the
1083 programmer-specified partitionings. OUTER_MASK is the partitioning
1084 this loop is contained within. Return mask of partitioning
1085 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1086 bit. */
1088 static unsigned
1089 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1091 unsigned this_mask = loop->mask;
1092 unsigned mask_all = 0;
1093 bool noisy = true;
1095 #ifdef ACCEL_COMPILER
1096 /* When device_type is supported, we want the device compiler to be
1097 noisy, if the loop parameters are device_type-specific. */
1098 noisy = false;
1099 #endif
1101 if (!loop->routine)
1103 bool auto_par = (loop->flags & OLF_AUTO) != 0;
1104 bool seq_par = (loop->flags & OLF_SEQ) != 0;
1105 bool tiling = (loop->flags & OLF_TILE) != 0;
1107 this_mask = ((loop->flags >> OLF_DIM_BASE)
1108 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1110 /* Apply auto partitioning if this is a non-partitioned regular
1111 loop, or (no more than) single axis tiled loop. */
1112 bool maybe_auto
1113 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
1115 if ((this_mask != 0) + auto_par + seq_par > 1)
1117 if (noisy)
1118 error_at (loop->loc,
1119 seq_par
1120 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1121 : G_("%<auto%> conflicts with other OpenACC loop "
1122 "specifiers"));
1123 maybe_auto = false;
1124 loop->flags &= ~OLF_AUTO;
1125 if (seq_par)
1127 loop->flags
1128 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1129 this_mask = 0;
1133 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1135 loop->flags |= OLF_AUTO;
1136 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
1140 if (this_mask & outer_mask)
1142 const oacc_loop *outer;
1143 for (outer = loop->parent; outer; outer = outer->parent)
1144 if ((outer->mask | outer->e_mask) & this_mask)
1145 break;
1147 if (noisy)
1149 if (outer)
1151 error_at (loop->loc,
1152 loop->routine
1153 ? G_("routine call uses same OpenACC parallelism"
1154 " as containing loop")
1155 : G_("inner loop uses same OpenACC parallelism"
1156 " as containing loop"));
1157 inform (outer->loc, "containing loop here");
1159 else
1160 error_at (loop->loc,
1161 loop->routine
1162 ? G_("routine call uses OpenACC parallelism disallowed"
1163 " by containing routine")
1164 : G_("loop uses OpenACC parallelism disallowed"
1165 " by containing routine"));
1167 if (loop->routine)
1168 inform (DECL_SOURCE_LOCATION (loop->routine),
1169 "routine %qD declared here", loop->routine);
1171 this_mask &= ~outer_mask;
1173 else
1175 unsigned outermost = least_bit_hwi (this_mask);
1177 if (outermost && outermost <= outer_mask)
1179 if (noisy)
1181 error_at (loop->loc,
1182 "incorrectly nested OpenACC loop parallelism");
1184 const oacc_loop *outer;
1185 for (outer = loop->parent;
1186 outer->flags && outer->flags < outermost;
1187 outer = outer->parent)
1188 continue;
1189 inform (outer->loc, "containing loop here");
1192 this_mask &= ~outermost;
1196 mask_all |= this_mask;
1198 if (loop->flags & OLF_TILE)
1200 /* When tiling, vector goes to the element loop, and failing
1201 that we put worker there. The std doesn't contemplate
1202 specifying all three. We choose to put worker and vector on
1203 the element loops in that case. */
1204 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1205 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1206 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1208 loop->e_mask = this_e_mask;
1209 this_mask ^= this_e_mask;
1212 loop->mask = this_mask;
1214 if (dump_file)
1215 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1216 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1217 loop->mask, loop->e_mask);
1219 if (loop->child)
1221 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1222 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1223 mask_all |= loop->inner;
1226 if (loop->sibling)
1227 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1229 return mask_all;
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */

static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  /* Only 'auto' loops that are also 'independent' get partitioned.  */
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition.  */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything.  */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop.  */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
					       outer_assign | assign);
    }

  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one used by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      /* The outer axis goes to the element loop; the tile loop
		 takes the one just outside it.  */
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  /* Accumulate the partitioning used by this loop, its siblings and
     children for the caller.  */
  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
					     outer_mask, outer_assign);

  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
1353 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1354 axes. Return mask of partitioning. */
1356 static unsigned
1357 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1359 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1361 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1363 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1364 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1366 return mask_all;
1369 /* Default fork/join early expander. Delete the function calls if
1370 there is no RTL expander. */
1372 bool
1373 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1374 const int *ARG_UNUSED (dims), bool is_fork)
1376 if (is_fork)
1377 return targetm.have_oacc_fork ();
1378 else
1379 return targetm.have_oacc_join ();
/* Default goacc.reduction early expander.

   LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
   If RES_PTR is not integer-zerop:
	SETUP - emit 'LHS = *RES_PTR', LHS = NULL
	TEARDOWN - emit '*RES_PTR = VAR'
   If LHS is not NULL
	emit 'LHS = VAR'   */

void
default_goacc_reduction (gcall *call)
{
  /* Arg 0 is the reduction kind (setup/init/fini/teardown).  */
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  gimple_seq seq = NULL;

  if (code == IFN_GOACC_REDUCTION_SETUP
      || code == IFN_GOACC_REDUCTION_TEARDOWN)
    {
      /* Setup and Teardown need to copy from/to the receiver object,
	 if there is one.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	{
	  tree dst = build_simple_mem_ref (ref_to_res);
	  tree src = var;

	  if (code == IFN_GOACC_REDUCTION_SETUP)
	    {
	      /* SETUP copies *RES_PTR into LHS; the LHS is consumed
		 here, so suppress the trailing 'LHS = VAR' copy.  */
	      src = dst;
	      dst = lhs;
	      lhs = NULL;
	    }
	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
	}
    }

  /* Copy VAR to LHS, if there is an LHS.  */
  if (lhs)
    gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));

  /* Replace the IFN call with the (possibly empty) assignment
     sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
/* Main entry point for oacc transformations which run on the device
   compiler after LTO, so we know what the target device is at this
   point (including the host fallback).  */

static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Parse the default dim argument exactly once.  The parsed state is
     recorded by pointing flag_openacc_dims at itself.  */
  if ((const void *)flag_openacc_dims != &flag_openacc_dims)
    {
      oacc_parse_default_dims (flag_openacc_dims);
      flag_openacc_dims = (char *)&flag_openacc_dims;
    }

  bool is_oacc_kernels
    = (lookup_attribute ("oacc kernels",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_kernels_parallelized
    = (lookup_attribute ("oacc kernels parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);

  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels, so remove the parallelism dimensions function attributes
     potentially set earlier on.  */
  if (is_oacc_kernels && !is_oacc_kernels_parallelized)
    {
      oacc_set_fn_attrib (current_function_decl, NULL, NULL);
      attrs = oacc_get_fn_attrib (current_function_decl);
    }

  /* Discover, partition and process the loops.  */
  oacc_loop *loops = oacc_loop_discovery ();
  int fn_level = oacc_fn_attrib_level (attrs);

  if (dump_file)
    {
      if (fn_level >= 0)
	fprintf (dump_file, "Function is OpenACC routine level %d\n",
		 fn_level);
      else if (is_oacc_kernels)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 (is_oacc_kernels_parallelized
		  ? "parallelized" : "unparallelized"));
      else
	fprintf (dump_file, "Function is OpenACC parallel offload\n");
    }

  /* A routine at level N may only use partitioning inside level N.  */
  unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
  /* OpenACC kernels constructs are special: they currently don't use the
     generic oacc_loop infrastructure and attribute/dimension processing.  */
  if (is_oacc_kernels && is_oacc_kernels_parallelized)
    {
      /* Parallelized OpenACC kernels constructs use gang parallelism.  See
	 also tree-parloops.c:create_parallel_loop.  */
      used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
    }

  int dims[GOMP_DIM_MAX];
  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);

  if (dump_file)
    {
      const char *comma = "Compute dimensions [";
      for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
	fprintf (dump_file, "%s%d", comma, dims[ix]);
      fprintf (dump_file, "]\n");
    }

  oacc_loop_process (loops);
  if (dump_file)
    {
      fprintf (dump_file, "OpenACC loops\n");
      dump_oacc_loop (dump_file, loops, 0);
      fprintf (dump_file, "\n");
    }

  /* Offloaded targets may introduce new basic blocks, which require
     dominance information to update SSA.  */
  calculate_dominance_info (CDI_DOMINATORS);

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan.  The transforms below may replace
	   CALL with a sequence that itself needs lowering, so we step
	   back one statement before transforming and step forward (or
	   not) afterwards.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_TILE:
	    oacc_xform_tile (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  break;

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* Arg 2 of -1 marks an unused axis; otherwise let
		     the target decide whether the call survives to
		     RTL expansion.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  remove = true;
		  break;
		}
	      break;
	    }
	  }

	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    /* Keep the SSA virtual operand chain intact before
	       deleting the call.  */
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass, false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  free_oacc_loop (loops);

  return 0;
}
1628 /* Default launch dimension validator. Force everything to 1. A
1629 backend that wants to provide larger dimensions must override this
1630 hook. */
1632 bool
1633 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1634 int ARG_UNUSED (fn_level))
1636 bool changed = false;
1638 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1640 if (dims[ix] != 1)
1642 dims[ix] = 1;
1643 changed = true;
1647 return changed;
/* Default dimension bound is unknown on accelerator and 1 on host.  */

int
default_goacc_dim_limit (int ARG_UNUSED (axis))
{
#ifdef ACCEL_COMPILER
  /* 0 means "no known bound" -- the device decides.  */
  return 0;
#else
  /* Host fallback executes each axis single-threaded.  */
  return 1;
#endif
}
namespace {

/* Pass descriptor for the OpenACC device-lowering pass; run late, on
   the device compiler (or host fallback) after LTO read-back.  */
const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Only run when OpenACC is enabled.  */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_device_lower ();
    }

}; // class pass_oacc_device_lower

} // anon namespace
/* Factory for the OpenACC device-lowering pass, referenced from
   passes.def.  */

gimple_opt_pass *
make_pass_oacc_device_lower (gcc::context *ctxt)
{
  return new pass_oacc_device_lower (ctxt);
}
/* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
   GOMP_SIMT_ENTER call identifying the privatized variables, which are
   turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
   Set *REGIMPLIFY to true, except if no privatized variables were seen.  */

static void
ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
{
  gimple *alloc_stmt = gsi_stmt (*gsi);
  tree simtrec = gimple_call_lhs (alloc_stmt);
  tree simduid = gimple_call_arg (alloc_stmt, 0);
  gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
  gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
  /* Build an artificial record type to hold all privatized variables;
     SIMTREC becomes a pointer to it.  */
  tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
  TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
  TREE_ADDRESSABLE (rectype) = 1;
  TREE_TYPE (simtrec) = build_pointer_type (rectype);
  /* Arg 0 is the simduid; args 1..N are '&var' for each privatized
     variable (or null_pointer_node placeholders).  */
  for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
    {
      tree *argp = gimple_call_arg_ptr (enter_stmt, i);
      if (*argp == null_pointer_node)
	continue;
      gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
		  && VAR_P (TREE_OPERAND (*argp, 0)));
      tree var = TREE_OPERAND (*argp, 0);

      /* Mirror the variable as a field of the record, preserving
	 alignment and volatility.  */
      tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
			       DECL_NAME (var), TREE_TYPE (var));
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);

      insert_field_into_struct (rectype, field);

      /* Redirect all uses of VAR through 'simtrec->field' via a
	 DECL_VALUE_EXPR; the function must then be regimplified.  */
      tree t = build_simple_mem_ref (simtrec);
      t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
      TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
      SET_DECL_VALUE_EXPR (var, t);
      DECL_HAS_VALUE_EXPR_P (var) = 1;
      *regimplify = true;
    }
  layout_type (rectype);
  tree size = TYPE_SIZE_UNIT (rectype);
  tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));

  /* Replace the alloc call with one carrying the now-known record
     size and alignment.  */
  alloc_stmt
    = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
  gimple_call_set_lhs (alloc_stmt, simtrec);
  gsi_replace (gsi, alloc_stmt, false);
  /* The ENTER call is no longer needed; reduce it to a plain copy of
     its simduid argument.  */
  gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
  enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
  gsi_replace (&enter_gsi, enter_stmt, false);

  /* Emit a clobber of the record just before the matching SIMT_EXIT,
     ending the lifetime of the per-lane storage.  */
  use_operand_p use;
  gimple *exit_stmt;
  if (single_imm_use (simtrec, &use, &exit_stmt))
    {
      gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
      gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
      tree clobber = build_constructor (rectype, NULL);
      TREE_THIS_VOLATILE (clobber) = 1;
      exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
      gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
    }
  else
    gcc_checking_assert (has_zero_uses (simtrec));
}
1771 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
1773 static tree
1774 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
1776 tree t = *tp;
1778 if (VAR_P (t)
1779 && DECL_HAS_VALUE_EXPR_P (t)
1780 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
1782 *walk_subtrees = 0;
1783 return t;
1785 return NULL_TREE;
/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
   LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
   internal functions on non-SIMT targets, and likewise some SIMD internal
   functions on SIMT targets.  */

static unsigned int
execute_omp_device_lower ()
{
  /* VF == 1 means this target has no SIMT execution.  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
	  continue;
	/* RHS == NULL_TREE after the switch means "leave the call in
	   place for later RTL expansion".  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    if (vf != 1)
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    /* On SIMT targets keep the enter/exit calls; on others
	       drop the virtual def before folding them away.  */
	    if (vf != 1)
	      continue;
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    /* SIMD placeholders are folded on SIMT targets instead.  */
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	/* Fold the call to a plain assignment (or nop when it had no
	   result).  */
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  /* Re-gimplify statements touching SIMT-privatized variables, whose
     DECL_VALUE_EXPRs were installed by ompdevlow_adjust_simt_enter.  */
  if (regimplify)
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
namespace {

/* Pass descriptor for the OpenMP device-lowering (SIMT cleanup)
   pass.  */
const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only on functions not yet lowered for the device.  */
  virtual bool gate (function *fun)
    {
      return !(fun->curr_properties & PROP_gimple_lomp_dev);
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }

}; // class pass_omp_device_lower

} // anon namespace
/* Factory for the OpenMP device-lowering pass, referenced from
   passes.def.  */

gimple_opt_pass *
make_pass_omp_device_lower (gcc::context *ctxt)
{
  return new pass_omp_device_lower (ctxt);
}
/* "omp declare target link" handling pass.  */

namespace {

const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS, /* type */
  "omptargetlink", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  /* Only meaningful in the offload (accelerator) compiler, and there
     only for offloaded functions.  */
  virtual bool gate (function *fun)
    {
#ifdef ACCEL_COMPILER
      tree attrs = DECL_ATTRIBUTES (fun->decl);
      return lookup_attribute ("omp declare target", attrs)
	     || lookup_attribute ("omp target entrypoint", attrs);
#else
      (void) fun;
      return false;
#endif
    }

  virtual unsigned execute (function *);
};

/* Callback for walk_gimple_stmt used to scan for link var operands.  */

static tree
find_link_var_op (tree *tp, int *walk_subtrees, void *)
{
  tree t = *tp;

  /* Match global variables carrying the "omp declare target link"
     attribute whose accesses go through a DECL_VALUE_EXPR.  */
  if (VAR_P (t)
      && DECL_HAS_VALUE_EXPR_P (t)
      && is_global_var (t)
      && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
    {
      *walk_subtrees = 0;
      return t;
    }

  return NULL_TREE;
}

/* Re-gimplify every statement that refers to a link variable, so its
   DECL_VALUE_EXPR replacement takes effect.  */

unsigned
pass_omp_target_link::execute (function *fun)
{
  basic_block bb;
  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
	  gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
    }

  return 0;
}

} // anon namespace
/* Factory for the "omp declare target link" pass, referenced from
   passes.def.  */

gimple_opt_pass *
make_pass_omp_target_link (gcc::context *ctxt)
{
  return new pass_omp_target_link (ctxt);
}