[ARM] PR target/71436: Restrict *load_multiple pattern till after LRA
[official-gcc.git] / gcc / omp-offload.c
blobd73955c554f72ab58cf167bc384c550bb8775660
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
4 Copyright (C) 2005-2017 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "gimplify.h"
37 #include "gimple-iterator.h"
38 #include "gimplify-me.h"
39 #include "gimple-walk.h"
40 #include "tree-cfg.h"
41 #include "tree-into-ssa.h"
42 #include "common/common-target.h"
43 #include "omp-general.h"
44 #include "omp-offload.h"
45 #include "lto-section-names.h"
46 #include "gomp-constants.h"
47 #include "gimple-pretty-print.h"
48 #include "intl.h"
/* Describe the OpenACC looping structure of a function.  The entire
   function is held in a 'NULL' loop.  */

struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  /* Per-dimension head/tail marker calls, indexed by marker ordinal
     (see oacc_loop_discover_walk).  */
  gcall *heads[GOMP_DIM_MAX];  /* Head marker functions.  */
  gcall *tails[GOMP_DIM_MAX];  /* Tail marker functions.  */

  tree routine;  /* Pseudo-loop enclosing a routine.  */

  unsigned mask;   /* Partitioning mask (GOMP_DIM_MASK bits).  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner;  /* Partitioning of inner loops.  */
  unsigned flags;  /* Partitioning flags (OLF_*).  */
  vec<gcall *> ifns;  /* Contained loop abstraction functions.  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
/* Holds offload tables with decls.  Filled in by the registration of
   offloaded functions/variables; consumed by omp_finish_file below.  */
vec<tree, va_gc> *offload_funcs, *offload_vars;
82 /* Return level at which oacc routine may spawn a partitioned loop, or
83 -1 if it is not a routine (i.e. is an offload fn). */
85 static int
86 oacc_fn_attrib_level (tree attr)
88 tree pos = TREE_VALUE (attr);
90 if (!TREE_PURPOSE (pos))
91 return -1;
93 int ix = 0;
94 for (ix = 0; ix != GOMP_DIM_MAX;
95 ix++, pos = TREE_CHAIN (pos))
96 if (!integer_zerop (TREE_PURPOSE (pos)))
97 break;
99 return ix;
/* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
   adds their addresses and sizes to constructor-vector V_CTOR.  Each
   function contributes one (address) element; each variable contributes
   an (address, size) pair.  */

static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      /* On the accelerator compiler a link variable must additionally
	 have been given a VALUE_EXPR (pointing at its table slot).  */
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));

      /* Only variables get a size entry.  */
      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
155 /* Create new symbols containing (address, size) pairs for global variables,
156 marked with "omp declare target" attribute, as well as addresses for the
157 functions, which are outlined offloading regions. */
158 void
159 omp_finish_file (void)
161 unsigned num_funcs = vec_safe_length (offload_funcs);
162 unsigned num_vars = vec_safe_length (offload_vars);
164 if (num_funcs == 0 && num_vars == 0)
165 return;
167 if (targetm_common.have_named_sections)
169 vec<constructor_elt, va_gc> *v_f, *v_v;
170 vec_alloc (v_f, num_funcs);
171 vec_alloc (v_v, num_vars * 2);
173 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
174 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
176 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
177 num_vars * 2);
178 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
179 num_funcs);
180 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
181 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
182 tree ctor_v = build_constructor (vars_decl_type, v_v);
183 tree ctor_f = build_constructor (funcs_decl_type, v_f);
184 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
185 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
186 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
187 get_identifier (".offload_func_table"),
188 funcs_decl_type);
189 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
190 get_identifier (".offload_var_table"),
191 vars_decl_type);
192 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
193 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
194 otherwise a joint table in a binary will contain padding between
195 tables from multiple object files. */
196 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
197 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
198 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
199 DECL_INITIAL (funcs_decl) = ctor_f;
200 DECL_INITIAL (vars_decl) = ctor_v;
201 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
202 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
204 varpool_node::finalize_decl (vars_decl);
205 varpool_node::finalize_decl (funcs_decl);
207 else
209 for (unsigned i = 0; i < num_funcs; i++)
211 tree it = (*offload_funcs)[i];
212 targetm.record_offload_symbol (it);
214 for (unsigned i = 0; i < num_vars; i++)
216 tree it = (*offload_vars)[i];
217 targetm.record_offload_symbol (it);
222 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
223 axis DIM. Return a tmp var holding the result. */
225 static tree
226 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
228 tree arg = build_int_cst (unsigned_type_node, dim);
229 tree size = create_tmp_var (integer_type_node);
230 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
231 gimple *call = gimple_build_call_internal (fn, 1, arg);
233 gimple_call_set_lhs (call, size);
234 gimple_seq_add_stmt (seq, call);
236 return size;
239 /* Find the number of threads (POS = false), or thread number (POS =
240 true) for an OpenACC region partitioned as MASK. Setup code
241 required for the calculation is added to SEQ. */
243 static tree
244 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
246 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
247 unsigned ix;
249 /* Start at gang level, and examine relevant dimension indices. */
250 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
251 if (GOMP_DIM_MASK (ix) & mask)
253 if (res)
255 /* We had an outer index, so scale that by the size of
256 this dimension. */
257 tree n = oacc_dim_call (false, ix, seq);
258 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
260 if (pos)
262 /* Determine index in this dimension. */
263 tree id = oacc_dim_call (true, ix, seq);
264 if (res)
265 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
266 else
267 res = id;
271 if (res == NULL_TREE)
272 res = integer_zero_node;
274 return res;
/* Transform IFN_GOACC_LOOP calls to actual code.  See
   expand_oacc_for for where these are generated.  At the vector
   level, we stride loops, such that each member of a warp will
   operate on adjacent iterations.  At the worker and gang level,
   each gang/warp executes a set of contiguous iterations.  Chunking
   can override this such that each iteration engine executes a
   contiguous chunk, and then moves on to stride to the next chunk.

   CALL's arguments are: 0 = loop-kind code, 1 = direction, 2 = range,
   3 = step, 4 = chunk size, 5 = partitioning mask, 6 = (kind-specific
   chunk/offset operand).  The call is replaced in place by the
   gimplified computation of its LHS.  */

static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (call, 1);
  tree range = gimple_call_arg (call, 2);
  tree step = gimple_call_arg (call, 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

#ifdef ACCEL_COMPILER
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (false, mask, &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (false, volume, &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      if (striding)
	{
	  /* Striding: the starting offset is simply this thread's
	     position in the compute volume.  */
	  r = oacc_thread_numbers (true, mask, &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = (range + per - 1) / per, i.e. one chunk
		 covering the whole range.  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (true, outer_mask, &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Advance by this chunk's share of the volume.  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      /* Scale the iteration index into an actual offset.  */
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      if (striding)
	r = range;
      else
	{
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = (range + per - 1) / per, as above.  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));

	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  tree offset = gimple_call_arg (call, 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  /* Clamp to the loop's range; direction selects MIN vs MAX.  */
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
/* Transform a GOACC_TILE call.  Determines the element loop span for
   the specified loop of the nest.  This is 1 if we're not tiling.

   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */

static void
oacc_xform_tile (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
  /* Inner loops have higher loop_nos.  */
  unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
  tree tile_size = gimple_call_arg (call, 2);
  unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  gimple_seq seq = NULL;
  tree span = build_int_cst (type, 1);

  /* Element loops may only be vector and/or worker partitioned.  */
  gcc_assert (!(e_mask
		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
  push_gimplify_context (!seen_error ());

#ifndef ACCEL_COMPILER
  /* Partitioning disabled on host compilers.  */
  e_mask = 0;
#endif
  if (!e_mask)
    /* Not partitioning.  */
    span = integer_one_node;
  else if (!integer_zerop (tile_size))
    /* User explicitly specified size.  */
    span = tile_size;
  else
    {
      /* Pick a size based on the partitioning of the element loop and
	 the number of loop nests.  */
      tree first_size = NULL_TREE;
      tree second_size = NULL_TREE;

      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);

      if (!first_size)
	{
	  first_size = second_size;
	  second_size = NULL_TREE;
	}

      if (loop_no + 1 == collapse)
	{
	  /* Innermost loop: use the first (vector) size, folding in
	     the worker size when there is only one loop.  */
	  span = first_size;
	  if (!loop_no && second_size)
	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
				span, second_size);
	}
      else if (loop_no + 2 == collapse)
	span = second_size;
      else
	span = NULL_TREE;

      if (!span)
	/* There's no obvious element size for this loop.  Options
	   are 1, first_size or some non-unity constant (32 is my
	   favourite).  We should gather some statistics.  */
	span = first_size;
    }

  span = fold_convert (type, span);
  gimplify_assign (lhs, span, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
/* Default partitioned and minimum partitioned dimensions.  Both are
   filled in by oacc_parse_default_dims and read when validating a
   function's dimensions (oacc_validate_dims).  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
/* Parse the default dimension parameter.  This is a set of
   :-separated optional compute dimensions.  Each specified dimension
   is a positive integer.  When device type support is added, it is
   planned to be a comma separated list of such compute dimensions,
   with all but the first prefixed by the colon-terminated device
   type.  */

static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  /* Reset: default dimension unknown (-1), minimum dimension 1.  */
  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }
#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  /* Dimensions after the first must be preceded by ':'.  */
	  if (ix)
	    {
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An empty field (immediately followed by ':') leaves this
	     dimension at its default.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
	      /* Reject parse errors, non-positive values and values
		 that do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "-fopenacc-dim operand is malformed at '%s'", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
}
/* Validate and update the dimensions for offloaded FN.  ATTRS is the
   raw attribute.  DIMS is an array of dimensions, which is filled in.
   LEVEL is the partitioning level of a routine, or -1 for an offload
   region itself.  USED is the mask of partitioned execution in the
   function.  */

static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
  tree purpose[GOMP_DIM_MAX];
  unsigned ix;
  tree pos = TREE_VALUE (attrs);
  bool is_kernel = oacc_fn_attrib_kernels_p (attrs);

  /* Make sure the attribute creator attached the dimension
     information.  */
  gcc_assert (pos);

  /* Extract the per-dimension (purpose, value) pairs; a missing value
     means the dimension is still to be determined.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    {
      purpose[ix] = TREE_PURPOSE (pos);
      tree val = TREE_VALUE (pos);
      dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
      pos = TREE_CHAIN (pos);
    }

  bool changed = targetm.goacc.validate_dims (fn, dims, level);

  /* Default anything left to 1 or a partitioned default.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    if (dims[ix] < 0)
      {
	/* The OpenACC spec says 'If the [num_gangs] clause is not
	   specified, an implementation-defined default will be used;
	   the default may depend on the code within the construct.'
	   (2.5.6).  Thus an implementation is free to choose
	   non-unity default for a parallel region that doesn't have
	   any gang-partitioned loops.  However, it appears that there
	   is a sufficient body of user code that expects non-gang
	   partitioned regions to not execute in gang-redundant mode.
	   So we (a) don't warn about the non-portability and (b) pick
	   the minimum permissible dimension size when there is no
	   partitioned execution.  Otherwise we pick the global
	   default for the dimension, which the user can control.  The
	   same wording and logic applies to num_workers and
	   vector_length, however the worker- or vector- single
	   execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partitioning is not 1,
	   the target is probably too confusing.)  */
	dims[ix] = (used & GOMP_DIM_MASK (ix)
		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
	changed = true;
      }

  if (changed)
    {
      /* Replace the attribute with new values.  */
      pos = NULL_TREE;
      for (ix = GOMP_DIM_MAX; ix--;)
	{
	  pos = tree_cons (purpose[ix],
			   build_int_cst (integer_type_node, dims[ix]),
			   pos);
	  if (is_kernel)
	    TREE_PUBLIC (pos) = 1;
	}
      oacc_replace_fn_attrib (fn, pos);
    }
}
677 /* Create an empty OpenACC loop structure at LOC. */
679 static oacc_loop *
680 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
682 oacc_loop *loop = XCNEW (oacc_loop);
684 loop->parent = parent;
686 if (parent)
688 loop->sibling = parent->child;
689 parent->child = loop;
692 loop->loc = loc;
693 return loop;
696 /* Create an outermost, dummy OpenACC loop for offloaded function
697 DECL. */
699 static oacc_loop *
700 new_oacc_loop_outer (tree decl)
702 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
705 /* Start a new OpenACC loop structure beginning at head marker HEAD.
706 Link into PARENT loop. Return the new loop. */
708 static oacc_loop *
709 new_oacc_loop (oacc_loop *parent, gcall *marker)
711 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
713 loop->marker = marker;
715 /* TODO: This is where device_type flattening would occur for the loop
716 flags. */
718 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
720 tree chunk_size = integer_zero_node;
721 if (loop->flags & OLF_GANG_STATIC)
722 chunk_size = gimple_call_arg (marker, 4);
723 loop->chunk_size = chunk_size;
725 return loop;
728 /* Create a dummy loop encompassing a call to a openACC routine.
729 Extract the routine's partitioning requirements. */
731 static void
732 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
734 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
735 int level = oacc_fn_attrib_level (attrs);
737 gcc_assert (level >= 0);
739 loop->marker = call;
740 loop->routine = decl;
741 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
742 ^ (GOMP_DIM_MASK (level) - 1));
745 /* Finish off the current OpenACC loop ending at tail marker TAIL.
746 Return the parent loop. */
748 static oacc_loop *
749 finish_oacc_loop (oacc_loop *loop)
751 /* If the loop has been collapsed, don't partition it. */
752 if (loop->ifns.is_empty ())
753 loop->mask = loop->flags = 0;
754 return loop->parent;
757 /* Free all OpenACC loop structures within LOOP (inclusive). */
759 static void
760 free_oacc_loop (oacc_loop *loop)
762 if (loop->sibling)
763 free_oacc_loop (loop->sibling);
764 if (loop->child)
765 free_oacc_loop (loop->child);
767 loop->ifns.release ();
768 free (loop);
/* Dump out the OpenACC loop head or tail beginning at FROM.  Prints
   TITLE-LEVEL, then every statement up to (but not including) the
   next marker of the same kind, following fallthrough successor
   blocks as needed.  */

static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  /* Another marker of the same kind terminates the sequence.  */
	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2, 0);

      gsi_next (&gsi);
      /* The sequence may span basic blocks; follow the single
	 successor when we fall off the end of a block.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
802 /* Dump OpenACC loops LOOP, its siblings and its children. */
804 static void
805 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
807 int ix;
809 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
810 loop->flags, loop->mask,
811 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
813 if (loop->marker)
814 print_gimple_stmt (file, loop->marker, depth * 2, 0);
816 if (loop->routine)
817 fprintf (file, "%*sRoutine %s:%u:%s\n",
818 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
819 DECL_SOURCE_LINE (loop->routine),
820 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
822 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
823 if (loop->heads[ix])
824 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
825 for (ix = GOMP_DIM_MAX; ix--;)
826 if (loop->tails[ix])
827 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
829 if (loop->child)
830 dump_oacc_loop (file, loop->child, depth + 1);
831 if (loop->sibling)
832 dump_oacc_loop (file, loop->sibling, depth);
835 void debug_oacc_loop (oacc_loop *);
837 /* Dump loops to stderr. */
839 DEBUG_FUNCTION void
840 debug_oacc_loop (oacc_loop *loop)
842 dump_oacc_loop (stderr, loop, 0);
/* DFS walk of basic blocks BB onwards, creating OpenACC loop
   structures as we go.  By construction these loops are properly
   nested.  MARKER counts head/tail marker calls seen in the current
   sequence; REMAINING counts those still expected.  */

static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  int marker = 0;
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      /* A two-argument marker terminates a head or tail
		 sequence; longer markers carry a remaining-count in
		 argument 2.  */
	      if (gimple_call_num_args (call) == 2)
		{
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  /* First marker of a sequence: open a new loop for a
		     head and latch the expected count.  */
		  if (!marker)
		    {
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads are recorded outermost-first, tails
			 innermost-first.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  if (remaining || marker)
    {
      /* A marker sequence spans into the (single) successor block;
	 keep scanning there.  */
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
950 /* LOOP is the first sibling. Reverse the order in place and return
951 the new first sibling. Recurse to child loops. */
953 static oacc_loop *
954 oacc_loop_sibling_nreverse (oacc_loop *loop)
956 oacc_loop *last = NULL;
959 if (loop->child)
960 loop->child = oacc_loop_sibling_nreverse (loop->child);
962 oacc_loop *next = loop->sibling;
963 loop->sibling = last;
964 last = loop;
965 loop = next;
967 while (loop);
969 return last;
972 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
973 the current function. */
975 static oacc_loop *
976 oacc_loop_discovery ()
978 /* Clear basic block flags, in particular BB_VISITED which we're going to use
979 in the following. */
980 clear_bb_flags ();
982 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
983 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
985 /* The siblings were constructed in reverse order, reverse them so
986 that diagnostics come out in an unsurprising order. */
987 top = oacc_loop_sibling_nreverse (top);
989 return top;
/* Transform the abstract internal function markers starting at FROM
   to be for partitioning level LEVEL.  Stop when we meet another HEAD
   or TAIL marker.  */

static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* Fork/join markers take the level in argument 2; another
	     marker of FROM's kind ends the sequence.  */
	  if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  else if (k == kind && stmt != from)
	    break;
	}
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	/* Reductions take the level in argument 3.  */
	*gimple_call_arg_ptr (stmt, 3) = replacement;

      gsi_next (&gsi);
      /* The sequence may span basic blocks; follow the single
	 successor when we fall off the end of a block.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
/* Process the discovered OpenACC loops, setting the correct
   partitioning level etc.  Recurses over children and siblings.  */

static void
oacc_loop_process (oacc_loop *loop)
{
  if (loop->child)
    oacc_loop_process (loop->child);

  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      /* Fill in the now-known partitioning of the recorded
	 abstraction functions.  */
      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_GOACC_LOOP:
	    {
	      /* An element loop was flagged with -1 in argument 5.  */
	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
	      gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
	      if (!is_e)
		gimple_call_set_arg (call, 4, chunk_arg);
	    }
	    break;

	  case IFN_GOACC_TILE:
	    gimple_call_set_arg (call, 3, mask_arg);
	    gimple_call_set_arg (call, 4, e_mask_arg);
	    break;

	  default:
	    gcc_unreachable ();
	  }

      /* Assign each recorded head/tail sequence its dimension,
	 walking the combined mask from the outermost set bit in.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling);
}
1083 /* Walk the OpenACC loop hierarchy checking and assigning the
1084 programmer-specified partitionings. OUTER_MASK is the partitioning
1085 this loop is contained within. Return mask of partitioning
1086 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1087 bit. */
static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* Extract the explicit gang/worker/vector axes requested via
	 the loop's flags field.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
	 loop, or (no more than) single axis tiled loop.  */
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* Explicit axes, 'auto' and 'seq' are mutually exclusive; count
	 how many were requested and complain if more than one.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
		      : G_("%<auto%> conflicts with other OpenACC loop "
			   "specifiers"));
	  maybe_auto = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      /* 'seq' wins: strip all explicit axes from the loop.  */
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}

      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
	{
	  /* Mark for later auto-partitioning; the GOMP_DIM_MAX bit in
	     the returned mask tells the caller auto loops exist.  */
	  loop->flags |= OLF_AUTO;
	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
	}
    }

  if (this_mask & outer_mask)
    {
      /* This loop (or routine call) reuses parallelism already claimed
	 by an enclosing loop.  Find the offending ancestor for the
	 diagnostic, then drop the conflicting axes.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if ((outer->mask | outer->e_mask) & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			loop->routine
			? G_("routine call uses same OpenACC parallelism"
			     " as containing loop")
			: G_("inner loop uses same OpenACC parallelism"
			     " as containing loop"));
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    /* No enclosing loop claims the axis, so the conflict must
	       come from the containing routine's level restriction.  */
	    error_at (loop->loc,
		      loop->routine
		      ? G_("routine call uses OpenACC parallelism disallowed"
			   " by containing routine")
		      : G_("loop uses OpenACC parallelism disallowed"
			   " by containing routine"));

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      this_mask &= ~outer_mask;
    }
  else
    {
      /* Axes must nest outermost-to-innermost; the least significant
	 requested axis must be strictly inside everything already
	 used outside.  */
      unsigned outermost = least_bit_hwi (this_mask);

      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
	 that we put worker there.  The std doesn't contemplate
	 specifying all three.  We choose to put worker and vector on
	 the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  if (loop->child)
    {
      /* Children may not reuse anything we or our ancestors use.  */
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    /* Siblings nest at the same level as we do.  */
    mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);

  return mask_all;
}
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */
static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  /* Only 'auto'+'independent' loops are ours to assign.  */
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  /* Phase 1 (top-down): outermost unassigned auto loops grab the
     outermost free axis before we recurse into children.  */
  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition.  */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything.  */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop.  */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
					       outer_assign | assign);
    }

  /* Phase 2 (bottom-up, after children are done): fill in the
     innermost available axis.  */
  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one used by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      /* The inner axis goes to the element loop; the one
		 outside it (if any) to the tile loop.  */
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
					     outer_mask, outer_assign);

  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
/* Walk the OpenACC loop hierarchy to check and assign partitioning
   axes.  Return mask of partitioning.  */
1357 static unsigned
1358 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1360 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1362 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1364 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1365 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1367 return mask_all;
1370 /* Default fork/join early expander. Delete the function calls if
1371 there is no RTL expander. */
1373 bool
1374 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1375 const int *ARG_UNUSED (dims), bool is_fork)
1377 if (is_fork)
1378 return targetm.have_oacc_fork ();
1379 else
1380 return targetm.have_oacc_join ();
1383 /* Default goacc.reduction early expander.
1385 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1386 If RES_PTR is not integer-zerop:
1387 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1388 TEARDOWN - emit '*RES_PTR = VAR'
1389 If LHS is not NULL
1390 emit 'LHS = VAR' */
1392 void
1393 default_goacc_reduction (gcall *call)
1395 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1396 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1397 tree lhs = gimple_call_lhs (call);
1398 tree var = gimple_call_arg (call, 2);
1399 gimple_seq seq = NULL;
1401 if (code == IFN_GOACC_REDUCTION_SETUP
1402 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1404 /* Setup and Teardown need to copy from/to the receiver object,
1405 if there is one. */
1406 tree ref_to_res = gimple_call_arg (call, 1);
1408 if (!integer_zerop (ref_to_res))
1410 tree dst = build_simple_mem_ref (ref_to_res);
1411 tree src = var;
1413 if (code == IFN_GOACC_REDUCTION_SETUP)
1415 src = dst;
1416 dst = lhs;
1417 lhs = NULL;
1419 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1423 /* Copy VAR to LHS, if there is an LHS. */
1424 if (lhs)
1425 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1427 gsi_replace_with_seq (&gsi, seq, true);
/* Main entry point for oacc transformations which run on the device
   compiler after LTO, so we know what the target device is at this
   point (including the host fallback).  */

static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Parse the default dim argument exactly once.  The pointer is
     overwritten with a self-referencing sentinel afterwards so a
     later invocation can detect the work is already done.  */
  if ((const void *)flag_openacc_dims != &flag_openacc_dims)
    {
      oacc_parse_default_dims (flag_openacc_dims);
      flag_openacc_dims = (char *)&flag_openacc_dims;
    }

  /* Discover, partition and process the loops.  */
  oacc_loop *loops = oacc_loop_discovery ();
  int fn_level = oacc_fn_attrib_level (attrs);

  if (dump_file)
    fprintf (dump_file, oacc_fn_attrib_kernels_p (attrs)
	     ? "Function is kernels offload\n"
	     : fn_level < 0 ? "Function is parallel offload\n"
	     : "Function is routine level %d\n", fn_level);

  /* A routine at level N may not use partitioning at or outside N.  */
  unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
  int dims[GOMP_DIM_MAX];

  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);

  if (dump_file)
    {
      const char *comma = "Compute dimensions [";
      for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
	fprintf (dump_file, "%s%d", comma, dims[ix]);
      fprintf (dump_file, "]\n");
    }

  oacc_loop_process (loops);
  if (dump_file)
    {
      fprintf (dump_file, "OpenACC loops\n");
      dump_oacc_loop (dump_file, loops, 0);
      fprintf (dump_file, "\n");
    }

  /* Offloaded targets may introduce new basic blocks, which require
     dominance information to update SSA.  */
  calculate_dominance_info (CDI_DOMINATORS);

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan: the transforms below may replace
	   CALL with a sequence that itself needs lowering.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_TILE:
	    oacc_xform_tile (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  break;

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* A -1 axis marks an unused fork/join; otherwise let
		     the target decide whether to keep the call.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  remove = true;
		  break;
		}
	    }
	    break;
	  }

	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass, false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  free_oacc_loop (loops);

  return 0;
}
1598 /* Default launch dimension validator. Force everything to 1. A
1599 backend that wants to provide larger dimensions must override this
1600 hook. */
1602 bool
1603 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1604 int ARG_UNUSED (fn_level))
1606 bool changed = false;
1608 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1610 if (dims[ix] != 1)
1612 dims[ix] = 1;
1613 changed = true;
1617 return changed;
1620 /* Default dimension bound is unknown on accelerator and 1 on host. */
1623 default_goacc_dim_limit (int ARG_UNUSED (axis))
1625 #ifdef ACCEL_COMPILER
1626 return 0;
1627 #else
1628 return 1;
1629 #endif
namespace {

/* Pass-manager metadata for the OpenACC device-lowering pass.  */
const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Only run for OpenACC compilations.  */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_device_lower ();
    }

}; // class pass_oacc_device_lower

} // anon namespace

/* Factory used by the pass manager to instantiate the pass.  */
gimple_opt_pass *
make_pass_oacc_device_lower (gcc::context *ctxt)
{
  return new pass_oacc_device_lower (ctxt);
}
/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
   LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
   internal functions on non-SIMT targets, and likewise some SIMD internal
   functions on SIMT targets.  */

static unsigned int
execute_omp_device_lower ()
{
  /* VF == 1 means the target has no SIMT execution.  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  basic_block bb;
  gimple_stmt_iterator gsi;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
	  continue;
	/* RHS is the replacement value; NULL_TREE means "keep the call
	   for later RTL expansion".  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    /* Drop the virtual def when the call becomes a constant
	       (or has no result to keep).  */
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    /* With one lane these degenerate to the identity on the
	       first argument.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    /* SIMD placeholders are folded on SIMT targets instead.  */
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  if (vf != 1)
    /* SIMT regions handle their own vectorization.  */
    cfun->has_force_vectorize_loops = false;
  return 0;
}
namespace {

/* Pass-manager metadata for the OpenMP device-lowering pass.  */
const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only if the function has not been device-lowered yet.  */
  virtual bool gate (function *fun)
    {
      return !(fun->curr_properties & PROP_gimple_lomp_dev);
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }
}; // class pass_omp_device_lower

} // anon namespace

/* Factory used by the pass manager to instantiate the pass.  */
gimple_opt_pass *
make_pass_omp_device_lower (gcc::context *ctxt)
{
  return new pass_omp_device_lower (ctxt);
}
/* "omp declare target link" handling pass.  */

namespace {

/* Pass-manager metadata for the target-link pass.  */
const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS, /* type */
  "omptargetlink", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  /* Only meaningful in the offload (accelerator) compiler, and only
     for functions marked for device execution.  */
  virtual bool gate (function *fun)
    {
#ifdef ACCEL_COMPILER
      tree attrs = DECL_ATTRIBUTES (fun->decl);
      return lookup_attribute ("omp declare target", attrs)
	     || lookup_attribute ("omp target entrypoint", attrs);
#else
      (void) fun;
      return false;
#endif
    }

  virtual unsigned execute (function *);
};
1816 /* Callback for walk_gimple_stmt used to scan for link var operands. */
1818 static tree
1819 find_link_var_op (tree *tp, int *walk_subtrees, void *)
1821 tree t = *tp;
1823 if (VAR_P (t)
1824 && DECL_HAS_VALUE_EXPR_P (t)
1825 && is_global_var (t)
1826 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
1828 *walk_subtrees = 0;
1829 return t;
1832 return NULL_TREE;
/* Re-gimplify every statement that refers to an "omp declare target
   link" variable, so its value-expr replacement takes effect.  */

unsigned
pass_omp_target_link::execute (function *fun)
{
  basic_block bb;
  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
	  gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
    }

  return 0;
}

} // anon namespace

/* Factory used by the pass manager to instantiate the pass.  */
gimple_opt_pass *
make_pass_omp_target_link (gcc::context *ctxt)
{
  return new pass_omp_target_link (ctxt);
}