1 /* Bits of OpenMP and OpenACC handling that are specific to device offloading
2 and a lowering pass for OpenACC device directives.
4 Copyright (C) 2005-2021 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
29 #include "tree-pass.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
52 #include "stringpool.h"
59 /* Describe the OpenACC looping structure of a function. The entire
60 function is held in a 'NULL' loop. */
64 oacc_loop
*parent
; /* Containing loop. */
66 oacc_loop
*child
; /* First inner loop. */
68 oacc_loop
*sibling
; /* Next loop within same parent. */
70 location_t loc
; /* Location of the loop start. */
72 gcall
*marker
; /* Initial head marker. */
74 gcall
*heads
[GOMP_DIM_MAX
]; /* Head marker functions. */
75 gcall
*tails
[GOMP_DIM_MAX
]; /* Tail marker functions. */
77 tree routine
; /* Pseudo-loop enclosing a routine. */
79 unsigned mask
; /* Partitioning mask. */
80 unsigned e_mask
; /* Partitioning of element loops (when tiling). */
81 unsigned inner
; /* Partitioning of inner loops. */
82 unsigned flags
; /* Partitioning flags. */
83 vec
<gcall
*> ifns
; /* Contained loop abstraction functions. */
84 tree chunk_size
; /* Chunk size. */
85 gcall
*head_end
; /* Final marker of head sequence. */
88 /* Holds offload tables with decls. */
89 vec
<tree
, va_gc
> *offload_funcs
, *offload_vars
;
91 /* Return level at which oacc routine may spawn a partitioned loop, or
92 -1 if it is not a routine (i.e. is an offload fn). */
95 oacc_fn_attrib_level (tree attr
)
97 tree pos
= TREE_VALUE (attr
);
99 if (!TREE_PURPOSE (pos
))
103 for (ix
= 0; ix
!= GOMP_DIM_MAX
;
104 ix
++, pos
= TREE_CHAIN (pos
))
105 if (!integer_zerop (TREE_PURPOSE (pos
)))
111 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
112 adds their addresses and sizes to constructor-vector V_CTOR. */
115 add_decls_addresses_to_decl_constructor (vec
<tree
, va_gc
> *v_decls
,
116 vec
<constructor_elt
, va_gc
> *v_ctor
)
118 unsigned len
= vec_safe_length (v_decls
);
119 for (unsigned i
= 0; i
< len
; i
++)
121 tree it
= (*v_decls
)[i
];
122 bool is_var
= VAR_P (it
);
125 #ifdef ACCEL_COMPILER
126 && DECL_HAS_VALUE_EXPR_P (it
)
128 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it
));
130 /* See also omp_finish_file and output_offload_tables in lto-cgraph.c. */
131 if (!in_lto_p
&& !symtab_node::get (it
))
134 tree size
= NULL_TREE
;
136 size
= fold_convert (const_ptr_type_node
, DECL_SIZE_UNIT (it
));
140 addr
= build_fold_addr_expr (it
);
143 #ifdef ACCEL_COMPILER
144 /* For "omp declare target link" vars add address of the pointer to
145 the target table, instead of address of the var. */
146 tree value_expr
= DECL_VALUE_EXPR (it
);
147 tree link_ptr_decl
= TREE_OPERAND (value_expr
, 0);
148 varpool_node::finalize_decl (link_ptr_decl
);
149 addr
= build_fold_addr_expr (link_ptr_decl
);
151 addr
= build_fold_addr_expr (it
);
154 /* Most significant bit of the size marks "omp declare target link"
155 vars in host and target tables. */
156 unsigned HOST_WIDE_INT isize
= tree_to_uhwi (size
);
157 isize
|= 1ULL << (int_size_in_bytes (const_ptr_type_node
)
158 * BITS_PER_UNIT
- 1);
159 size
= wide_int_to_tree (const_ptr_type_node
, isize
);
162 CONSTRUCTOR_APPEND_ELT (v_ctor
, NULL_TREE
, addr
);
164 CONSTRUCTOR_APPEND_ELT (v_ctor
, NULL_TREE
, size
);
168 /* Return true if DECL is a function for which its references should be analyzed. */
172 omp_declare_target_fn_p (tree decl
)
174 return (TREE_CODE (decl
) == FUNCTION_DECL
175 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl
))
176 && !lookup_attribute ("omp declare target host",
177 DECL_ATTRIBUTES (decl
))
179 || oacc_get_fn_attrib (decl
) == NULL_TREE
));
182 /* Return true if DECL is a variable for which its initializer references
183 should be analyzed. */
186 omp_declare_target_var_p (tree decl
)
189 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl
))
190 && !lookup_attribute ("omp declare target link",
191 DECL_ATTRIBUTES (decl
)));
194 /* Helper function for omp_discover_implicit_declare_target, called through
195 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
196 declare target to. */
199 omp_discover_declare_target_tgt_fn_r (tree
*tp
, int *walk_subtrees
, void *data
)
201 if (TREE_CODE (*tp
) == CALL_EXPR
202 && CALL_EXPR_FN (*tp
)
203 && TREE_CODE (CALL_EXPR_FN (*tp
)) == ADDR_EXPR
204 && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp
), 0)) == FUNCTION_DECL
205 && lookup_attribute ("omp declare variant base",
206 DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp
),
209 tree fn
= TREE_OPERAND (CALL_EXPR_FN (*tp
), 0);
210 for (tree attr
= DECL_ATTRIBUTES (fn
); attr
; attr
= TREE_CHAIN (attr
))
212 attr
= lookup_attribute ("omp declare variant base", attr
);
213 if (attr
== NULL_TREE
)
215 tree purpose
= TREE_PURPOSE (TREE_VALUE (attr
));
216 if (TREE_CODE (purpose
) == FUNCTION_DECL
)
217 omp_discover_declare_target_tgt_fn_r (&purpose
, walk_subtrees
, data
);
220 else if (TREE_CODE (*tp
) == FUNCTION_DECL
)
223 tree id
= get_identifier ("omp declare target");
224 symtab_node
*node
= symtab_node::get (*tp
);
227 while (node
->alias_target
228 && TREE_CODE (node
->alias_target
) == FUNCTION_DECL
)
230 if (!omp_declare_target_fn_p (node
->decl
)
231 && !lookup_attribute ("omp declare target host",
232 DECL_ATTRIBUTES (node
->decl
)))
234 node
->offloadable
= 1;
235 DECL_ATTRIBUTES (node
->decl
)
236 = tree_cons (id
, NULL_TREE
, DECL_ATTRIBUTES (node
->decl
));
238 node
= symtab_node::get (node
->alias_target
);
240 symtab_node
*new_node
= node
->ultimate_alias_target ();
241 decl
= new_node
->decl
;
242 while (node
!= new_node
)
244 if (!omp_declare_target_fn_p (node
->decl
)
245 && !lookup_attribute ("omp declare target host",
246 DECL_ATTRIBUTES (node
->decl
)))
248 node
->offloadable
= 1;
249 DECL_ATTRIBUTES (node
->decl
)
250 = tree_cons (id
, NULL_TREE
, DECL_ATTRIBUTES (node
->decl
));
252 gcc_assert (node
->alias
&& node
->analyzed
);
253 node
= node
->get_alias_target ();
255 node
->offloadable
= 1;
256 if (ENABLE_OFFLOADING
)
257 g
->have_offload
= true;
259 if (omp_declare_target_fn_p (decl
)
260 || lookup_attribute ("omp declare target host",
261 DECL_ATTRIBUTES (decl
)))
264 if (!DECL_EXTERNAL (decl
) && DECL_SAVED_TREE (decl
))
265 ((vec
<tree
> *) data
)->safe_push (decl
);
266 DECL_ATTRIBUTES (decl
) = tree_cons (id
, NULL_TREE
,
267 DECL_ATTRIBUTES (decl
));
269 else if (TYPE_P (*tp
))
271 /* else if (TREE_CODE (*tp) == OMP_TARGET)
273 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
274 if (OMP_DEVICE_ANCESTOR (dev))
280 /* Similarly, but ignore references outside of OMP_TARGET regions. */
283 omp_discover_declare_target_fn_r (tree
*tp
, int *walk_subtrees
, void *data
)
285 if (TREE_CODE (*tp
) == OMP_TARGET
)
287 /* And not OMP_DEVICE_ANCESTOR. */
288 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp
),
289 omp_discover_declare_target_tgt_fn_r
,
293 else if (TYPE_P (*tp
))
298 /* Helper function for omp_discover_implicit_declare_target, called through
299 walk_tree. Mark referenced FUNCTION_DECLs and global variables implicitly
300 as declare target to. */
303 omp_discover_declare_target_var_r (tree
*tp
, int *walk_subtrees
, void *data
)
305 if (TREE_CODE (*tp
) == FUNCTION_DECL
)
306 return omp_discover_declare_target_tgt_fn_r (tp
, walk_subtrees
, data
);
308 && is_global_var (*tp
)
309 && !omp_declare_target_var_p (*tp
))
311 tree id
= get_identifier ("omp declare target");
312 if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp
)))
314 error_at (DECL_SOURCE_LOCATION (*tp
),
315 "%qD specified both in declare target %<link%> and "
316 "implicitly in %<to%> clauses", *tp
);
317 DECL_ATTRIBUTES (*tp
)
318 = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp
));
320 if (TREE_STATIC (*tp
) && lang_hooks
.decls
.omp_get_decl_init (*tp
))
321 ((vec
<tree
> *) data
)->safe_push (*tp
);
322 DECL_ATTRIBUTES (*tp
) = tree_cons (id
, NULL_TREE
, DECL_ATTRIBUTES (*tp
));
323 symtab_node
*node
= symtab_node::get (*tp
);
324 if (node
!= NULL
&& !node
->offloadable
)
326 node
->offloadable
= 1;
327 if (ENABLE_OFFLOADING
)
329 g
->have_offload
= true;
330 if (is_a
<varpool_node
*> (node
))
331 vec_safe_push (offload_vars
, node
->decl
);
335 else if (TYPE_P (*tp
))
340 /* Perform the OpenMP implicit declare target to discovery. */
343 omp_discover_implicit_declare_target (void)
347 auto_vec
<tree
> worklist
;
349 FOR_EACH_DEFINED_FUNCTION (node
)
350 if (DECL_SAVED_TREE (node
->decl
))
352 struct cgraph_node
*cgn
;
353 if (omp_declare_target_fn_p (node
->decl
))
354 worklist
.safe_push (node
->decl
);
355 else if (DECL_STRUCT_FUNCTION (node
->decl
)
356 && DECL_STRUCT_FUNCTION (node
->decl
)->has_omp_target
)
357 worklist
.safe_push (node
->decl
);
358 for (cgn
= first_nested_function (node
);
359 cgn
; cgn
= next_nested_function (cgn
))
360 if (omp_declare_target_fn_p (cgn
->decl
))
361 worklist
.safe_push (cgn
->decl
);
362 else if (DECL_STRUCT_FUNCTION (cgn
->decl
)
363 && DECL_STRUCT_FUNCTION (cgn
->decl
)->has_omp_target
)
364 worklist
.safe_push (cgn
->decl
);
366 FOR_EACH_VARIABLE (vnode
)
367 if (lang_hooks
.decls
.omp_get_decl_init (vnode
->decl
)
368 && omp_declare_target_var_p (vnode
->decl
))
369 worklist
.safe_push (vnode
->decl
);
370 while (!worklist
.is_empty ())
372 tree decl
= worklist
.pop ();
374 walk_tree_without_duplicates (lang_hooks
.decls
.omp_get_decl_init (decl
),
375 omp_discover_declare_target_var_r
,
377 else if (omp_declare_target_fn_p (decl
))
378 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl
),
379 omp_discover_declare_target_tgt_fn_r
,
382 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl
),
383 omp_discover_declare_target_fn_r
,
387 lang_hooks
.decls
.omp_finish_decl_inits ();
391 /* Create new symbols containing (address, size) pairs for global variables,
392 marked with "omp declare target" attribute, as well as addresses for the
393 functions, which are outlined offloading regions. */
395 omp_finish_file (void)
397 unsigned num_funcs
= vec_safe_length (offload_funcs
);
398 unsigned num_vars
= vec_safe_length (offload_vars
);
400 if (num_funcs
== 0 && num_vars
== 0)
403 if (targetm_common
.have_named_sections
)
405 vec
<constructor_elt
, va_gc
> *v_f
, *v_v
;
406 vec_alloc (v_f
, num_funcs
);
407 vec_alloc (v_v
, num_vars
* 2);
409 add_decls_addresses_to_decl_constructor (offload_funcs
, v_f
);
410 add_decls_addresses_to_decl_constructor (offload_vars
, v_v
);
412 tree vars_decl_type
= build_array_type_nelts (pointer_sized_int_node
,
413 vec_safe_length (v_v
));
414 tree funcs_decl_type
= build_array_type_nelts (pointer_sized_int_node
,
416 SET_TYPE_ALIGN (vars_decl_type
, TYPE_ALIGN (pointer_sized_int_node
));
417 SET_TYPE_ALIGN (funcs_decl_type
, TYPE_ALIGN (pointer_sized_int_node
));
418 tree ctor_v
= build_constructor (vars_decl_type
, v_v
);
419 tree ctor_f
= build_constructor (funcs_decl_type
, v_f
);
420 TREE_CONSTANT (ctor_v
) = TREE_CONSTANT (ctor_f
) = 1;
421 TREE_STATIC (ctor_v
) = TREE_STATIC (ctor_f
) = 1;
422 tree funcs_decl
= build_decl (UNKNOWN_LOCATION
, VAR_DECL
,
423 get_identifier (".offload_func_table"),
425 tree vars_decl
= build_decl (UNKNOWN_LOCATION
, VAR_DECL
,
426 get_identifier (".offload_var_table"),
428 TREE_STATIC (funcs_decl
) = TREE_STATIC (vars_decl
) = 1;
429 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
430 otherwise a joint table in a binary will contain padding between
431 tables from multiple object files. */
432 DECL_USER_ALIGN (funcs_decl
) = DECL_USER_ALIGN (vars_decl
) = 1;
433 SET_DECL_ALIGN (funcs_decl
, TYPE_ALIGN (funcs_decl_type
));
434 SET_DECL_ALIGN (vars_decl
, TYPE_ALIGN (vars_decl_type
));
435 DECL_INITIAL (funcs_decl
) = ctor_f
;
436 DECL_INITIAL (vars_decl
) = ctor_v
;
437 set_decl_section_name (funcs_decl
, OFFLOAD_FUNC_TABLE_SECTION_NAME
);
438 set_decl_section_name (vars_decl
, OFFLOAD_VAR_TABLE_SECTION_NAME
);
440 varpool_node::finalize_decl (vars_decl
);
441 varpool_node::finalize_decl (funcs_decl
);
445 for (unsigned i
= 0; i
< num_funcs
; i
++)
447 tree it
= (*offload_funcs
)[i
];
448 /* See also add_decls_addresses_to_decl_constructor
449 and output_offload_tables in lto-cgraph.c. */
450 if (!in_lto_p
&& !symtab_node::get (it
))
452 targetm
.record_offload_symbol (it
);
454 for (unsigned i
= 0; i
< num_vars
; i
++)
456 tree it
= (*offload_vars
)[i
];
457 if (!in_lto_p
&& !symtab_node::get (it
))
459 #ifdef ACCEL_COMPILER
460 if (DECL_HAS_VALUE_EXPR_P (it
)
461 && lookup_attribute ("omp declare target link",
462 DECL_ATTRIBUTES (it
)))
464 tree value_expr
= DECL_VALUE_EXPR (it
);
465 tree link_ptr_decl
= TREE_OPERAND (value_expr
, 0);
466 targetm
.record_offload_symbol (link_ptr_decl
);
467 varpool_node::finalize_decl (link_ptr_decl
);
471 targetm
.record_offload_symbol (it
);
476 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
477 axis DIM. Return a tmp var holding the result. */
480 oacc_dim_call (bool pos
, int dim
, gimple_seq
*seq
)
/* The axis number is handed to the internal function as an unsigned
   integer constant. */
482 tree arg
= build_int_cst (unsigned_type_node
, dim
);
/* Temporary of integer type that will receive the call's result. */
483 tree size
= create_tmp_var (integer_type_node
);
/* POS selects the position query; otherwise the size query is used. */
484 enum internal_fn fn
= pos
? IFN_GOACC_DIM_POS
: IFN_GOACC_DIM_SIZE
;
485 gimple
*call
= gimple_build_call_internal (fn
, 1, arg
);
/* Route the result into SIZE and append the call to SEQ. */
487 gimple_call_set_lhs (call
, size
);
488 gimple_seq_add_stmt (seq
, call
);
493 /* Find the number of threads (POS = false), or thread number (POS =
494 true) for an OpenACC region partitioned as MASK. Setup code
495 required for the calculation is added to SEQ. */
498 oacc_thread_numbers (bool pos
, int mask
, gimple_seq
*seq
)
500 tree res
= pos
? NULL_TREE
: build_int_cst (unsigned_type_node
, 1);
503 /* Start at gang level, and examine relevant dimension indices. */
504 for (ix
= GOMP_DIM_GANG
; ix
!= GOMP_DIM_MAX
; ix
++)
505 if (GOMP_DIM_MASK (ix
) & mask
)
509 /* We had an outer index, so scale that by the size of this dimension. */
511 tree n
= oacc_dim_call (false, ix
, seq
);
512 res
= fold_build2 (MULT_EXPR
, integer_type_node
, res
, n
);
516 /* Determine index in this dimension. */
517 tree id
= oacc_dim_call (true, ix
, seq
);
519 res
= fold_build2 (PLUS_EXPR
, integer_type_node
, res
, id
);
525 if (res
== NULL_TREE
)
526 res
= integer_zero_node
;
531 /* Transform IFN_GOACC_LOOP calls to actual code. See
532 expand_oacc_for for where these are generated. At the vector
533 level, we stride loops, such that each member of a warp will
534 operate on adjacent iterations. At the worker and gang level,
535 each gang/warp executes a set of contiguous iterations. Chunking
536 can override this such that each iteration engine executes a
537 contiguous chunk, and then moves on to stride to the next chunk. */
540 oacc_xform_loop (gcall
*call
)
542 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
543 enum ifn_goacc_loop_kind code
544 = (enum ifn_goacc_loop_kind
) TREE_INT_CST_LOW (gimple_call_arg (call
, 0));
545 tree dir
= gimple_call_arg (call
, 1);
546 tree range
= gimple_call_arg (call
, 2);
547 tree step
= gimple_call_arg (call
, 3);
548 tree chunk_size
= NULL_TREE
;
549 unsigned mask
= (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call
, 5));
550 tree lhs
= gimple_call_lhs (call
);
551 tree type
= NULL_TREE
;
552 tree diff_type
= TREE_TYPE (range
);
554 gimple_seq seq
= NULL
;
555 bool chunking
= false, striding
= true;
556 unsigned outer_mask
= mask
& (~mask
+ 1); // Outermost partitioning
557 unsigned inner_mask
= mask
& ~outer_mask
; // Inner partitioning (if any)
559 /* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
562 gsi_replace_with_seq (&gsi
, seq
, true);
566 type
= TREE_TYPE (lhs
);
568 #ifdef ACCEL_COMPILER
569 chunk_size
= gimple_call_arg (call
, 4);
570 if (integer_minus_onep (chunk_size
) /* Force static allocation. */
571 || integer_zerop (chunk_size
)) /* Default (also static). */
573 /* If we're at the gang level, we want each to execute a
574 contiguous run of iterations. Otherwise we want each element to stride. */
576 striding
= !(outer_mask
& GOMP_DIM_MASK (GOMP_DIM_GANG
));
581 /* Chunk of size 1 is striding. */
582 striding
= integer_onep (chunk_size
);
583 chunking
= !striding
;
587 /* striding=true, chunking=true
589 striding=true, chunking=false
591 striding=false,chunking=true
592 -> chunks=ceil (range/(chunksize*threads*step))
593 striding=false,chunking=false
594 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
595 push_gimplify_context (true);
599 default: gcc_unreachable ();
601 case IFN_GOACC_LOOP_CHUNKS
:
603 r
= build_int_cst (type
, 1);
607 /* chunk_max = (range - dir) / (chunks * step * num_threads) + dir */
608 tree per
= oacc_thread_numbers (false, mask
, &seq
);
609 per
= fold_convert (type
, per
);
610 chunk_size
= fold_convert (type
, chunk_size
);
611 per
= fold_build2 (MULT_EXPR
, type
, per
, chunk_size
);
612 per
= fold_build2 (MULT_EXPR
, type
, per
, step
);
613 r
= build2 (MINUS_EXPR
, type
, range
, dir
);
614 r
= build2 (PLUS_EXPR
, type
, r
, per
);
615 r
= build2 (TRUNC_DIV_EXPR
, type
, r
, per
);
619 case IFN_GOACC_LOOP_STEP
:
621 /* If striding, step by the entire compute volume, otherwise
622 step by the inner volume. */
623 unsigned volume
= striding
? mask
: inner_mask
;
625 r
= oacc_thread_numbers (false, volume
, &seq
);
626 r
= build2 (MULT_EXPR
, type
, fold_convert (type
, r
), step
);
630 case IFN_GOACC_LOOP_OFFSET
:
631 /* Enable vectorization on non-SIMT targets. */
633 && outer_mask
== GOMP_DIM_MASK (GOMP_DIM_VECTOR
)
634 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize the loop. */
636 && (flag_tree_loop_vectorize
637 || !OPTION_SET_P (flag_tree_loop_vectorize
)))
639 basic_block bb
= gsi_bb (gsi
);
640 class loop
*parent
= bb
->loop_father
;
641 class loop
*body
= parent
->inner
;
643 parent
->force_vectorize
= true;
644 parent
->safelen
= INT_MAX
;
646 /* "Chunking loops" may have inner loops. */
649 body
->force_vectorize
= true;
650 body
->safelen
= INT_MAX
;
653 cfun
->has_force_vectorize_loops
= true;
657 r
= oacc_thread_numbers (true, mask
, &seq
);
658 r
= fold_convert (diff_type
, r
);
662 tree inner_size
= oacc_thread_numbers (false, inner_mask
, &seq
);
663 tree outer_size
= oacc_thread_numbers (false, outer_mask
, &seq
);
664 tree volume
= fold_build2 (MULT_EXPR
, TREE_TYPE (inner_size
),
665 inner_size
, outer_size
);
667 volume
= fold_convert (diff_type
, volume
);
669 chunk_size
= fold_convert (diff_type
, chunk_size
);
672 tree per
= fold_build2 (MULT_EXPR
, diff_type
, volume
, step
);
674 chunk_size
= build2 (MINUS_EXPR
, diff_type
, range
, dir
);
675 chunk_size
= build2 (PLUS_EXPR
, diff_type
, chunk_size
, per
);
676 chunk_size
= build2 (TRUNC_DIV_EXPR
, diff_type
, chunk_size
, per
);
679 tree span
= build2 (MULT_EXPR
, diff_type
, chunk_size
,
680 fold_convert (diff_type
, inner_size
));
681 r
= oacc_thread_numbers (true, outer_mask
, &seq
);
682 r
= fold_convert (diff_type
, r
);
683 r
= build2 (MULT_EXPR
, diff_type
, r
, span
);
685 tree inner
= oacc_thread_numbers (true, inner_mask
, &seq
);
686 inner
= fold_convert (diff_type
, inner
);
687 r
= fold_build2 (PLUS_EXPR
, diff_type
, r
, inner
);
691 tree chunk
= fold_convert (diff_type
, gimple_call_arg (call
, 6));
693 = fold_build2 (MULT_EXPR
, diff_type
, volume
, chunk_size
);
694 per
= build2 (MULT_EXPR
, diff_type
, per
, chunk
);
696 r
= build2 (PLUS_EXPR
, diff_type
, r
, per
);
699 r
= fold_build2 (MULT_EXPR
, diff_type
, r
, step
);
700 if (type
!= diff_type
)
701 r
= fold_convert (type
, r
);
704 case IFN_GOACC_LOOP_BOUND
:
709 tree inner_size
= oacc_thread_numbers (false, inner_mask
, &seq
);
710 tree outer_size
= oacc_thread_numbers (false, outer_mask
, &seq
);
711 tree volume
= fold_build2 (MULT_EXPR
, TREE_TYPE (inner_size
),
712 inner_size
, outer_size
);
714 volume
= fold_convert (diff_type
, volume
);
716 chunk_size
= fold_convert (diff_type
, chunk_size
);
719 tree per
= fold_build2 (MULT_EXPR
, diff_type
, volume
, step
);
721 chunk_size
= build2 (MINUS_EXPR
, diff_type
, range
, dir
);
722 chunk_size
= build2 (PLUS_EXPR
, diff_type
, chunk_size
, per
);
723 chunk_size
= build2 (TRUNC_DIV_EXPR
, diff_type
, chunk_size
, per
);
726 tree span
= build2 (MULT_EXPR
, diff_type
, chunk_size
,
727 fold_convert (diff_type
, inner_size
));
729 r
= fold_build2 (MULT_EXPR
, diff_type
, span
, step
);
731 tree offset
= gimple_call_arg (call
, 6);
732 r
= build2 (PLUS_EXPR
, diff_type
, r
,
733 fold_convert (diff_type
, offset
));
734 r
= build2 (integer_onep (dir
) ? MIN_EXPR
: MAX_EXPR
,
735 diff_type
, r
, range
);
737 if (diff_type
!= type
)
738 r
= fold_convert (type
, r
);
742 gimplify_assign (lhs
, r
, &seq
);
744 pop_gimplify_context (NULL
);
746 gsi_replace_with_seq (&gsi
, seq
, true);
749 /* Transform a GOACC_TILE call. Determines the element loop span for
750 the specified loop of the nest. This is 1 if we're not tiling.
752 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
755 oacc_xform_tile (gcall
*call
)
757 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
758 unsigned collapse
= tree_to_uhwi (gimple_call_arg (call
, 0));
759 /* Inner loops have higher loop_nos. */
760 unsigned loop_no
= tree_to_uhwi (gimple_call_arg (call
, 1));
761 tree tile_size
= gimple_call_arg (call
, 2);
762 unsigned e_mask
= tree_to_uhwi (gimple_call_arg (call
, 4));
763 tree lhs
= gimple_call_lhs (call
);
764 tree type
= TREE_TYPE (lhs
);
765 gimple_seq seq
= NULL
;
766 tree span
= build_int_cst (type
, 1);
769 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR
)
770 | GOMP_DIM_MASK (GOMP_DIM_WORKER
))));
771 push_gimplify_context (!seen_error ());
773 #ifndef ACCEL_COMPILER
774 /* Partitioning disabled on host compilers. */
778 /* Not partitioning. */
779 span
= integer_one_node
;
780 else if (!integer_zerop (tile_size
))
781 /* User explicitly specified size. */
785 /* Pick a size based on the partitioning of the element loop and
786 the number of loop nests. */
787 tree first_size
= NULL_TREE
;
788 tree second_size
= NULL_TREE
;
790 if (e_mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
791 first_size
= oacc_dim_call (false, GOMP_DIM_VECTOR
, &seq
);
792 if (e_mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
793 second_size
= oacc_dim_call (false, GOMP_DIM_WORKER
, &seq
);
797 first_size
= second_size
;
798 second_size
= NULL_TREE
;
801 if (loop_no
+ 1 == collapse
)
804 if (!loop_no
&& second_size
)
805 span
= fold_build2 (MULT_EXPR
, TREE_TYPE (span
),
808 else if (loop_no
+ 2 == collapse
)
814 /* There's no obvious element size for this loop. Options
815 are 1, first_size or some non-unity constant (32 is my
816 favourite). We should gather some statistics. */
820 span
= fold_convert (type
, span
);
821 gimplify_assign (lhs
, span
, &seq
);
823 pop_gimplify_context (NULL
);
825 gsi_replace_with_seq (&gsi
, seq
, true);
828 /* Default partitioned and minimum partitioned dimensions. */
830 static int oacc_default_dims
[GOMP_DIM_MAX
];
831 static int oacc_min_dims
[GOMP_DIM_MAX
];
/* Return the entry of oacc_default_dims for axis DIM. DIM is asserted
   to satisfy 0 <= DIM < GOMP_DIM_MAX. */
834 oacc_get_default_dim (int dim
)
836 gcc_assert (0 <= dim
&& dim
< GOMP_DIM_MAX
);
837 return oacc_default_dims
[dim
];
/* Return the entry of oacc_min_dims for axis DIM. DIM is asserted
   to satisfy 0 <= DIM < GOMP_DIM_MAX. */
841 oacc_get_min_dim (int dim
)
843 gcc_assert (0 <= dim
&& dim
< GOMP_DIM_MAX
);
844 return oacc_min_dims
[dim
];
847 /* Parse the default dimension parameter. This is a set of
848 :-separated optional compute dimensions. Each specified dimension
849 is a positive integer. When device type support is added, it is
850 planned to be a comma separated list of such compute dimensions,
851 with all but the first prefixed by the colon-terminated device type. */
855 oacc_parse_default_dims (const char *dims
)
859 for (ix
= GOMP_DIM_MAX
; ix
--;)
861 oacc_default_dims
[ix
] = -1;
862 oacc_min_dims
[ix
] = 1;
865 #ifndef ACCEL_COMPILER
866 /* Cannot be overridden on the host. */
871 const char *pos
= dims
;
873 for (ix
= 0; *pos
&& ix
!= GOMP_DIM_MAX
; ix
++)
888 val
= strtol (pos
, CONST_CAST (char **, &eptr
), 10);
889 if (errno
|| val
<= 0 || (int) val
!= val
)
892 oacc_default_dims
[ix
] = (int) val
;
898 error_at (UNKNOWN_LOCATION
,
899 "%<-fopenacc-dim%> operand is malformed at %qs", pos
);
903 /* Allow the backend to validate the dimensions. */
904 targetm
.goacc
.validate_dims (NULL_TREE
, oacc_default_dims
, -1, 0);
905 targetm
.goacc
.validate_dims (NULL_TREE
, oacc_min_dims
, -2, 0);
908 /* Validate and update the dimensions for offloaded FN. ATTRS is the
909 raw attribute. DIMS is an array of dimensions, which is filled in.
910 LEVEL is the partitioning level of a routine, or -1 for an offload
911 region itself. USED is the mask of partitioned execution in the function. */
915 oacc_validate_dims (tree fn
, tree attrs
, int *dims
, int level
, unsigned used
)
917 tree purpose
[GOMP_DIM_MAX
];
919 tree pos
= TREE_VALUE (attrs
);
921 /* Make sure the attribute creator attached the dimension information. */
925 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
927 purpose
[ix
] = TREE_PURPOSE (pos
);
928 tree val
= TREE_VALUE (pos
);
929 dims
[ix
] = val
? TREE_INT_CST_LOW (val
) : -1;
930 pos
= TREE_CHAIN (pos
);
934 #ifdef ACCEL_COMPILER
938 && warn_openacc_parallelism
939 && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn
)))
941 static char const *const axes
[] =
942 /* Must be kept in sync with GOMP_DIM enumeration. */
943 { "gang", "worker", "vector" };
944 for (ix
= level
>= 0 ? level
: 0; ix
!= GOMP_DIM_MAX
; ix
++)
946 ; /* Defaulting axis. */
947 else if ((used
& GOMP_DIM_MASK (ix
)) && dims
[ix
] == 1)
948 /* There is partitioned execution, but the user requested a
949 dimension size of 1. They're probably confused. */
950 warning_at (DECL_SOURCE_LOCATION (fn
), OPT_Wopenacc_parallelism
,
951 "region contains %s partitioned code but"
952 " is not %s partitioned", axes
[ix
], axes
[ix
]);
953 else if (!(used
& GOMP_DIM_MASK (ix
)) && dims
[ix
] != 1)
954 /* The dimension is explicitly partitioned to non-unity, but
955 no use is made within the region. */
956 warning_at (DECL_SOURCE_LOCATION (fn
), OPT_Wopenacc_parallelism
,
957 "region is %s partitioned but"
958 " does not contain %s partitioned code",
962 bool changed
= targetm
.goacc
.validate_dims (fn
, dims
, level
, used
);
964 /* Default anything left to 1 or a partitioned default. */
965 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
968 /* The OpenACC spec says 'If the [num_gangs] clause is not
969 specified, an implementation-defined default will be used;
970 the default may depend on the code within the construct.'
971 (2.5.6). Thus an implementation is free to choose
972 non-unity default for a parallel region that doesn't have
973 any gang-partitioned loops. However, it appears that there
974 is a sufficient body of user code that expects non-gang
975 partitioned regions to not execute in gang-redundant mode.
976 So we (a) don't warn about the non-portability and (b) pick
977 the minimum permissible dimension size when there is no
978 partitioned execution. Otherwise we pick the global
979 default for the dimension, which the user can control. The
980 same wording and logic applies to num_workers and
981 vector_length, however the worker- or vector- single
982 execution doesn't have the same impact as gang-redundant
983 execution. (If the minimum gang-level partitioning is not 1,
984 the target is probably too confusing.) */
985 dims
[ix
] = (used
& GOMP_DIM_MASK (ix
)
986 ? oacc_default_dims
[ix
] : oacc_min_dims
[ix
]);
992 /* Replace the attribute with new values. */
994 for (ix
= GOMP_DIM_MAX
; ix
--;)
995 pos
= tree_cons (purpose
[ix
],
996 build_int_cst (integer_type_node
, dims
[ix
]), pos
);
997 oacc_replace_fn_attrib (fn
, pos
);
1001 /* Create an empty OpenACC loop structure at LOC. */
1004 new_oacc_loop_raw (oacc_loop
*parent
, location_t loc
)
/* NOTE(review): XCNEW is presumably the libiberty allocator that returns
   zero-filled storage, so unset fields would start out as zero -- confirm. */
1006 oacc_loop
*loop
= XCNEW (oacc_loop
);
1008 loop
->parent
= parent
;
/* Push the new loop onto the front of PARENT's child list: the previous
   first child becomes the new loop's sibling. */
1012 loop
->sibling
= parent
->child
;
1013 parent
->child
= loop
;
1020 /* Create an outermost, dummy OpenACC loop for offloaded function DECL. */
1024 new_oacc_loop_outer (tree decl
)
1026 return new_oacc_loop_raw (NULL
, DECL_SOURCE_LOCATION (decl
));
1029 /* Start a new OpenACC loop structure beginning at head marker MARKER.
1030 Link into PARENT loop. Return the new loop. */
1033 new_oacc_loop (oacc_loop
*parent
, gcall
*marker
)
1035 oacc_loop
*loop
= new_oacc_loop_raw (parent
, gimple_location (marker
));
1037 loop
->marker
= marker
;
1039 /* TODO: This is where device_type flattening would occur for the loop flags. */
1042 loop
->flags
= TREE_INT_CST_LOW (gimple_call_arg (marker
, 3));
1044 tree chunk_size
= integer_zero_node
;
1045 if (loop
->flags
& OLF_GANG_STATIC
)
1046 chunk_size
= gimple_call_arg (marker
, 4);
1047 loop
->chunk_size
= chunk_size
;
1052 /* Create a dummy loop encompassing a call to an OpenACC routine.
1053 Extract the routine's partitioning requirements. */
1056 new_oacc_loop_routine (oacc_loop
*parent
, gcall
*call
, tree decl
, tree attrs
)
/* The dummy loop is located at the call statement itself. */
1058 oacc_loop
*loop
= new_oacc_loop_raw (parent
, gimple_location (call
));
/* oacc_fn_attrib_level is documented to yield -1 when ATTRS does not
   describe a routine; that must not happen for a routine call, hence
   the assertion below. */
1059 int level
= oacc_fn_attrib_level (attrs
);
1061 gcc_assert (level
>= 0);
1063 loop
->marker
= call
;
1064 loop
->routine
= decl
;
/* NOTE(review): assuming GOMP_DIM_MASK (X) is the single bit 1 << X, this
   XOR keeps the bits for axes LEVEL .. GOMP_DIM_MAX - 1, i.e. the axes the
   routine may partition over -- confirm against gomp-constants.h. */
1065 loop
->mask
= ((GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1)
1066 ^ (GOMP_DIM_MASK (level
) - 1));
1069 /* Finish off the current OpenACC loop LOOP.
1070 Return the parent loop. */
1073 finish_oacc_loop (oacc_loop
*loop
)
1075 /* If the loop has been collapsed, don't partition it: a loop that
   recorded no contained loop abstraction functions gets both its
   partitioning mask and its flags cleared. */
1076 if (loop
->ifns
.is_empty ())
1077 loop
->mask
= loop
->flags
= 0;
1078 return loop
->parent
;
1081 /* Free all OpenACC loop structures within LOOP (inclusive). */
1084 free_oacc_loop (oacc_loop
*loop
)
/* Recurse over the remainder of the sibling chain. */
1087 free_oacc_loop (loop
->sibling
);
/* Recurse over the loops nested inside this one. */
1089 free_oacc_loop (loop
->child
);
/* Release the vector of contained loop abstraction functions. */
1091 loop
->ifns
.release ();
1095 /* Dump out the OpenACC loop head or tail beginning at FROM. */
1098 dump_oacc_loop_part (FILE *file
, gcall
*from
, int depth
,
1099 const char *title
, int level
)
1101 enum ifn_unique_kind kind
1102 = (enum ifn_unique_kind
) TREE_INT_CST_LOW (gimple_call_arg (from
, 0));
1104 fprintf (file
, "%*s%s-%d:\n", depth
* 2, "", title
, level
);
1105 for (gimple_stmt_iterator gsi
= gsi_for_stmt (from
);;)
1107 gimple
*stmt
= gsi_stmt (gsi
);
1109 if (gimple_call_internal_p (stmt
, IFN_UNIQUE
))
1111 enum ifn_unique_kind k
1112 = ((enum ifn_unique_kind
) TREE_INT_CST_LOW
1113 (gimple_call_arg (stmt
, 0)));
1115 if (k
== kind
&& stmt
!= from
)
1118 print_gimple_stmt (file
, stmt
, depth
* 2 + 2);
1121 while (gsi_end_p (gsi
))
1122 gsi
= gsi_start_bb (single_succ (gsi_bb (gsi
)));
1126 /* Dump OpenACC loop LOOP, its children, and its siblings. */
// Output goes to FILE, indented by two columns per DEPTH.  In order, the
// code below prints a Loop line, the marker statement, a Routine line, the
// head sequences and the tail sequences, and then recurses.
// NOTE(review): the guards around the marker, routine and recursion
// statements are not shown here -- confirm which output is conditional.
1129 dump_oacc_loop (FILE *file
, oacc_loop
*loop
, int depth
)
// Loop line: flags and mask in hex, followed by the source file and line.
1133 fprintf (file
, "%*sLoop %x(%x) %s:%u\n", depth
* 2, "",
1134 loop
->flags
, loop
->mask
,
1135 LOCATION_FILE (loop
->loc
), LOCATION_LINE (loop
->loc
));
1138 print_gimple_stmt (file
, loop
->marker
, depth
* 2);
// Routine line: declaration file, line and name of the routine decl.
1141 fprintf (file
, "%*sRoutine %s:%u:%s\n",
1142 depth
* 2, "", DECL_SOURCE_FILE (loop
->routine
),
1143 DECL_SOURCE_LINE (loop
->routine
),
1144 IDENTIFIER_POINTER (DECL_NAME (loop
->routine
)));
// Heads are dumped from GOMP_DIM_GANG upwards ...
1146 for (ix
= GOMP_DIM_GANG
; ix
!= GOMP_DIM_MAX
; ix
++)
1147 if (loop
->heads
[ix
])
1148 dump_oacc_loop_part (file
, loop
->heads
[ix
], depth
, "Head", ix
);
// ... and tails in the opposite order, from GOMP_DIM_MAX - 1 down to 0.
1149 for (ix
= GOMP_DIM_MAX
; ix
--;)
1150 if (loop
->tails
[ix
])
1151 dump_oacc_loop_part (file
, loop
->tails
[ix
], depth
, "Tail", ix
);
// Children are dumped one level deeper, siblings at the same depth.
1154 dump_oacc_loop (file
, loop
->child
, depth
+ 1);
1156 dump_oacc_loop (file
, loop
->sibling
, depth
);
1159 void debug_oacc_loop (oacc_loop
*);
1161 /* Dump loops to stderr. */
1164 debug_oacc_loop (oacc_loop
*loop
)
1166 dump_oacc_loop (stderr
, loop
, 0);
1169 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1173 inform_oacc_loop (const oacc_loop
*loop
)
// Each of the three strings computed next is the dimension name with a
// leading space when the corresponding bit is set in the loop mask, and
// empty otherwise.
1176 = loop
->mask
& GOMP_DIM_MASK (GOMP_DIM_GANG
) ? " gang" : "";
1178 = loop
->mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
) ? " worker" : "";
1180 = loop
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
) ? " vector" : "";
// A mask of zero is reported as a sequential loop.
1181 const char *seq
= loop
->mask
== 0 ? " seq" : "";
// The message is emitted as an optimization remark at the loop location.
1182 const dump_user_location_t loc
1183 = dump_user_location_t::from_location_t (loop
->loc
);
1184 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, loc
,
1185 "assigned OpenACC%s%s%s%s loop parallelism\n", gang
, worker
,
// Recurse into nested loops and along the sibling chain.
1189 inform_oacc_loop (loop
->child
);
1191 inform_oacc_loop (loop
->sibling
);
1194 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
1195 structures as we go. By construction these loops are properly
// LOOP is the loop structure that is current on entry to BB.  The code
// below tests BB for BB_VISITED, marks it visited, scans its statements
// for routine calls, abstraction calls and head/tail markers, and finally
// recurses into every successor edge.
1199 oacc_loop_discover_walk (oacc_loop
*loop
, basic_block bb
)
1204 if (bb
->flags
& BB_VISITED
)
1208 bb
->flags
|= BB_VISITED
;
1210 /* Scan for loop markers. */
1211 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
1214 gimple
*stmt
= gsi_stmt (gsi
);
1216 if (!is_gimple_call (stmt
))
1219 gcall
*call
= as_a
<gcall
*> (stmt
);
1221 /* If this is a routine, make a dummy loop for it. */
1222 if (tree decl
= gimple_call_fndecl (call
))
1223 if (tree attrs
= oacc_get_fn_attrib (decl
))
// A routine call is not expected while a marker sequence is open.
1225 gcc_assert (!marker
);
1226 new_oacc_loop_routine (loop
, call
, decl
, attrs
);
1229 if (!gimple_call_internal_p (call
))
1232 switch (gimple_call_internal_fn (call
))
1237 case IFN_GOACC_LOOP
:
1238 case IFN_GOACC_TILE
:
1239 /* Record the abstraction function, so we can manipulate it
1241 loop
->ifns
.safe_push (call
);
// The marker kind is encoded as the first argument of the call.
1245 enum ifn_unique_kind kind
1246 = (enum ifn_unique_kind
) (TREE_INT_CST_LOW
1247 (gimple_call_arg (call
, 0)));
1248 if (kind
== IFN_UNIQUE_OACC_HEAD_MARK
1249 || kind
== IFN_UNIQUE_OACC_TAIL_MARK
)
// A marker with exactly two arguments is handled separately: a sequence
// must be open with nothing left to count, and a tail marker finishes the
// current loop so that LOOP becomes its parent.
// NOTE(review): the head_end assignment below is presumably the
// alternative arm of the tail test (the line between is not shown).
1251 if (gimple_call_num_args (call
) == 2)
1253 gcc_assert (marker
&& !remaining
);
1255 if (kind
== IFN_UNIQUE_OACC_TAIL_MARK
)
1256 loop
= finish_oacc_loop (loop
);
1258 loop
->head_end
= call
;
// Other markers carry a count in argument 2; a head marker can open a new
// loop nested in LOOP.
1262 int count
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
1266 if (kind
== IFN_UNIQUE_OACC_HEAD_MARK
)
1267 loop
= new_oacc_loop (loop
, call
);
1270 gcc_assert (count
== remaining
);
// Remember the call that starts this head or tail sequence.
1274 if (kind
== IFN_UNIQUE_OACC_HEAD_MARK
)
1275 loop
->heads
[marker
] = call
;
1277 loop
->tails
[remaining
] = call
;
// While a marker sequence is still open, move on to the single successor
// block, which must have a single predecessor and be unvisited.
1284 if (remaining
|| marker
)
1286 bb
= single_succ (bb
);
1287 gcc_assert (single_pred_p (bb
) && !(bb
->flags
& BB_VISITED
));
1291 /* Walk successor blocks. */
1295 FOR_EACH_EDGE (e
, ei
, bb
->succs
)
1296 oacc_loop_discover_walk (loop
, e
->dest
);
1299 /* LOOP is the first sibling. Reverse the order in place and return
1300 the new first sibling. Recurse to child loops. */
1303 oacc_loop_sibling_nreverse (oacc_loop
*loop
)
// LAST trails the walk: each visited loop has its sibling link redirected
// to LAST.
1305 oacc_loop
*last
= NULL
;
// Reverse the children of this loop as well.
1309 loop
->child
= oacc_loop_sibling_nreverse (loop
->child
);
// Save the forward link before overwriting it with the backward one.
1311 oacc_loop
*next
= loop
->sibling
;
1312 loop
->sibling
= last
;
1321 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1322 the current function. */
1325 oacc_loop_discovery ()
1327 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1328 in the following. */
// The outermost loop structure stands for current_function_decl; the walk
// starts at the function's entry block.
1331 oacc_loop
*top
= new_oacc_loop_outer (current_function_decl
);
1332 oacc_loop_discover_walk (top
, ENTRY_BLOCK_PTR_FOR_FN (cfun
));
1334 /* The siblings were constructed in reverse order, reverse them so
1335 that diagnostics come out in an unsurprising order. */
1336 top
= oacc_loop_sibling_nreverse (top
);
1341 /* Transform the abstract internal function markers starting at FROM
1342 to be for partitioning level LEVEL. Stop when we meet another HEAD
1346 oacc_loop_xform_head_tail (gcall
*from
, int level
)
// The marker kind is the first argument of FROM.
1348 enum ifn_unique_kind kind
1349 = (enum ifn_unique_kind
) TREE_INT_CST_LOW (gimple_call_arg (from
, 0));
// LEVEL is built once as an unsigned integer constant and stored into
// every matching call below.
1350 tree replacement
= build_int_cst (unsigned_type_node
, level
);
1352 for (gimple_stmt_iterator gsi
= gsi_for_stmt (from
);;)
1354 gimple
*stmt
= gsi_stmt (gsi
);
1356 if (gimple_call_internal_p (stmt
, IFN_UNIQUE
))
1358 enum ifn_unique_kind k
1359 = ((enum ifn_unique_kind
)
1360 TREE_INT_CST_LOW (gimple_call_arg (stmt
, 0)));
// Fork, join and private calls take the level as argument 2.
1362 if (k
== IFN_UNIQUE_OACC_FORK
1363 || k
== IFN_UNIQUE_OACC_JOIN
1364 || k
== IFN_UNIQUE_OACC_PRIVATE
)
1365 *gimple_call_arg_ptr (stmt
, 2) = replacement
;
// Another marker of the same kind as FROM.
// NOTE(review): the guarded statement is not shown here; presumably it
// ends the walk -- confirm.
1366 else if (k
== kind
&& stmt
!= from
)
// Reduction calls take the level as argument 3.
1369 else if (gimple_call_internal_p (stmt
, IFN_GOACC_REDUCTION
))
1370 *gimple_call_arg_ptr (stmt
, 3) = replacement
;
// At the end of a block, continue at the start of its single successor.
1374 while (gsi_end_p (gsi
))
1375 gsi
= gsi_start_bb (single_succ (gsi_bb (gsi
)));
1379 /* Process the discovered OpenACC loops, setting the correct
1380 partitioning level etc. */
1383 oacc_loop_process (oacc_loop
*loop
)
// Nested loops are handled before this one.
1386 oacc_loop_process (loop
->child
);
// Only loops that received a partitioning mask and are not routine
// pseudo-loops have their calls rewritten.
1388 if (loop
->mask
&& !loop
->routine
)
// Constants stored into the recorded abstraction calls.
1391 tree mask_arg
= build_int_cst (unsigned_type_node
, loop
->mask
);
1392 tree e_mask_arg
= build_int_cst (unsigned_type_node
, loop
->e_mask
);
1393 tree chunk_arg
= loop
->chunk_size
;
1396 for (ix
= 0; loop
->ifns
.iterate (ix
, &call
); ix
++)
1398 switch (gimple_call_internal_fn (call
))
1400 case IFN_GOACC_LOOP
:
// Argument 5 being integer_minus_one_node selects the element mask;
// otherwise the loop mask is stored.
1402 bool is_e
= gimple_call_arg (call
, 5) == integer_minus_one_node
;
1403 gimple_call_set_arg (call
, 5, is_e
? e_mask_arg
: mask_arg
);
1405 gimple_call_set_arg (call
, 4, chunk_arg
);
// Tile calls receive both masks, as arguments 3 and 4.
1409 case IFN_GOACC_TILE
:
1410 gimple_call_set_arg (call
, 3, mask_arg
);
1411 gimple_call_set_arg (call
, 4, e_mask_arg
);
// Head/tail sequence IX is rewritten for DIM, a dimension whose bit is
// set in the combined mask; that bit is then cleared from MASK.
1420 unsigned dim
= GOMP_DIM_GANG
;
1421 unsigned mask
= loop
->mask
| loop
->e_mask
;
1422 for (ix
= 0; ix
!= GOMP_DIM_MAX
&& mask
; ix
++)
1424 while (!(GOMP_DIM_MASK (dim
) & mask
))
1427 oacc_loop_xform_head_tail (loop
->heads
[ix
], dim
);
1428 oacc_loop_xform_head_tail (loop
->tails
[ix
], dim
);
1430 mask
^= GOMP_DIM_MASK (dim
);
// Continue along the sibling chain.
1435 oacc_loop_process (loop
->sibling
);
1438 /* Walk the OpenACC loop hierarchy checking and assigning the
1439 programmer-specified partitionings. OUTER_MASK is the partitioning
1440 this loop is contained within. Return mask of partitioning
1441 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1445 oacc_loop_fixed_partitions (oacc_loop
*loop
, unsigned outer_mask
)
1447 unsigned this_mask
= loop
->mask
;
1448 unsigned mask_all
= 0;
1451 #ifdef ACCEL_COMPILER
1452 /* When device_type is supported, we want the device compiler to be
1453 noisy, if the loop parameters are device_type-specific. */
// Decode the clause flags of this loop.
1459 bool auto_par
= (loop
->flags
& OLF_AUTO
) != 0;
1460 bool seq_par
= (loop
->flags
& OLF_SEQ
) != 0;
1461 bool tiling
= (loop
->flags
& OLF_TILE
) != 0;
// The explicitly requested dimensions are stored in the loop flags
// starting at bit OLF_DIM_BASE.
1463 this_mask
= ((loop
->flags
>> OLF_DIM_BASE
)
1464 & (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1));
1466 /* Apply auto partitioning if this is a non-partitioned regular
1467 loop, or (no more than) single axis tiled loop. */
// this_mask & -this_mask isolates the lowest set bit, so the comparison
// holds for a tiled loop when at most one dimension bit is set.
1469 = !seq_par
&& this_mask
== (tiling
? this_mask
& -this_mask
: 0);
// More than one of explicit dimensions, auto and seq is a conflict.
1471 if ((this_mask
!= 0) + auto_par
+ seq_par
> 1)
1474 error_at (loop
->loc
,
1476 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1477 : G_("%<auto%> conflicts with other OpenACC loop "
1480 loop
->flags
&= ~OLF_AUTO
;
1484 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1) << OLF_DIM_BASE
);
// An independent loop eligible for auto partitioning is flagged OLF_AUTO,
// and the GOMP_DIM_MAX bit is set in the result.
1489 if (maybe_auto
&& (loop
->flags
& OLF_INDEPENDENT
))
1491 loop
->flags
|= OLF_AUTO
;
1492 mask_all
|= GOMP_DIM_MASK (GOMP_DIM_MAX
);
// A requested dimension is already in use outside this loop.  Search the
// ancestors for the loop using it, report the clash, then drop the
// clashing dimensions.
1496 if (this_mask
& outer_mask
)
1498 const oacc_loop
*outer
;
1499 for (outer
= loop
->parent
; outer
; outer
= outer
->parent
)
1500 if ((outer
->mask
| outer
->e_mask
) & this_mask
)
1507 error_at (loop
->loc
,
1509 ? G_("routine call uses same OpenACC parallelism"
1510 " as containing loop")
1511 : G_("inner loop uses same OpenACC parallelism"
1512 " as containing loop"));
1513 inform (outer
->loc
, "containing loop here");
1516 error_at (loop
->loc
,
1518 ? G_("routine call uses OpenACC parallelism disallowed"
1519 " by containing routine")
1520 : G_("loop uses OpenACC parallelism disallowed"
1521 " by containing routine"));
1524 inform (DECL_SOURCE_LOCATION (loop
->routine
),
1525 "routine %qD declared here", loop
->routine
);
1527 this_mask
&= ~outer_mask
;
// OUTERMOST is the lowest set bit of what remains; it is a nesting error
// if it does not exceed the outer mask.
1531 unsigned outermost
= least_bit_hwi (this_mask
);
1533 if (outermost
&& outermost
<= outer_mask
)
1537 error_at (loop
->loc
,
1538 "incorrectly nested OpenACC loop parallelism");
1540 const oacc_loop
*outer
;
1541 for (outer
= loop
->parent
;
1542 outer
->flags
&& outer
->flags
< outermost
;
1543 outer
= outer
->parent
)
1545 inform (outer
->loc
, "containing loop here");
1548 this_mask
&= ~outermost
;
1552 mask_all
|= this_mask
;
1554 if (loop
->flags
& OLF_TILE
)
1556 /* When tiling, vector goes to the element loop, and failing
1557 that we put worker there. The std doesn't contemplate
1558 specifying all three. We choose to put worker and vector on
1559 the element loops in that case. */
1560 unsigned this_e_mask
= this_mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
);
1561 if (!this_e_mask
|| this_mask
& GOMP_DIM_MASK (GOMP_DIM_GANG
))
1562 this_e_mask
|= this_mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
);
// Dimensions given to the element loop are removed from the outer mask.
1564 loop
->e_mask
= this_e_mask
;
1565 this_mask
^= this_e_mask
;
1568 loop
->mask
= this_mask
;
1571 fprintf (dump_file
, "Loop %s:%d user specified %d & %d\n",
1572 LOCATION_FILE (loop
->loc
), LOCATION_LINE (loop
->loc
),
1573 loop
->mask
, loop
->e_mask
);
// Children see the outer mask plus everything this loop uses; what they
// report back is remembered in 'inner'.
1577 unsigned tmp_mask
= outer_mask
| this_mask
| loop
->e_mask
;
1578 loop
->inner
= oacc_loop_fixed_partitions (loop
->child
, tmp_mask
);
1579 mask_all
|= loop
->inner
;
// Siblings are checked against the unchanged outer mask.
1583 mask_all
|= oacc_loop_fixed_partitions (loop
->sibling
, outer_mask
);
1588 /* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
1589 OUTER_MASK is the partitioning this loop is contained within.
1590 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1591 Return the cumulative partitioning used by this loop, siblings and
1595 oacc_loop_auto_partitions (oacc_loop
*loop
, unsigned outer_mask
,
// Assignment applies only to loops flagged both OLF_AUTO and
// OLF_INDEPENDENT.
1598 bool assign
= (loop
->flags
& OLF_AUTO
) && (loop
->flags
& OLF_INDEPENDENT
);
1600 bool tiling
= loop
->flags
& OLF_TILE
;
1602 #ifdef ACCEL_COMPILER
1603 /* When device_type is supported, we want the device compiler to be
1604 noisy, if the loop parameters are device_type-specific. */
// First step: outermost placement, for loops with no auto-partitioned
// ancestor or with partitioning used inside them.
1608 if (assign
&& (!outer_assign
|| loop
->inner
))
1610 /* Allocate outermost and non-innermost loops at the outermost
1611 non-innermost available level. */
1612 unsigned this_mask
= GOMP_DIM_MASK (GOMP_DIM_GANG
);
1614 /* Find the first outermost available partition. */
1615 while (this_mask
<= outer_mask
)
1618 /* Grab two axes if tiling, and we've not assigned anything */
1619 if (tiling
&& !(loop
->mask
| loop
->e_mask
))
1620 this_mask
|= this_mask
<< 1;
1622 /* Prohibit the innermost partitioning at the moment. */
1623 this_mask
&= GOMP_DIM_MASK (GOMP_DIM_MAX
- 1) - 1;
1625 /* Don't use any dimension explicitly claimed by an inner loop. */
1626 this_mask
&= ~loop
->inner
;
1628 if (tiling
&& !loop
->e_mask
)
1630 /* If we got two axes, allocate the inner one to the element
// this_mask & (this_mask << 1) keeps the higher bit of an adjacent pair
// and is zero when only one bit is set.
1632 loop
->e_mask
= this_mask
& (this_mask
<< 1);
1633 this_mask
^= loop
->e_mask
;
1636 loop
->mask
|= this_mask
;
// Nested loops see everything claimed so far and learn whether an
// enclosing loop is being auto-partitioned.
1641 unsigned tmp_mask
= outer_mask
| loop
->mask
| loop
->e_mask
;
1642 loop
->inner
= oacc_loop_auto_partitions (loop
->child
, tmp_mask
,
1643 outer_assign
| assign
);
// Second step: innermost placement, done after the children have been
// processed.
1646 if (assign
&& (!loop
->mask
|| (tiling
&& !loop
->e_mask
) || !outer_assign
))
1648 /* Allocate the loop at the innermost available level. Note
1649 that we do this even if we already assigned this loop the
1650 outermost available level above. That way we'll partition
1651 this along 2 axes, if they are available. */
1652 unsigned this_mask
= 0;
1654 /* Determine the outermost partitioning used within this loop. */
1655 this_mask
= loop
->inner
| GOMP_DIM_MASK (GOMP_DIM_MAX
);
1656 this_mask
= least_bit_hwi (this_mask
);
1658 /* Pick the partitioning just inside that one. */
1661 /* And avoid picking one use by an outer loop. */
1662 this_mask
&= ~outer_mask
;
1664 /* If tiling and we failed completely above, grab the next one
1665 too. Making sure it doesn't hit an outer loop. */
1668 this_mask
&= ~(loop
->e_mask
| loop
->mask
);
1669 unsigned tile_mask
= ((this_mask
>> 1)
1670 & ~(outer_mask
| loop
->e_mask
| loop
->mask
));
// The dimension found goes to the element loop, and TILE_MASK (possibly
// zero) becomes the candidate for the outer mask.
1672 if (tile_mask
|| loop
->mask
)
1674 loop
->e_mask
|= this_mask
;
1675 this_mask
= tile_mask
;
1677 if (!loop
->e_mask
&& noisy
)
1678 warning_at (loop
->loc
, 0,
1679 "insufficient partitioning available"
1680 " to parallelize element loop");
1683 loop
->mask
|= this_mask
;
1684 if (!loop
->mask
&& noisy
)
1685 warning_at (loop
->loc
, 0,
1687 ? G_("insufficient partitioning available"
1688 " to parallelize tile loop")
1689 : G_("insufficient partitioning available"
1690 " to parallelize loop"));
1693 if (assign
&& dump_file
)
1694 fprintf (dump_file
, "Auto loop %s:%d assigned %d & %d\n",
1695 LOCATION_FILE (loop
->loc
), LOCATION_LINE (loop
->loc
),
1696 loop
->mask
, loop
->e_mask
);
// Accumulate what the siblings, the nested loops and this loop use.
1698 unsigned inner_mask
= 0;
1701 inner_mask
|= oacc_loop_auto_partitions (loop
->sibling
,
1702 outer_mask
, outer_assign
);
1704 inner_mask
|= loop
->inner
| loop
->mask
| loop
->e_mask
;
1709 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1710 axes. Return mask of partitioning. */
1713 oacc_loop_partition (oacc_loop
*loop
, unsigned outer_mask
)
1715 unsigned mask_all
= oacc_loop_fixed_partitions (loop
, outer_mask
);
1717 if (mask_all
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
1719 mask_all
^= GOMP_DIM_MASK (GOMP_DIM_MAX
);
1720 mask_all
|= oacc_loop_auto_partitions (loop
, outer_mask
, false);
1725 /* Default fork/join early expander. Delete the function calls if
1726 there is no RTL expander. */
1729 default_goacc_fork_join (gcall
*ARG_UNUSED (call
),
1730 const int *ARG_UNUSED (dims
), bool is_fork
)
1733 return targetm
.have_oacc_fork ();
1735 return targetm
.have_oacc_join ();
1738 /* Default goacc.reduction early expander.
1740 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1741 If RES_PTR is not integer-zerop:
1742 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1743 TEARDOWN - emit '*RES_PTR = VAR'
1748 default_goacc_reduction (gcall
*call
)
// Argument 0 is the reduction sub-operation, argument 2 the variable.
// Replacement statements are collected in SEQ.
1750 unsigned code
= (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call
, 0));
1751 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
1752 tree lhs
= gimple_call_lhs (call
);
1753 tree var
= gimple_call_arg (call
, 2);
1754 gimple_seq seq
= NULL
;
1756 if (code
== IFN_GOACC_REDUCTION_SETUP
1757 || code
== IFN_GOACC_REDUCTION_TEARDOWN
)
1759 /* Setup and Teardown need to copy from/to the receiver object,
// Argument 1 is the receiver pointer; a zero constant means there is no
// receiver object.
1761 tree ref_to_res
= gimple_call_arg (call
, 1);
1763 if (!integer_zerop (ref_to_res
))
1765 tree dst
= build_simple_mem_ref (ref_to_res
);
1768 if (code
== IFN_GOACC_REDUCTION_SETUP
)
1774 gimple_seq_add_stmt (&seq
, gimple_build_assign (dst
, src
));
1778 /* Copy VAR to LHS, if there is an LHS. */
1780 gimple_seq_add_stmt (&seq
, gimple_build_assign (lhs
, var
));
// The call statement is replaced by the collected sequence.
1782 gsi_replace_with_seq (&gsi
, seq
, true);
// State handed to oacc_rewrite_var_decl through the 'info' field of
// walk_stmt_info.
1785 struct var_decl_rewrite_info
// Lookup table from an original decl to its replacement.
1788 hash_map
<tree
, tree
> *adjusted_vars
;
// When set, an ADDR_EXPR is rebuilt directly on the replacement decl
// instead of being converted back to the original pointer type.
1789 bool avoid_pointer_conversion
;
1793 /* Helper function for execute_oacc_device_lower. Rewrite VAR_DECLs (by
1794 themselves or wrapped in various other nodes) according to ADJUSTED_VARS in
1795 the var_decl_rewrite_info pointed to via DATA. Used as part of coercing
1796 gang-private variables in OpenACC offload regions to reside in GPU shared
// Operand-walk callback: TP is the operand under inspection and DATA is
// the walk_stmt_info whose 'info' field carries the rewrite state.
1800 oacc_rewrite_var_decl (tree
*tp
, int *walk_subtrees
, void *data
)
1802 walk_stmt_info
*wi
= (walk_stmt_info
*) data
;
1803 var_decl_rewrite_info
*info
= (var_decl_rewrite_info
*) wi
->info
;
// Case 1: the address of a variable.
1805 if (TREE_CODE (*tp
) == ADDR_EXPR
)
1807 tree arg
= TREE_OPERAND (*tp
, 0);
1808 tree
*new_arg
= info
->adjusted_vars
->get (arg
);
// Either take the address of the replacement decl in place ...
1812 if (info
->avoid_pointer_conversion
)
1814 *tp
= build_fold_addr_expr (*new_arg
);
1815 info
->modified
= true;
// ... or materialize that address in one new SSA name, convert it to the
// original pointer type in a second one, insert both statements before
// the current statement, and use the converted value.
1820 gimple_stmt_iterator gsi
= gsi_for_stmt (info
->stmt
);
1821 tree repl
= build_fold_addr_expr (*new_arg
);
1823 = gimple_build_assign (make_ssa_name (TREE_TYPE (repl
)), repl
);
1824 tree conv
= convert_to_pointer (TREE_TYPE (*tp
),
1825 gimple_assign_lhs (stmt1
));
1827 = gimple_build_assign (make_ssa_name (TREE_TYPE (*tp
)), conv
);
1828 gsi_insert_before (&gsi
, stmt1
, GSI_SAME_STMT
);
1829 gsi_insert_before (&gsi
, stmt2
, GSI_SAME_STMT
);
1830 *tp
= gimple_assign_lhs (stmt2
);
1831 info
->modified
= true;
// Case 2: a component or array reference.  Peel nested references to find
// the base, which must be a VAR_DECL to be of interest.
1836 else if (TREE_CODE (*tp
) == COMPONENT_REF
|| TREE_CODE (*tp
) == ARRAY_REF
)
1838 tree
*base
= &TREE_OPERAND (*tp
, 0);
1840 while (TREE_CODE (*base
) == COMPONENT_REF
1841 || TREE_CODE (*base
) == ARRAY_REF
)
1842 base
= &TREE_OPERAND (*base
, 0);
1844 if (TREE_CODE (*base
) != VAR_DECL
)
1847 tree
*new_decl
= info
->adjusted_vars
->get (*base
);
// The type qualifiers of the replacement decl are merged into the field
// and reference types below.
1851 int base_quals
= TYPE_QUALS (TREE_TYPE (*new_decl
));
1852 tree field
= TREE_OPERAND (*tp
, 1);
1854 /* Adjust the type of the field. */
1855 int field_quals
= TYPE_QUALS (TREE_TYPE (field
));
1856 if (TREE_CODE (field
) == FIELD_DECL
&& field_quals
!= base_quals
)
// For array-typed fields, descend to the element type and qualify that.
1858 tree
*field_type
= &TREE_TYPE (field
);
1859 while (TREE_CODE (*field_type
) == ARRAY_TYPE
)
1860 field_type
= &TREE_TYPE (*field_type
);
1861 field_quals
|= base_quals
;
1862 *field_type
= build_qualified_type (*field_type
, field_quals
);
1865 /* Adjust the type of the component ref itself. */
1866 tree comp_type
= TREE_TYPE (*tp
);
1867 int comp_quals
= TYPE_QUALS (comp_type
);
1868 if (TREE_CODE (*tp
) == COMPONENT_REF
&& comp_quals
!= base_quals
)
1870 comp_quals
|= base_quals
;
1872 = build_qualified_type (comp_type
, comp_quals
);
1876 info
->modified
= true;
// Case 3: a bare variable.
1878 else if (TREE_CODE (*tp
) == VAR_DECL
)
1880 tree
*new_decl
= info
->adjusted_vars
->get (*tp
);
1884 info
->modified
= true;
1891 /* Return TRUE if CALL is a call to a builtin atomic/sync operation. */
1894 is_sync_builtin_call (gcall
*call
)
1896 tree callee
= gimple_call_fndecl (call
);
// Only calls with a known callee that is a normal builtin are examined.
1898 if (callee
!= NULL_TREE
1899 && gimple_call_builtin_p (call
, BUILT_IN_NORMAL
))
1900 switch (DECL_FUNCTION_CODE (callee
))
// Each DEF_SYNC_BUILTIN entry of sync-builtins.def expands to a case label
// for its enumerator.
1902 #undef DEF_SYNC_BUILTIN
1903 #define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
1904 #include "sync-builtins.def"
1905 #undef DEF_SYNC_BUILTIN
1915 /* Main entry point for oacc transformations which run on the device
1916 compiler after LTO, so we know what the target device is at this
1917 point (including the host fallback). */
// Stages visible below: classify the function from its attributes,
// optionally discard a routine, discover and partition the loops,
// validate the launch dimensions, rewrite the loop calls, and emit dumps
// and diagnostics.
1920 execute_oacc_loop_designation ()
1922 tree attrs
= oacc_get_fn_attrib (current_function_decl
);
1925 /* Not an offloaded function. */
1928 /* Parse the default dim argument exactly once. */
// After parsing, the flag is pointed at its own address, which is the
// value this test recognizes as already parsed.
1929 if ((const void *)flag_openacc_dims
!= &flag_openacc_dims
)
1931 oacc_parse_default_dims (flag_openacc_dims
);
1932 flag_openacc_dims
= (char *)&flag_openacc_dims
;
// Classify the function by the attribute naming the construct kind.
1935 bool is_oacc_parallel
1936 = (lookup_attribute ("oacc parallel",
1937 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1938 bool is_oacc_kernels
1939 = (lookup_attribute ("oacc kernels",
1940 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1942 = (lookup_attribute ("oacc serial",
1943 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1944 bool is_oacc_parallel_kernels_parallelized
1945 = (lookup_attribute ("oacc parallel_kernels_parallelized",
1946 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1947 bool is_oacc_parallel_kernels_gang_single
1948 = (lookup_attribute ("oacc parallel_kernels_gang_single",
1949 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
// A non-negative level identifies an OpenACC routine.
1950 int fn_level
= oacc_fn_attrib_level (attrs
);
1951 bool is_oacc_routine
= (fn_level
>= 0);
1952 gcc_checking_assert (is_oacc_parallel
1955 + is_oacc_parallel_kernels_parallelized
1956 + is_oacc_parallel_kernels_gang_single
// The parallelized marker is only expected on kernels functions.
1960 bool is_oacc_kernels_parallelized
1961 = (lookup_attribute ("oacc kernels parallelized",
1962 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1963 if (is_oacc_kernels_parallelized
)
1964 gcc_checking_assert (is_oacc_kernels
);
1968 if (is_oacc_parallel
)
1969 fprintf (dump_file
, "Function is OpenACC parallel offload\n");
1970 else if (is_oacc_kernels
)
1971 fprintf (dump_file
, "Function is %s OpenACC kernels offload\n",
1972 (is_oacc_kernels_parallelized
1973 ? "parallelized" : "unparallelized"));
1974 else if (is_oacc_serial
)
1975 fprintf (dump_file
, "Function is OpenACC serial offload\n");
1976 else if (is_oacc_parallel_kernels_parallelized
)
1977 fprintf (dump_file
, "Function is %s OpenACC kernels offload\n",
1978 "parallel_kernels_parallelized");
1979 else if (is_oacc_parallel_kernels_gang_single
)
1980 fprintf (dump_file
, "Function is %s OpenACC kernels offload\n",
1981 "parallel_kernels_gang_single");
1982 else if (is_oacc_routine
)
1983 fprintf (dump_file
, "Function is OpenACC routine level %d\n",
1989 /* This doesn't belong into 'pass_oacc_loop_designation' conceptually, but
1990 it's a convenient place, so... */
1991 if (is_oacc_routine
)
// A routine is expected to carry a non-empty clause list on its
// omp declare target attribute.
1993 tree attr
= lookup_attribute ("omp declare target",
1994 DECL_ATTRIBUTES (current_function_decl
));
1995 gcc_checking_assert (attr
);
1996 tree clauses
= TREE_VALUE (attr
);
1997 gcc_checking_assert (clauses
);
1999 /* Should this OpenACC routine be discarded? */
2000 bool discard
= false;
2002 tree clause_nohost
= omp_find_clause (clauses
, OMP_CLAUSE_NOHOST
);
2005 "OpenACC routine '%s' %s '%s' clause.\n",
2006 lang_hooks
.decl_printable_name (current_function_decl
, 2),
2007 clause_nohost
? "has" : "doesn't have",
2008 omp_clause_code_name
[OMP_CLAUSE_NOHOST
]);
2009 /* Host compiler, 'nohost' clause? */
2010 #ifndef ACCEL_COMPILER
2017 "OpenACC routine '%s' %sdiscarded.\n",
2018 lang_hooks
.decl_printable_name (current_function_decl
, 2),
2019 discard
? "" : "not ");
// Flag the function as already written and request that it be discarded.
2022 TREE_ASM_WRITTEN (current_function_decl
) = 1;
2023 return TODO_discard_function
;
2027 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
2028 kernels, so remove the parallelism dimensions function attributes
2029 potentially set earlier on. */
2030 if (is_oacc_kernels
&& !is_oacc_kernels_parallelized
)
2032 oacc_set_fn_attrib (current_function_decl
, NULL
, NULL
);
2033 attrs
= oacc_get_fn_attrib (current_function_decl
);
2036 /* Discover, partition and process the loops. */
2037 oacc_loop
*loops
= oacc_loop_discovery ();
// For a routine, every dimension bit below its own level is passed in as
// the outer mask.
2039 unsigned outer_mask
= 0;
2040 if (is_oacc_routine
)
2041 outer_mask
= GOMP_DIM_MASK (fn_level
) - 1;
2042 unsigned used_mask
= oacc_loop_partition (loops
, outer_mask
);
2043 /* OpenACC kernels constructs are special: they currently don't use the
2044 generic oacc_loop infrastructure and attribute/dimension processing. */
2045 if (is_oacc_kernels
&& is_oacc_kernels_parallelized
)
2047 /* Parallelized OpenACC kernels constructs use gang parallelism. See
2048 also tree-parloops.c:create_parallel_loop. */
2049 used_mask
|= GOMP_DIM_MASK (GOMP_DIM_GANG
);
// Compute the launch dimensions from the attributes, the routine level
// and the dimensions actually used.
2052 int dims
[GOMP_DIM_MAX
];
2053 oacc_validate_dims (current_function_decl
, attrs
, dims
, fn_level
, used_mask
);
// COMMA is the opening text for the first entry and a separator for
// later ones.
2057 const char *comma
= "Compute dimensions [";
2058 for (int ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++, comma
= ", ")
2059 fprintf (dump_file
, "%s%d", comma
, dims
[ix
]);
2060 fprintf (dump_file
, "]\n");
2063 /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
2064 a single gang only. */
2065 if (is_oacc_parallel_kernels_gang_single
)
2066 gcc_checking_assert (dims
[GOMP_DIM_GANG
] == 1);
2068 oacc_loop_process (loops
);
2071 fprintf (dump_file
, "OpenACC loops\n");
2072 dump_oacc_loop (dump_file
, loops
, 0);
2073 fprintf (dump_file
, "\n");
2075 if (dump_enabled_p ())
2077 oacc_loop
*l
= loops
;
2078 /* OpenACC kernels constructs are special: they currently don't use the
2079 generic oacc_loop infrastructure. */
2080 if (is_oacc_kernels
)
2082 /* Create a fake oacc_loop for diagnostic purposes. */
2083 l
= new_oacc_loop_raw (NULL
,
2084 DECL_SOURCE_LOCATION (current_function_decl
));
2085 l
->mask
= used_mask
;
2089 /* Skip the outermost, dummy OpenACC loop */
2093 inform_oacc_loop (l
);
2094 if (is_oacc_kernels
)
// Release the discovered loop tree.
2098 free_oacc_loop (loops
);
// Device lowering of the OpenACC internal functions in the current
// function.  First pass: every internal call is expanded, defaulted or
// removed, and privatized variables adjusted by the target are recorded.
// Second pass: operands referring to adjusted variables are rewritten.
2104 execute_oacc_device_lower ()
2106 tree attrs
= oacc_get_fn_attrib (current_function_decl
);
2109 /* Not an offloaded function. */
// Fetch the launch dimensions recorded for this function.
2112 int dims
[GOMP_DIM_MAX
];
2113 for (unsigned i
= 0; i
< GOMP_DIM_MAX
; i
++)
2114 dims
[i
] = oacc_get_fn_dim_size (current_function_decl
, i
);
// Original decl mapped to the decl returned by the target's
// adjust_private_decl hook.
2116 hash_map
<tree
, tree
> adjusted_vars
;
2118 /* Now lower internal loop functions to target-specific code
2121 FOR_ALL_BB_FN (bb
, cfun
)
2122 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);)
2124 gimple
*stmt
= gsi_stmt (gsi
);
2125 if (!is_gimple_call (stmt
))
2131 gcall
*call
= as_a
<gcall
*> (stmt
);
2132 if (!gimple_call_internal_p (call
))
2138 /* Rewind to allow rescan. */
// RESCAN requests revisiting the statements that replaced the call;
// REMOVE requests deleting the call.
2140 bool rescan
= false, remove
= false;
2141 enum internal_fn ifn_code
= gimple_call_internal_fn (call
);
2147 case IFN_GOACC_TILE
:
2148 oacc_xform_tile (call
);
2152 case IFN_GOACC_LOOP
:
2153 oacc_xform_loop (call
);
2157 case IFN_GOACC_REDUCTION
:
2158 /* Mark the function for SSA renaming. */
2159 mark_virtual_operands_for_renaming (cfun
);
2161 /* If the level is -1, this ended up being an unused
2162 axis. Handle as a default. */
2163 if (integer_minus_onep (gimple_call_arg (call
, 3)))
2164 default_goacc_reduction (call
);
2166 targetm
.goacc
.reduction (call
);
// IFN_UNIQUE calls are dispatched on the kind in argument 0.
2172 enum ifn_unique_kind kind
2173 = ((enum ifn_unique_kind
)
2174 TREE_INT_CST_LOW (gimple_call_arg (call
, 0)));
// Fork/join: a level of -1 in argument 2 is tested first; otherwise the
// target hook is asked about the call.
2181 case IFN_UNIQUE_OACC_FORK
:
2182 case IFN_UNIQUE_OACC_JOIN
:
2183 if (integer_minus_onep (gimple_call_arg (call
, 2)))
2185 else if (!targetm
.goacc
.fork_join
2186 (call
, dims
, kind
== IFN_UNIQUE_OACC_FORK
))
2190 case IFN_UNIQUE_OACC_HEAD_MARK
:
2191 case IFN_UNIQUE_OACC_TAIL_MARK
:
2195 case IFN_UNIQUE_OACC_PRIVATE
:
2197 dump_flags_t l_dump_flags
2198 = get_openacc_privatization_dump_flags ();
// Fall back to the function's location when the statement has none.
2200 location_t loc
= gimple_location (stmt
);
2201 if (LOCATION_LOCUS (loc
) == UNKNOWN_LOCATION
)
2202 loc
= DECL_SOURCE_LOCATION (current_function_decl
);
2203 const dump_user_location_t d_u_loc
2204 = dump_user_location_t::from_location_t (loc
);
// Argument 2 is the privatization level; arguments from index 3 onwards
// are the addresses of the privatized variables.
2207 = TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
2208 gcc_checking_assert (level
== -1
2210 && level
< GOMP_DIM_MAX
));
2211 for (unsigned i
= 3;
2212 i
< gimple_call_num_args (call
);
2215 static char const *const axes
[] =
2216 /* Must be kept in sync with GOMP_DIM enumeration. */
2217 { "gang", "worker", "vector" };
2219 tree arg
= gimple_call_arg (call
, i
);
2220 gcc_checking_assert (TREE_CODE (arg
) == ADDR_EXPR
);
2221 tree decl
= TREE_OPERAND (arg
, 0);
2222 if (dump_enabled_p ())
2223 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
2225 # pragma GCC diagnostic push
2226 # pragma GCC diagnostic ignored "-Wformat"
2228 dump_printf_loc (l_dump_flags
, d_u_loc
,
2229 "variable %<%T%> ought to be"
2230 " adjusted for OpenACC"
2231 " privatization level: %qs\n",
2234 ? "UNKNOWN" : axes
[level
]));
2236 # pragma GCC diagnostic pop
2241 else if (!targetm
.goacc
.adjust_private_decl
)
2243 else if (level
== GOMP_DIM_VECTOR
)
2245 /* That's the default behavior. */
// Let the target adjust the decl.  It counts as adjusted when either the
// decl or its type differs afterwards, and the mapping is recorded for
// the operand rewrite further down.
2250 tree oldtype
= TREE_TYPE (decl
);
2252 = targetm
.goacc
.adjust_private_decl (loc
, decl
,
2254 adjusted
= (TREE_TYPE (newdecl
) != oldtype
2255 || newdecl
!= decl
);
2257 adjusted_vars
.put (decl
, newdecl
);
2260 && dump_enabled_p ())
2261 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
2263 # pragma GCC diagnostic push
2264 # pragma GCC diagnostic ignored "-Wformat"
2266 dump_printf_loc (l_dump_flags
, d_u_loc
,
2267 "variable %<%T%> adjusted for"
2268 " OpenACC privatization level:"
2272 # pragma GCC diagnostic pop
2283 if (gsi_end_p (gsi
))
2284 /* We rewound past the beginning of the BB. */
2285 gsi
= gsi_start_bb (bb
);
2287 /* Undo the rewind. */
// Uses of the call's virtual definition are redirected to its virtual
// use.
2292 if (gimple_vdef (call
))
2293 replace_uses_by (gimple_vdef (call
), gimple_vuse (call
));
2294 if (gimple_call_lhs (call
))
2296 /* Propagate the data dependency var. */
2297 gimple
*ass
= gimple_build_assign (gimple_call_lhs (call
),
2298 gimple_call_arg (call
, 1));
2299 gsi_replace (&gsi
, ass
, false);
2302 gsi_remove (&gsi
, true);
2305 /* If not rescanning, advance over the call. */
2309 /* Regarding the OpenACC privatization level, we're currently only looking at
2310 making the gang-private level work. Regarding that, we have the following
2313 - GCN offloading: 'targetm.goacc.adjust_private_decl' does the work (in
2314 particular, change 'TREE_TYPE', etc.) and there is no
2315 'targetm.goacc.expand_var_decl'.
2317 - nvptx offloading: 'targetm.goacc.adjust_private_decl' only sets a
2318 marker and then 'targetm.goacc.expand_var_decl' does the work.
2320 Eventually (in particular, for worker-private level?), both
2321 'targetm.goacc.adjust_private_decl' and 'targetm.goacc.expand_var_decl'
2322 may need to do things, but that's currently not meant to be addressed, and
2323 thus not fully worked out and implemented, and thus untested. Hence,
2324 'assert' what currently is implemented/tested, only. */
2326 if (targetm
.goacc
.expand_var_decl
)
2327 gcc_assert (adjusted_vars
.is_empty ());
2329 /* Make adjustments to gang-private local variables if required by the
2330 target, e.g. forcing them into a particular address space. Afterwards,
2331 ADDR_EXPR nodes which have adjusted variables as their argument need to
2332 be modified in one of two ways:
2334 1. They can be recreated, making a pointer to the variable in the new
2337 2. The address of the variable in the new address space can be taken,
2338 converted to the default (original) address space, and the result of
2339 that conversion substituted in place of the original ADDR_EXPR node.
2341 Which of these is done depends on the gimple statement being processed.
2342 At present atomic operations and inline asms use (1), and everything else
2343 uses (2). At least on AMD GCN, there are atomic operations that work
2344 directly in the LDS address space.
2346 COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use
2347 the new decl, adjusting types of appropriate tree nodes as necessary. */
2349 if (targetm
.goacc
.adjust_private_decl
2350 && !adjusted_vars
.is_empty ())
2352 FOR_ALL_BB_FN (bb
, cfun
)
2353 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
);
2357 gimple
*stmt
= gsi_stmt (gsi
);
2359 var_decl_rewrite_info info
;
// Sync/atomic builtin calls and inline asm take the direct-address route.
2361 info
.avoid_pointer_conversion
2362 = (is_gimple_call (stmt
)
2363 && is_sync_builtin_call (as_a
<gcall
*> (stmt
)))
2364 || gimple_code (stmt
) == GIMPLE_ASM
;
2366 info
.modified
= false;
2367 info
.adjusted_vars
= &adjusted_vars
;
// Walk every operand of the statement with a zero-initialized walk state.
2369 memset (&wi
, 0, sizeof (wi
));
2372 walk_gimple_op (stmt
, oacc_rewrite_var_decl
, &wi
);
2382 /* Default launch dimension validator. Force everything to 1. A
2383 backend that wants to provide larger dimensions must override this
// DECL, FN_LEVEL and USED are unused; only the DIMS array is examined.
2387 default_goacc_validate_dims (tree
ARG_UNUSED (decl
), int *dims
,
2388 int ARG_UNUSED (fn_level
),
2389 unsigned ARG_UNUSED (used
))
2391 bool changed
= false;
// Visit each of the GOMP_DIM_MAX dimensions.
2393 for (unsigned ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
2405 /* Default dimension bound is unknown on accelerator and 1 on host. */
// AXIS is unused; the result is chosen at build time by ACCEL_COMPILER.
2408 default_goacc_dim_limit (int ARG_UNUSED (axis
))
2410 #ifdef ACCEL_COMPILER
// Static description of the oaccloops GIMPLE pass: it requires a CFG,
// provides and destroys no properties, and requests an SSA update and a
// CFG cleanup when it finishes.
2419 const pass_data pass_data_oacc_loop_designation
=
2421 GIMPLE_PASS
, /* type */
2422 "oaccloops", /* name */
2423 OPTGROUP_OMP
, /* optinfo_flags */
2424 TV_NONE
, /* tv_id */
2425 PROP_cfg
, /* properties_required */
2426 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
2427 0, /* properties_destroyed */
2428 0, /* todo_flags_start */
2429 TODO_update_ssa
| TODO_cleanup_cfg
, /* todo_flags_finish */
// Pass object for OpenACC loop designation: the gate returns flag_openacc
// and execution forwards to execute_oacc_loop_designation.
2432 class pass_oacc_loop_designation
: public gimple_opt_pass
2435 pass_oacc_loop_designation (gcc::context
*ctxt
)
2436 : gimple_opt_pass (pass_data_oacc_loop_designation
, ctxt
)
2439 /* opt_pass methods: */
2440 virtual bool gate (function
*) { return flag_openacc
; };
2442 virtual unsigned int execute (function
*)
2444 return execute_oacc_loop_designation ();
2447 }; // class pass_oacc_loop_designation
2449 const pass_data pass_data_oacc_device_lower
=
2451 GIMPLE_PASS
, /* type */
2452 "oaccdevlow", /* name */
2453 OPTGROUP_OMP
, /* optinfo_flags */
2454 TV_NONE
, /* tv_id */
2455 PROP_cfg
, /* properties_required */
2456 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
2457 0, /* properties_destroyed */
2458 0, /* todo_flags_start */
2459 TODO_update_ssa
| TODO_cleanup_cfg
, /* todo_flags_finish */
2462 class pass_oacc_device_lower
: public gimple_opt_pass
2465 pass_oacc_device_lower (gcc::context
*ctxt
)
2466 : gimple_opt_pass (pass_data_oacc_device_lower
, ctxt
)
2469 /* opt_pass methods: */
2470 virtual bool gate (function
*) { return flag_openacc
; };
2472 virtual unsigned int execute (function
*)
2474 return execute_oacc_device_lower ();
2477 }; // class pass_oacc_device_lower
2482 make_pass_oacc_loop_designation (gcc::context
*ctxt
)
2484 return new pass_oacc_loop_designation (ctxt
);
2488 make_pass_oacc_device_lower (gcc::context
*ctxt
)
2490 return new pass_oacc_device_lower (ctxt
);
2494 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
2495 GOMP_SIMT_ENTER call identifying the privatized variables, which are
2496 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
2497 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
2500 ompdevlow_adjust_simt_enter (gimple_stmt_iterator
*gsi
, bool *regimplify
)
2502 gimple
*alloc_stmt
= gsi_stmt (*gsi
);
2503 tree simtrec
= gimple_call_lhs (alloc_stmt
);
2504 tree simduid
= gimple_call_arg (alloc_stmt
, 0);
2505 gimple
*enter_stmt
= SSA_NAME_DEF_STMT (simduid
);
2506 gcc_assert (gimple_call_internal_p (enter_stmt
, IFN_GOMP_SIMT_ENTER
));
2507 tree rectype
= lang_hooks
.types
.make_type (RECORD_TYPE
);
2508 TYPE_ARTIFICIAL (rectype
) = TYPE_NAMELESS (rectype
) = 1;
2509 TREE_ADDRESSABLE (rectype
) = 1;
2510 TREE_TYPE (simtrec
) = build_pointer_type (rectype
);
2511 for (unsigned i
= 1; i
< gimple_call_num_args (enter_stmt
); i
++)
2513 tree
*argp
= gimple_call_arg_ptr (enter_stmt
, i
);
2514 if (*argp
== null_pointer_node
)
2516 gcc_assert (TREE_CODE (*argp
) == ADDR_EXPR
2517 && VAR_P (TREE_OPERAND (*argp
, 0)));
2518 tree var
= TREE_OPERAND (*argp
, 0);
2520 tree field
= build_decl (DECL_SOURCE_LOCATION (var
), FIELD_DECL
,
2521 DECL_NAME (var
), TREE_TYPE (var
));
2522 SET_DECL_ALIGN (field
, DECL_ALIGN (var
));
2523 DECL_USER_ALIGN (field
) = DECL_USER_ALIGN (var
);
2524 TREE_THIS_VOLATILE (field
) = TREE_THIS_VOLATILE (var
);
2526 insert_field_into_struct (rectype
, field
);
2528 tree t
= build_simple_mem_ref (simtrec
);
2529 t
= build3 (COMPONENT_REF
, TREE_TYPE (var
), t
, field
, NULL
);
2530 TREE_THIS_VOLATILE (t
) = TREE_THIS_VOLATILE (var
);
2531 SET_DECL_VALUE_EXPR (var
, t
);
2532 DECL_HAS_VALUE_EXPR_P (var
) = 1;
2535 layout_type (rectype
);
2536 tree size
= TYPE_SIZE_UNIT (rectype
);
2537 tree align
= build_int_cst (TREE_TYPE (size
), TYPE_ALIGN_UNIT (rectype
));
2540 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC
, 2, size
, align
);
2541 gimple_call_set_lhs (alloc_stmt
, simtrec
);
2542 gsi_replace (gsi
, alloc_stmt
, false);
2543 gimple_stmt_iterator enter_gsi
= gsi_for_stmt (enter_stmt
);
2544 enter_stmt
= gimple_build_assign (simduid
, gimple_call_arg (enter_stmt
, 0));
2545 gsi_replace (&enter_gsi
, enter_stmt
, false);
2549 if (single_imm_use (simtrec
, &use
, &exit_stmt
))
2551 gcc_assert (gimple_call_internal_p (exit_stmt
, IFN_GOMP_SIMT_EXIT
));
2552 gimple_stmt_iterator exit_gsi
= gsi_for_stmt (exit_stmt
);
2553 tree clobber
= build_clobber (rectype
);
2554 exit_stmt
= gimple_build_assign (build_simple_mem_ref (simtrec
), clobber
);
2555 gsi_insert_before (&exit_gsi
, exit_stmt
, GSI_SAME_STMT
);
2558 gcc_checking_assert (has_zero_uses (simtrec
));
2561 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
2564 find_simtpriv_var_op (tree
*tp
, int *walk_subtrees
, void *)
2569 && DECL_HAS_VALUE_EXPR_P (t
)
2570 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t
)))
2578 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
2579 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
2580 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
2581 internal functions on non-SIMT targets, and likewise some SIMD internal
2582 functions on SIMT targets. */
2585 execute_omp_device_lower ()
2587 int vf
= targetm
.simt
.vf
? targetm
.simt
.vf () : 1;
2588 bool regimplify
= false;
2590 gimple_stmt_iterator gsi
;
2591 bool calls_declare_variant_alt
2592 = cgraph_node::get (cfun
->decl
)->calls_declare_variant_alt
;
2593 FOR_EACH_BB_FN (bb
, cfun
)
2594 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
2596 gimple
*stmt
= gsi_stmt (gsi
);
2597 if (!is_gimple_call (stmt
))
2599 if (!gimple_call_internal_p (stmt
))
2601 if (calls_declare_variant_alt
)
2602 if (tree fndecl
= gimple_call_fndecl (stmt
))
2604 tree new_fndecl
= omp_resolve_declare_variant (fndecl
);
2605 if (new_fndecl
!= fndecl
)
2607 gimple_call_set_fndecl (stmt
, new_fndecl
);
2613 tree lhs
= gimple_call_lhs (stmt
), rhs
= NULL_TREE
;
2614 tree type
= lhs
? TREE_TYPE (lhs
) : integer_type_node
;
2615 switch (gimple_call_internal_fn (stmt
))
2617 case IFN_GOMP_USE_SIMT
:
2618 rhs
= vf
== 1 ? integer_zero_node
: integer_one_node
;
2620 case IFN_GOMP_SIMT_ENTER
:
2621 rhs
= vf
== 1 ? gimple_call_arg (stmt
, 0) : NULL_TREE
;
2622 goto simtreg_enter_exit
;
2623 case IFN_GOMP_SIMT_ENTER_ALLOC
:
2625 ompdevlow_adjust_simt_enter (&gsi
, ®implify
);
2626 rhs
= vf
== 1 ? null_pointer_node
: NULL_TREE
;
2627 goto simtreg_enter_exit
;
2628 case IFN_GOMP_SIMT_EXIT
:
2632 unlink_stmt_vdef (stmt
);
2634 case IFN_GOMP_SIMT_LANE
:
2635 case IFN_GOMP_SIMT_LAST_LANE
:
2636 rhs
= vf
== 1 ? build_zero_cst (type
) : NULL_TREE
;
2638 case IFN_GOMP_SIMT_VF
:
2639 rhs
= build_int_cst (type
, vf
);
2641 case IFN_GOMP_SIMT_ORDERED_PRED
:
2642 rhs
= vf
== 1 ? integer_zero_node
: NULL_TREE
;
2644 unlink_stmt_vdef (stmt
);
2646 case IFN_GOMP_SIMT_VOTE_ANY
:
2647 case IFN_GOMP_SIMT_XCHG_BFLY
:
2648 case IFN_GOMP_SIMT_XCHG_IDX
:
2649 rhs
= vf
== 1 ? gimple_call_arg (stmt
, 0) : NULL_TREE
;
2651 case IFN_GOMP_SIMD_LANE
:
2652 case IFN_GOMP_SIMD_LAST_LANE
:
2653 rhs
= vf
!= 1 ? build_zero_cst (type
) : NULL_TREE
;
2655 case IFN_GOMP_SIMD_VF
:
2656 rhs
= vf
!= 1 ? build_one_cst (type
) : NULL_TREE
;
2663 stmt
= lhs
? gimple_build_assign (lhs
, rhs
) : gimple_build_nop ();
2664 gsi_replace (&gsi
, stmt
, false);
2667 FOR_EACH_BB_REVERSE_FN (bb
, cfun
)
2668 for (gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
); gsi_prev (&gsi
))
2669 if (walk_gimple_stmt (&gsi
, NULL
, find_simtpriv_var_op
, NULL
))
2671 if (gimple_clobber_p (gsi_stmt (gsi
)))
2672 gsi_remove (&gsi
, true);
2674 gimple_regimplify_operands (gsi_stmt (gsi
), &gsi
);
2677 cfun
->has_force_vectorize_loops
= false;
2683 const pass_data pass_data_omp_device_lower
=
2685 GIMPLE_PASS
, /* type */
2686 "ompdevlow", /* name */
2687 OPTGROUP_OMP
, /* optinfo_flags */
2688 TV_NONE
, /* tv_id */
2689 PROP_cfg
, /* properties_required */
2690 PROP_gimple_lomp_dev
, /* properties_provided */
2691 0, /* properties_destroyed */
2692 0, /* todo_flags_start */
2693 TODO_update_ssa
, /* todo_flags_finish */
2696 class pass_omp_device_lower
: public gimple_opt_pass
2699 pass_omp_device_lower (gcc::context
*ctxt
)
2700 : gimple_opt_pass (pass_data_omp_device_lower
, ctxt
)
2703 /* opt_pass methods: */
2704 virtual bool gate (function
*fun
)
2706 return (!(fun
->curr_properties
& PROP_gimple_lomp_dev
)
2708 && cgraph_node::get (fun
->decl
)->calls_declare_variant_alt
));
2710 virtual unsigned int execute (function
*)
2712 return execute_omp_device_lower ();
2715 }; // class pass_expand_omp_ssa
2720 make_pass_omp_device_lower (gcc::context
*ctxt
)
2722 return new pass_omp_device_lower (ctxt
);
2725 /* "omp declare target link" handling pass. */
2729 const pass_data pass_data_omp_target_link
=
2731 GIMPLE_PASS
, /* type */
2732 "omptargetlink", /* name */
2733 OPTGROUP_OMP
, /* optinfo_flags */
2734 TV_NONE
, /* tv_id */
2735 PROP_ssa
, /* properties_required */
2736 0, /* properties_provided */
2737 0, /* properties_destroyed */
2738 0, /* todo_flags_start */
2739 TODO_update_ssa
, /* todo_flags_finish */
2742 class pass_omp_target_link
: public gimple_opt_pass
2745 pass_omp_target_link (gcc::context
*ctxt
)
2746 : gimple_opt_pass (pass_data_omp_target_link
, ctxt
)
2749 /* opt_pass methods: */
2750 virtual bool gate (function
*fun
)
2752 #ifdef ACCEL_COMPILER
2753 return offloading_function_p (fun
->decl
);
2760 virtual unsigned execute (function
*);
2763 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2766 find_link_var_op (tree
*tp
, int *walk_subtrees
, void *)
2771 && DECL_HAS_VALUE_EXPR_P (t
)
2772 && is_global_var (t
)
2773 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t
)))
2783 pass_omp_target_link::execute (function
*fun
)
2786 FOR_EACH_BB_FN (bb
, fun
)
2788 gimple_stmt_iterator gsi
;
2789 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
2791 if (gimple_call_builtin_p (gsi_stmt (gsi
), BUILT_IN_GOMP_TARGET
))
2793 /* Nullify the second argument of __builtin_GOMP_target_ext. */
2794 gimple_call_set_arg (gsi_stmt (gsi
), 1, null_pointer_node
);
2795 update_stmt (gsi_stmt (gsi
));
2797 if (walk_gimple_stmt (&gsi
, NULL
, find_link_var_op
, NULL
))
2798 gimple_regimplify_operands (gsi_stmt (gsi
), &gsi
);
2808 make_pass_omp_target_link (gcc::context
*ctxt
)
2810 return new pass_omp_target_link (ctxt
);