1 /* Target code for NVPTX.
2 Copyright (C) 2014-2023 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
26 #include "coretypes.h"
40 #include "diagnostic.h"
42 #include "insn-flags.h"
44 #include "insn-attr.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
57 #include "stor-layout.h"
59 #include "omp-general.h"
61 #include "omp-offload.h"
62 #include "gomp-constants.h"
64 #include "internal-fn.h"
65 #include "gimple-iterator.h"
66 #include "stringpool.h"
69 #include "tree-ssa-operands.h"
70 #include "tree-ssanames.h"
72 #include "tree-phinodes.h"
74 #include "fold-const.h"
77 #include "tree-pretty-print.h"
81 /* This file should be included last. */
82 #include "target-def.h"
84 #define WORKAROUND_PTXJIT_BUG 1
85 #define WORKAROUND_PTXJIT_BUG_2 1
86 #define WORKAROUND_PTXJIT_BUG_3 1
88 /* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
89 block, which has had a maximum number of threads of 1024 since CUDA version
91 #define PTX_CTA_SIZE 1024
93 #define PTX_CTA_NUM_BARRIERS 16
94 #define PTX_WARP_SIZE 32
96 #define PTX_PER_CTA_BARRIER 0
97 #define PTX_NUM_PER_CTA_BARRIERS 1
98 #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
99 #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
101 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
102 #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
103 #define PTX_WORKER_LENGTH 32
104 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
106 /* The various PTX memory areas an object might reside in. */
118 /* We record the data area in the target symbol flags. */
119 #define SYMBOL_DATA_AREA(SYM) \
120 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
122 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
123 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
125 /* Record the function decls we've written, and the libfuncs and function
126 decls corresponding to them. */
/* Text stream accumulating PTX declarations of functions; entries are
   appended by nvptx_record_fndecl and nvptx_record_libfunc below.  */
127 static std::stringstream func_decls
;
/* Hasher for libfunc SYMBOL_REFs, using pointer identity.  */
129 struct declared_libfunc_hasher
: ggc_cache_ptr_hash
<rtx_def
>
/* Hash an rtx by its address.  */
131 static hashval_t
hash (rtx x
) { return htab_hash_pointer (x
); }
/* Two entries are equal only if they are the same rtx object.  */
132 static bool equal (rtx a
, rtx b
) { return a
== b
; }
/* Libfuncs for which a PTX declaration has already been written
   (see nvptx_record_libfunc below).  */
136 hash_table
<declared_libfunc_hasher
> *declared_libfuncs_htab
;
/* Hasher for tree nodes, using pointer identity.  */
138 struct tree_hasher
: ggc_cache_ptr_hash
<tree_node
>
/* Hash a tree by its address.  */
140 static hashval_t
hash (tree t
) { return htab_hash_pointer (t
); }
/* Two entries are equal only if they are the same tree node.  */
141 static bool equal (tree a
, tree b
) { return a
== b
; }
/* Function decls for which a PTX declaration has been written
   (see nvptx_record_fndecl below).  */
144 static GTY((cache
)) hash_table
<tree_hasher
> *declared_fndecls_htab
;
/* Function decls whose declaration is deferred to end of compilation,
   when more information about them may be available
   (see nvptx_record_needed_fndecl below).  */
145 static GTY((cache
)) hash_table
<tree_hasher
> *needed_fndecls_htab
;
147 /* Buffer needed to broadcast across workers and vectors. This is
148 used for both worker-neutering and worker broadcasting, and
149 vector-neutering and broadcasting when vector_length > 32. It is
150 shared by all functions emitted. The buffer is placed in shared
151 memory. It'd be nice if PTX supported common blocks, because then
152 this could be shared across TUs (taking the largest size). */
/* Size of the broadcast buffer.  */
153 static unsigned oacc_bcast_size
;
/* Per-partition size within the broadcast buffer — presumably one
   partition per worker; TODO confirm against users of this variable.  */
154 static unsigned oacc_bcast_partition
;
/* Alignment of the buffer in bytes (set from SImode alignment in
   nvptx_option_override).  */
155 static unsigned oacc_bcast_align
;
/* SYMBOL_REF "__oacc_bcast", placed in the shared data area.  */
156 static GTY(()) rtx oacc_bcast_sym
;
158 /* Buffer needed for worker reductions. This has to be distinct from
159 the worker broadcast array, as both may be live concurrently. */
160 static unsigned worker_red_size
;
161 static unsigned worker_red_align
;
162 static GTY(()) rtx worker_red_sym
;
164 /* Buffer needed for vector reductions, when vector_length >
165 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
166 array, as both may be live concurrently. */
167 static unsigned vector_red_size
;
168 static unsigned vector_red_align
;
169 static unsigned vector_red_partition
;
170 static GTY(()) rtx vector_red_sym
;
172 /* Shared memory block for gang-private variables. */
173 static unsigned gang_private_shared_size
;
174 static unsigned gang_private_shared_align
;
175 static GTY(()) rtx gang_private_shared_sym
;
176 static hash_map
<tree_decl_hash
, unsigned int> gang_private_shared_hmap
;
178 /* Global lock variable, needed for 128bit worker & gang reductions. */
179 static GTY(()) tree global_lock_var
;
181 /* True if any function references __nvptx_stacks. */
182 static bool need_softstack_decl
;
184 /* True if any function references __nvptx_uni. */
185 static bool need_unisimt_decl
;
/* Forward declaration; defined later in this file.  Used below, e.g. to
   validate the vector reduction buffer size.  */
187 static int nvptx_mach_max_workers ();
189 /* Allocate a new, cleared machine_function structure. */
191 static struct machine_function
*
192 nvptx_init_machine_status (void)
194 struct machine_function
*p
= ggc_cleared_alloc
<machine_function
> ();
195 p
->return_mode
= VOIDmode
;
199 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
200 and -fopenacc is also enabled. */
203 diagnose_openacc_conflict (bool optval
, const char *optname
)
205 if (flag_openacc
&& optval
)
206 error ("option %s is not supported together with %<-fopenacc%>", optname
);
209 static enum ptx_version
210 first_ptx_version_supporting_sm (enum ptx_isa sm
)
215 return PTX_VERSION_3_0
;
217 return PTX_VERSION_3_1
;
219 return PTX_VERSION_4_2
;
221 return PTX_VERSION_6_0
;
223 return PTX_VERSION_6_3
;
225 return PTX_VERSION_7_0
;
231 static enum ptx_version
232 default_ptx_version_option (void)
234 enum ptx_version first
235 = first_ptx_version_supporting_sm ((enum ptx_isa
) ptx_isa_option
);
237 /* Pick a version that supports the sm. */
238 enum ptx_version res
= first
;
240 /* Pick at least 3.1. This has been the smallest version historically. */
241 res
= MAX (res
, PTX_VERSION_3_1
);
243 /* Pick at least 6.0, to enable using bar.warp.sync to have a way to force
245 res
= MAX (res
, PTX_VERSION_6_0
);
247 /* Verify that we pick a version that supports the sm. */
248 gcc_assert (first
<= res
);
253 ptx_version_to_string (enum ptx_version v
)
257 case PTX_VERSION_3_0
:
259 case PTX_VERSION_3_1
:
261 case PTX_VERSION_4_2
:
263 case PTX_VERSION_6_0
:
265 case PTX_VERSION_6_3
:
267 case PTX_VERSION_7_0
:
275 ptx_version_to_number (enum ptx_version v
, bool major_p
)
279 case PTX_VERSION_3_0
:
280 return major_p
? 3 : 0;
281 case PTX_VERSION_3_1
:
282 return major_p
? 3 : 1;
283 case PTX_VERSION_4_2
:
284 return major_p
? 4 : 2;
285 case PTX_VERSION_6_0
:
286 return major_p
? 6 : 0;
287 case PTX_VERSION_6_3
:
288 return major_p
? 6 : 3;
289 case PTX_VERSION_7_0
:
290 return major_p
? 7 : 0;
297 sm_version_to_string (enum ptx_isa sm
)
301 #define NVPTX_SM(XX, SEP) \
302 case PTX_ISA_SM ## XX: \
304 #include "nvptx-sm.def"
312 handle_ptx_version_option (void)
314 if (!OPTION_SET_P (ptx_version_option
)
315 || ptx_version_option
== PTX_VERSION_default
)
317 ptx_version_option
= default_ptx_version_option ();
321 enum ptx_version first
322 = first_ptx_version_supporting_sm ((enum ptx_isa
) ptx_isa_option
);
324 if (ptx_version_option
< first
)
325 error ("PTX version (%<-mptx%>) needs to be at least %s to support selected"
326 " %<-misa%> (sm_%s)", ptx_version_to_string (first
),
327 sm_version_to_string ((enum ptx_isa
)ptx_isa_option
));
330 /* Implement TARGET_OPTION_OVERRIDE. */
333 nvptx_option_override (void)
335 init_machine_status
= nvptx_init_machine_status
;
337 /* Via nvptx 'OPTION_DEFAULT_SPECS', '-misa' always appears on the command
339 gcc_checking_assert (OPTION_SET_P (ptx_isa_option
));
341 handle_ptx_version_option ();
343 /* Set toplevel_reorder, unless explicitly disabled. We need
344 reordering so that we emit necessary assembler decls of
345 undeclared variables. */
346 if (!OPTION_SET_P (flag_toplevel_reorder
))
347 flag_toplevel_reorder
= 1;
349 debug_nonbind_markers_p
= 0;
351 /* Set flag_no_common, unless explicitly disabled. We fake common
352 using .weak, and that's not entirely accurate, so avoid it
354 if (!OPTION_SET_P (flag_no_common
))
357 /* The patch area requires nops, which we don't have. */
358 HOST_WIDE_INT patch_area_size
, patch_area_entry
;
359 parse_and_check_patch_area (flag_patchable_function_entry
, false,
360 &patch_area_size
, &patch_area_entry
);
361 if (patch_area_size
> 0)
362 sorry ("not generating patch area, nops not supported");
364 /* Assumes that it will see only hard registers. */
365 flag_var_tracking
= 0;
367 if (nvptx_optimize
< 0)
368 nvptx_optimize
= optimize
> 0;
370 declared_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
371 needed_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
372 declared_libfuncs_htab
373 = hash_table
<declared_libfunc_hasher
>::create_ggc (17);
375 oacc_bcast_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__oacc_bcast");
376 SET_SYMBOL_DATA_AREA (oacc_bcast_sym
, DATA_AREA_SHARED
);
377 oacc_bcast_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
378 oacc_bcast_partition
= 0;
380 worker_red_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__worker_red");
381 SET_SYMBOL_DATA_AREA (worker_red_sym
, DATA_AREA_SHARED
);
382 worker_red_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
384 vector_red_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__vector_red");
385 SET_SYMBOL_DATA_AREA (vector_red_sym
, DATA_AREA_SHARED
);
386 vector_red_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
387 vector_red_partition
= 0;
389 gang_private_shared_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__gang_private_shared");
390 SET_SYMBOL_DATA_AREA (gang_private_shared_sym
, DATA_AREA_SHARED
);
391 gang_private_shared_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
393 diagnose_openacc_conflict (TARGET_GOMP
, "-mgomp");
394 diagnose_openacc_conflict (TARGET_SOFT_STACK
, "-msoft-stack");
395 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT
, "-muniform-simt");
398 target_flags
|= MASK_SOFT_STACK
| MASK_UNIFORM_SIMT
;
401 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
402 deal with ptx idiosyncrasies. */
405 nvptx_ptx_type_from_mode (machine_mode mode
, bool promote
)
442 /* Encode the PTX data area that DECL (which might not actually be a
443 _DECL) should reside in. */
446 nvptx_encode_section_info (tree decl
, rtx rtl
, int first
)
448 default_encode_section_info (decl
, rtl
, first
);
449 if (first
&& MEM_P (rtl
))
451 nvptx_data_area area
= DATA_AREA_GENERIC
;
453 if (TREE_CONSTANT (decl
))
454 area
= DATA_AREA_CONST
;
455 else if (TREE_CODE (decl
) == VAR_DECL
)
457 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl
)))
459 area
= DATA_AREA_SHARED
;
460 if (DECL_INITIAL (decl
))
461 error ("static initialization of variable %q+D in %<.shared%>"
462 " memory is not supported", decl
);
465 area
= TREE_READONLY (decl
) ? DATA_AREA_CONST
: DATA_AREA_GLOBAL
;
468 SET_SYMBOL_DATA_AREA (XEXP (rtl
, 0), area
);
472 /* Return the PTX name of the data area in which SYM should be
473 placed. The symbol must have already been processed by
474 nvptx_encode_section_info, or equivalent. */
477 section_for_sym (rtx sym
)
479 nvptx_data_area area
= SYMBOL_DATA_AREA (sym
);
480 /* Same order as nvptx_data_area enum. */
481 static char const *const areas
[] =
482 {"", ".global", ".shared", ".local", ".const", ".param"};
487 /* Similarly for a decl. */
490 section_for_decl (const_tree decl
)
492 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree
, decl
)), 0));
495 /* Check NAME for special function names and redirect them by returning a
496 replacement. This applies to malloc, free and realloc, for which we
497 want to use libgcc wrappers, and call, which triggers a bug in
498 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
499 not active in an offload compiler -- the names are all set by the
500 host-side compiler. */
503 nvptx_name_replacement (const char *name
)
505 if (strcmp (name
, "call") == 0)
506 return "__nvptx_call";
507 if (strcmp (name
, "malloc") == 0)
508 return "__nvptx_malloc";
509 if (strcmp (name
, "free") == 0)
510 return "__nvptx_free";
511 if (strcmp (name
, "realloc") == 0)
512 return "__nvptx_realloc";
516 /* Return NULL if NAME contains no dot. Otherwise return a copy of NAME
517 with the dots replaced with dollar signs. */
520 nvptx_replace_dot (const char *name
)
522 if (strchr (name
, '.') == NULL
)
525 char *p
= xstrdup (name
);
526 for (size_t i
= 0; i
< strlen (p
); ++i
)
532 /* If MODE should be treated as two registers of an inner mode, return
533 that inner mode. Otherwise return VOIDmode. */
536 maybe_split_mode (machine_mode mode
)
538 if (COMPLEX_MODE_P (mode
))
539 return GET_MODE_INNER (mode
);
547 /* Return true if mode should be treated as two registers. */
550 split_mode_p (machine_mode mode
)
552 return maybe_split_mode (mode
) != VOIDmode
;
555 /* Output a register, subreg, or register pair (with optional
556 enclosing braces). */
559 output_reg (FILE *file
, unsigned regno
, machine_mode inner_mode
,
560 int subreg_offset
= -1)
562 if (inner_mode
== VOIDmode
)
564 if (HARD_REGISTER_NUM_P (regno
))
565 fprintf (file
, "%s", reg_names
[regno
]);
567 fprintf (file
, "%%r%d", regno
);
569 else if (subreg_offset
>= 0)
571 output_reg (file
, regno
, VOIDmode
);
572 fprintf (file
, "$%d", subreg_offset
);
576 if (subreg_offset
== -1)
578 output_reg (file
, regno
, inner_mode
, GET_MODE_SIZE (inner_mode
));
580 output_reg (file
, regno
, inner_mode
, 0);
581 if (subreg_offset
== -1)
586 /* Emit forking instructions for MASK. */
589 nvptx_emit_forking (unsigned mask
, bool is_call
)
591 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
592 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
595 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
597 /* Emit fork at all levels. This helps form SESE regions, as
598 it creates a block with a single successor before entering a
599 partitioned region. That is a good candidate for the end of
601 emit_insn (gen_nvptx_fork (op
));
602 emit_insn (gen_nvptx_forked (op
));
606 /* Emit joining instructions for MASK. */
609 nvptx_emit_joining (unsigned mask
, bool is_call
)
611 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
612 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
615 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
617 /* Emit joining for all non-call pars to ensure there's a single
618 predecessor for the block the join insn ends up in. This is
619 needed for skipping entire loops. */
620 emit_insn (gen_nvptx_joining (op
));
621 emit_insn (gen_nvptx_join (op
));
626 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
627 returned in memory. Integer and floating types supported by the
628 machine are passed in registers, everything else is passed in
629 memory. Complex types are split. */
632 pass_in_memory (machine_mode mode
, const_tree type
, bool for_return
)
636 if (AGGREGATE_TYPE_P (type
))
638 if (TREE_CODE (type
) == VECTOR_TYPE
)
642 if (!for_return
&& COMPLEX_MODE_P (mode
))
643 /* Complex types are passed as two underlying args. */
644 mode
= GET_MODE_INNER (mode
);
646 if (GET_MODE_CLASS (mode
) != MODE_INT
647 && GET_MODE_CLASS (mode
) != MODE_FLOAT
)
650 if (GET_MODE_SIZE (mode
) > UNITS_PER_WORD
)
656 /* A non-memory argument of mode MODE is being passed, determine the mode it
657 should be promoted to. This is also used for determining return
661 promote_arg (machine_mode mode
, bool prototyped
)
663 if (!prototyped
&& mode
== SFmode
)
664 /* K&R float promotion for unprototyped functions. */
666 else if (GET_MODE_SIZE (mode
) < GET_MODE_SIZE (SImode
))
672 /* A non-memory return type of MODE is being returned. Determine the
673 mode it should be promoted to. */
676 promote_return (machine_mode mode
)
678 return promote_arg (mode
, true);
681 /* Implement TARGET_FUNCTION_ARG. */
684 nvptx_function_arg (cumulative_args_t
, const function_arg_info
&arg
)
686 if (arg
.end_marker_p () || !arg
.named
)
689 return gen_reg_rtx (arg
.mode
);
692 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
695 nvptx_function_incoming_arg (cumulative_args_t cum_v
,
696 const function_arg_info
&arg
)
698 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
700 if (arg
.end_marker_p () || !arg
.named
)
703 /* No need to deal with split modes here, the only case that can
704 happen is complex modes and those are dealt with by
705 TARGET_SPLIT_COMPLEX_ARG. */
706 return gen_rtx_UNSPEC (arg
.mode
,
707 gen_rtvec (1, GEN_INT (cum
->count
)),
711 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
714 nvptx_function_arg_advance (cumulative_args_t cum_v
, const function_arg_info
&)
716 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
721 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
723 For nvptx this is only used for variadic args. The type has already
724 been promoted and/or converted to invisible reference. */
727 nvptx_function_arg_boundary (machine_mode mode
, const_tree
ARG_UNUSED (type
))
729 return GET_MODE_ALIGNMENT (mode
);
732 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
734 For nvptx, we know how to handle functions declared as stdarg: by
735 passing an extra pointer to the unnamed arguments. However, the
736 Fortran frontend can produce a different situation, where a
737 function pointer is declared with no arguments, but the actual
738 function and calls to it take more arguments. In that case, we
739 want to ensure the call matches the definition of the function. */
742 nvptx_strict_argument_naming (cumulative_args_t cum_v
)
744 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
746 return cum
->fntype
== NULL_TREE
|| stdarg_p (cum
->fntype
);
749 /* Implement TARGET_LIBCALL_VALUE. */
752 nvptx_libcall_value (machine_mode mode
, const_rtx
)
754 if (!cfun
|| !cfun
->machine
->doing_call
)
755 /* Pretend to return in a hard reg for early uses before pseudos can be
757 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
759 return gen_reg_rtx (mode
);
762 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
763 where function FUNC returns or receives a value of data type TYPE. */
766 nvptx_function_value (const_tree type
, const_tree
ARG_UNUSED (func
),
769 machine_mode mode
= promote_return (TYPE_MODE (type
));
774 cfun
->machine
->return_mode
= mode
;
775 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
778 return nvptx_libcall_value (mode
, NULL_RTX
);
781 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
784 nvptx_function_value_regno_p (const unsigned int regno
)
786 return regno
== NVPTX_RETURN_REGNUM
;
789 /* Types with a mode other than those supported by the machine are passed by
790 reference in memory. */
793 nvptx_pass_by_reference (cumulative_args_t
, const function_arg_info
&arg
)
795 return pass_in_memory (arg
.mode
, arg
.type
, false);
798 /* Implement TARGET_RETURN_IN_MEMORY. */
801 nvptx_return_in_memory (const_tree type
, const_tree
)
803 return pass_in_memory (TYPE_MODE (type
), type
, true);
806 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
809 nvptx_promote_function_mode (const_tree type
, machine_mode mode
,
810 int *ARG_UNUSED (punsignedp
),
811 const_tree funtype
, int for_return
)
813 return promote_arg (mode
, for_return
|| !type
|| TYPE_ARG_TYPES (funtype
));
816 /* Helper for write_arg. Emit a single PTX argument of MODE, either
817 in a prototype, or as copy in a function prologue. ARGNO is the
818 index of this argument in the PTX function. FOR_REG is negative,
819 if we're emitting the PTX prototype. It is zero if we're copying
820 to an argument register and it is greater than zero if we're
821 copying to a specific hard register. */
824 write_arg_mode (std::stringstream
&s
, int for_reg
, int argno
,
827 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
831 /* Writing PTX prototype. */
832 s
<< (argno
? ", " : " (");
833 s
<< ".param" << ptx_type
<< " %in_ar" << argno
;
837 s
<< "\t.reg" << ptx_type
<< " ";
839 s
<< reg_names
[for_reg
];
845 s
<< "\tld.param" << ptx_type
<< " ";
847 s
<< reg_names
[for_reg
];
850 s
<< ", [%in_ar" << argno
<< "];\n";
856 /* Process function parameter TYPE to emit one or more PTX
857 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
858 is true, if this is a prototyped function, rather than an old-style
859 C declaration. Returns the next argument number to use.
861 The promotion behavior here must match the regular GCC function
862 parameter marshalling machinery. */
865 write_arg_type (std::stringstream
&s
, int for_reg
, int argno
,
866 tree type
, bool prototyped
)
868 machine_mode mode
= TYPE_MODE (type
);
870 if (mode
== VOIDmode
)
873 if (pass_in_memory (mode
, type
, false))
877 bool split
= TREE_CODE (type
) == COMPLEX_TYPE
;
881 /* Complex types are sent as two separate args. */
882 type
= TREE_TYPE (type
);
883 mode
= TYPE_MODE (type
);
887 mode
= promote_arg (mode
, prototyped
);
889 argno
= write_arg_mode (s
, for_reg
, argno
, mode
);
892 return write_arg_mode (s
, for_reg
, argno
, mode
);
895 /* Emit a PTX return as a prototype or function prologue declaration
899 write_return_mode (std::stringstream
&s
, bool for_proto
, machine_mode mode
)
901 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
902 const char *pfx
= "\t.reg";
903 const char *sfx
= ";\n";
906 pfx
= "(.param", sfx
= "_out) ";
908 s
<< pfx
<< ptx_type
<< " " << reg_names
[NVPTX_RETURN_REGNUM
] << sfx
;
911 /* Process a function return TYPE to emit a PTX return as a prototype
912 or function prologue declaration. Returns true if return is via an
913 additional pointer parameter. The promotion behavior here must
914 match the regular GCC function return marshalling. */
917 write_return_type (std::stringstream
&s
, bool for_proto
, tree type
)
919 machine_mode mode
= TYPE_MODE (type
);
921 if (mode
== VOIDmode
)
924 bool return_in_mem
= pass_in_memory (mode
, type
, true);
929 return return_in_mem
;
931 /* Named return values can cause us to return a pointer as well
932 as expect an argument for the return location. This is
933 optimization-level specific, so no caller can make use of
934 this data, but more importantly for us, we must ensure it
935 doesn't change the PTX prototype. */
936 mode
= (machine_mode
) cfun
->machine
->return_mode
;
938 if (mode
== VOIDmode
)
939 return return_in_mem
;
941 /* Clear return_mode to inhibit copy of retval to non-existent
943 cfun
->machine
->return_mode
= VOIDmode
;
946 mode
= promote_return (mode
);
948 write_return_mode (s
, for_proto
, mode
);
950 return return_in_mem
;
953 /* Look for attributes in ATTRS that would indicate we must write a function
954 as a .entry kernel rather than a .func. Return true if one is found. */
957 write_as_kernel (tree attrs
)
959 return (lookup_attribute ("kernel", attrs
) != NULL_TREE
960 || (lookup_attribute ("omp target entrypoint", attrs
) != NULL_TREE
961 && lookup_attribute ("oacc function", attrs
) != NULL_TREE
));
962 /* For OpenMP target regions, the corresponding kernel entry is emitted from
963 write_omp_entry as a separate function. */
966 /* Emit a linker marker for a function decl or defn. */
969 write_fn_marker (std::stringstream
&s
, bool is_defn
, bool globalize
,
975 s
<< " FUNCTION " << (is_defn
? "DEF: " : "DECL: ");
979 /* Emit a linker marker for a variable decl or defn. */
982 write_var_marker (FILE *file
, bool is_defn
, bool globalize
, const char *name
)
984 fprintf (file
, "\n// BEGIN%s VAR %s: ",
985 globalize
? " GLOBAL" : "",
986 is_defn
? "DEF" : "DECL");
987 assemble_name_raw (file
, name
);
991 /* Helper function for write_fn_proto. */
994 write_fn_proto_1 (std::stringstream
&s
, bool is_defn
,
995 const char *name
, const_tree decl
, bool force_public
)
997 if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl
)) == NULL
)
998 write_fn_marker (s
, is_defn
, TREE_PUBLIC (decl
) || force_public
, name
);
1000 /* PTX declaration. */
1001 if (DECL_EXTERNAL (decl
))
1003 else if (TREE_PUBLIC (decl
) || force_public
)
1004 s
<< (DECL_WEAK (decl
) ? ".weak " : ".visible ");
1005 s
<< (write_as_kernel (DECL_ATTRIBUTES (decl
)) ? ".entry " : ".func ");
1007 tree fntype
= TREE_TYPE (decl
);
1008 tree result_type
= TREE_TYPE (fntype
);
1010 /* atomic_compare_exchange_$n builtins have an exceptional calling
1012 int not_atomic_weak_arg
= -1;
1013 if (DECL_BUILT_IN_CLASS (decl
) == BUILT_IN_NORMAL
)
1014 switch (DECL_FUNCTION_CODE (decl
))
1016 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1
:
1017 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2
:
1018 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4
:
1019 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8
:
1020 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16
:
1021 /* These atomics skip the 'weak' parm in an actual library
1022 call. We must skip it in the prototype too. */
1023 not_atomic_weak_arg
= 3;
1030 /* Declare the result. */
1031 bool return_in_mem
= write_return_type (s
, true, result_type
);
1037 /* Emit argument list. */
1039 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
1042 NULL in TYPE_ARG_TYPES, for old-style functions
1043 NULL in DECL_ARGUMENTS, for builtin functions without another
1045 So we have to pick the best one we have. */
1046 tree args
= TYPE_ARG_TYPES (fntype
);
1047 bool prototyped
= true;
1050 args
= DECL_ARGUMENTS (decl
);
1054 for (; args
; args
= TREE_CHAIN (args
), not_atomic_weak_arg
--)
1056 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
1058 if (not_atomic_weak_arg
)
1059 argno
= write_arg_type (s
, -1, argno
, type
, prototyped
);
1061 gcc_assert (TREE_CODE (type
) == BOOLEAN_TYPE
);
1064 if (stdarg_p (fntype
))
1065 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
1067 if (DECL_STATIC_CHAIN (decl
))
1068 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
1070 if (argno
< 2 && strcmp (name
, "main") == 0)
1073 argno
= write_arg_type (s
, -1, argno
, integer_type_node
, true);
1076 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
1082 s
<< (is_defn
? "\n" : ";\n");
1085 /* Write a .func or .kernel declaration or definition along with
1086 a helper comment for use by ld. S is the stream to write to, DECL
1087 the decl for the function with name NAME. For definitions, emit
1088 a declaration too. */
1091 write_fn_proto (std::stringstream
&s
, bool is_defn
,
1092 const char *name
, const_tree decl
, bool force_public
=false)
1094 const char *replacement
= nvptx_name_replacement (name
);
1095 char *replaced_dots
= NULL
;
1096 if (replacement
!= name
)
1100 replaced_dots
= nvptx_replace_dot (name
);
1102 name
= replaced_dots
;
1108 /* Emit a declaration. The PTX assembler gets upset without it. */
1109 write_fn_proto_1 (s
, false, name
, decl
, force_public
);
1111 write_fn_proto_1 (s
, is_defn
, name
, decl
, force_public
);
1114 XDELETE (replaced_dots
);
1117 /* Construct a function declaration from a call insn. This can be
1118 necessary for two reasons - either we have an indirect call which
1119 requires a .callprototype declaration, or we have a libcall
1120 generated by emit_library_call for which no decl exists. */
1123 write_fn_proto_from_insn (std::stringstream
&s
, const char *name
,
1124 rtx result
, rtx pat
)
1126 char *replaced_dots
= NULL
;
1130 s
<< "\t.callprototype ";
1135 const char *replacement
= nvptx_name_replacement (name
);
1136 if (replacement
!= name
)
1140 replaced_dots
= nvptx_replace_dot (name
);
1142 name
= replaced_dots
;
1144 write_fn_marker (s
, false, true, name
);
1145 s
<< "\t.extern .func ";
1148 if (result
!= NULL_RTX
)
1149 write_return_mode (s
, true, GET_MODE (result
));
1153 XDELETE (replaced_dots
);
1155 int arg_end
= XVECLEN (pat
, 0);
1156 for (int i
= 1; i
< arg_end
; i
++)
1158 /* We don't have to deal with mode splitting & promotion here,
1159 as that was already done when generating the call
1161 machine_mode mode
= GET_MODE (XEXP (XVECEXP (pat
, 0, i
), 0));
1163 write_arg_mode (s
, -1, i
- 1, mode
);
1170 /* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
1171 table and write a ptx prototype. These are emitted at end of
1175 nvptx_record_fndecl (tree decl
)
1177 tree
*slot
= declared_fndecls_htab
->find_slot (decl
, INSERT
);
1181 const char *name
= get_fnname_from_decl (decl
);
1182 write_fn_proto (func_decls
, false, name
, decl
);
1186 /* Record a libcall or unprototyped external function. CALLEE is the
1187 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
1188 declaration for it. */
1191 nvptx_record_libfunc (rtx callee
, rtx retval
, rtx pat
)
1193 rtx
*slot
= declared_libfuncs_htab
->find_slot (callee
, INSERT
);
1198 const char *name
= XSTR (callee
, 0);
1199 write_fn_proto_from_insn (func_decls
, name
, retval
, pat
);
1203 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
1204 is prototyped, record it now. Otherwise record it as needed at end
1205 of compilation, when we might have more information about it. */
1208 nvptx_record_needed_fndecl (tree decl
)
1210 if (TYPE_ARG_TYPES (TREE_TYPE (decl
)) == NULL_TREE
)
1212 tree
*slot
= needed_fndecls_htab
->find_slot (decl
, INSERT
);
1217 nvptx_record_fndecl (decl
);
1220 /* SYM is a SYMBOL_REF. If it refers to an external function, record
1224 nvptx_maybe_record_fnsym (rtx sym
)
1226 tree decl
= SYMBOL_REF_DECL (sym
);
1228 if (decl
&& TREE_CODE (decl
) == FUNCTION_DECL
&& DECL_EXTERNAL (decl
))
1229 nvptx_record_needed_fndecl (decl
);
1232 /* Emit a local array to hold some part of a conventional stack frame
1233 and initialize REGNO to point to it. If the size is zero, it'll
1234 never be valid to dereference, so we can simply initialize to
1238 init_frame (FILE *file
, int regno
, unsigned align
, unsigned size
)
1241 fprintf (file
, "\t.local .align %d .b8 %s_ar[%u];\n",
1242 align
, reg_names
[regno
], size
);
1243 fprintf (file
, "\t.reg.u%d %s;\n",
1244 POINTER_SIZE
, reg_names
[regno
]);
1245 fprintf (file
, (size
? "\tcvta.local.u%d %s, %s_ar;\n"
1246 : "\tmov.u%d %s, 0;\n"),
1247 POINTER_SIZE
, reg_names
[regno
], reg_names
[regno
]);
1250 /* Emit soft stack frame setup sequence. */
1253 init_softstack_frame (FILE *file
, unsigned alignment
, HOST_WIDE_INT size
)
1255 /* Maintain 64-bit stack alignment. */
1256 unsigned keep_align
= BIGGEST_ALIGNMENT
/ BITS_PER_UNIT
;
1257 size
= ROUND_UP (size
, keep_align
);
1258 int bits
= POINTER_SIZE
;
1259 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
1260 const char *reg_frame
= reg_names
[FRAME_POINTER_REGNUM
];
1261 const char *reg_sspslot
= reg_names
[SOFTSTACK_SLOT_REGNUM
];
1262 const char *reg_sspprev
= reg_names
[SOFTSTACK_PREV_REGNUM
];
1263 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_stack
);
1264 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_frame
);
1265 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspslot
);
1266 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspprev
);
1267 fprintf (file
, "\t{\n");
1268 fprintf (file
, "\t\t.reg.u32 %%fstmp0;\n");
1269 fprintf (file
, "\t\t.reg.u%d %%fstmp1;\n", bits
);
1270 fprintf (file
, "\t\t.reg.u%d %%fstmp2;\n", bits
);
1271 fprintf (file
, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1272 fprintf (file
, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1273 bits
== 64 ? ".wide" : ".lo", bits
/ 8);
1274 fprintf (file
, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits
);
1276 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1277 fprintf (file
, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits
, reg_sspslot
);
1279 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1280 fprintf (file
, "\t\tld.shared.u%d %s, [%s];\n",
1281 bits
, reg_sspprev
, reg_sspslot
);
1283 /* Initialize %frame = %sspprev - size. */
1284 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1285 bits
, reg_frame
, reg_sspprev
, size
);
1287 /* Apply alignment, if larger than 64. */
1288 if (alignment
> keep_align
)
1289 fprintf (file
, "\t\tand.b%d %s, %s, %d;\n",
1290 bits
, reg_frame
, reg_frame
, -alignment
);
1292 size
= crtl
->outgoing_args_size
;
1293 gcc_assert (size
% keep_align
== 0);
1295 /* Initialize %stack. */
1296 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1297 bits
, reg_stack
, reg_frame
, size
);
1300 fprintf (file
, "\t\tst.shared.u%d [%s], %s;\n",
1301 bits
, reg_sspslot
, reg_stack
);
1302 fprintf (file
, "\t}\n");
1303 cfun
->machine
->has_softstack
= true;
1304 need_softstack_decl
= true;
1307 /* Emit code to initialize the REGNO predicate register to indicate
1308 whether we are not lane zero on the NAME axis. */
1311 nvptx_init_axis_predicate (FILE *file
, int regno
, const char *name
)
1313 fprintf (file
, "\t{\n");
1314 fprintf (file
, "\t\t.reg.u32\t%%%s;\n", name
);
1315 if (strcmp (name
, "x") == 0 && cfun
->machine
->red_partition
)
1317 fprintf (file
, "\t\t.reg.u64\t%%t_red;\n");
1318 fprintf (file
, "\t\t.reg.u64\t%%y64;\n");
1320 fprintf (file
, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name
, name
);
1321 fprintf (file
, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno
, name
);
1322 if (strcmp (name
, "x") == 0 && cfun
->machine
->red_partition
)
1324 fprintf (file
, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1325 fprintf (file
, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1326 fprintf (file
, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1327 "// vector reduction buffer\n",
1328 REGNO (cfun
->machine
->red_partition
),
1329 vector_red_partition
);
1331 /* Verify vector_red_size. */
1332 gcc_assert (vector_red_partition
* nvptx_mach_max_workers ()
1333 <= vector_red_size
);
1334 fprintf (file
, "\t}\n");
1337 /* Emit code to initialize OpenACC worker broadcast and synchronization
1341 nvptx_init_oacc_workers (FILE *file
)
1343 fprintf (file
, "\t{\n");
1344 fprintf (file
, "\t\t.reg.u32\t%%tidy;\n");
1345 if (cfun
->machine
->bcast_partition
)
1347 fprintf (file
, "\t\t.reg.u64\t%%t_bcast;\n");
1348 fprintf (file
, "\t\t.reg.u64\t%%y64;\n");
1350 fprintf (file
, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1351 if (cfun
->machine
->bcast_partition
)
1353 fprintf (file
, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1354 fprintf (file
, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1355 fprintf (file
, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1356 fprintf (file
, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1357 "// vector broadcast offset\n",
1358 REGNO (cfun
->machine
->bcast_partition
),
1359 oacc_bcast_partition
);
1361 /* Verify oacc_bcast_size. */
1362 gcc_assert (oacc_bcast_partition
* (nvptx_mach_max_workers () + 1)
1363 <= oacc_bcast_size
);
1364 if (cfun
->machine
->sync_bar
)
1365 fprintf (file
, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1366 "// vector synchronization barrier\n",
1367 REGNO (cfun
->machine
->sync_bar
));
1368 fprintf (file
, "\t}\n");
1371 /* Emit code to initialize predicate and master lane index registers for
1372 -muniform-simt code generation variant. */
1375 nvptx_init_unisimt_predicate (FILE *file
)
1377 cfun
->machine
->unisimt_location
= gen_reg_rtx (Pmode
);
1378 int loc
= REGNO (cfun
->machine
->unisimt_location
);
1379 int bits
= POINTER_SIZE
;
1380 fprintf (file
, "\t.reg.u%d %%r%d;\n", bits
, loc
);
1381 fprintf (file
, "\t{\n");
1382 fprintf (file
, "\t\t.reg.u32 %%ustmp0;\n");
1383 fprintf (file
, "\t\t.reg.u%d %%ustmp1;\n", bits
);
1384 fprintf (file
, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1385 fprintf (file
, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1386 bits
== 64 ? ".wide" : ".lo");
1387 fprintf (file
, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits
, loc
);
1388 fprintf (file
, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits
, loc
, loc
);
1389 if (cfun
->machine
->unisimt_predicate
)
1391 int master
= REGNO (cfun
->machine
->unisimt_master
);
1392 int pred
= REGNO (cfun
->machine
->unisimt_predicate
);
1393 fprintf (file
, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master
, loc
);
1394 if (cfun
->machine
->unisimt_outside_simt_predicate
)
1396 int pred_outside_simt
1397 = REGNO (cfun
->machine
->unisimt_outside_simt_predicate
);
1398 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, 0;\n",
1399 pred_outside_simt
, master
);
1401 fprintf (file
, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1402 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1403 fprintf (file
, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master
, master
);
1404 /* Compute predicate as 'tid.x == master'. */
1405 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred
, master
);
1407 fprintf (file
, "\t}\n");
1408 need_unisimt_decl
= true;
1411 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1413 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1414 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1416 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1417 __nvptx_uni[tid.y] = 0;
1418 gomp_nvptx_main (ORIG, arg);
1420 ORIG itself should not be emitted as a PTX .entry function. */
1423 write_omp_entry (FILE *file
, const char *name
, const char *orig
)
1425 static bool gomp_nvptx_main_declared
;
1426 if (!gomp_nvptx_main_declared
)
1428 gomp_nvptx_main_declared
= true;
1429 write_fn_marker (func_decls
, false, true, "gomp_nvptx_main");
1430 func_decls
<< ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1431 << " %in_ar1, .param.u" << POINTER_SIZE
<< " %in_ar2);\n";
1433 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1434 #define NTID_Y "%ntid.y"
1435 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1436 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1439 .reg.u" PS " %R<4>;\n\
1440 mov.u32 %r0, %tid.y;\n\
1441 mov.u32 %r1, " NTID_Y ";\n\
1442 mov.u32 %r2, %ctaid.x;\n\
1443 cvt.u" PS ".u32 %R1, %r0;\n\
1444 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1445 mov.u" PS " %R0, __nvptx_stacks;\n\
1446 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1447 ld.param.u" PS " %R2, [%stack];\n\
1448 ld.param.u" PS " %R3, [%sz];\n\
1449 add.u" PS " %R2, %R2, %R3;\n\
1450 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1451 st.shared.u" PS " [%R0], %R2;\n\
1452 mov.u" PS " %R0, __nvptx_uni;\n\
1453 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1455 st.shared.u32 [%R0], %r0;\n\
1456 mov.u" PS " %R0, \0;\n\
1457 ld.param.u" PS " %R1, [%arg];\n\
1459 .param.u" PS " %P<2>;\n\
1460 st.param.u" PS " [%P0], %R0;\n\
1461 st.param.u" PS " [%P1], %R1;\n\
1462 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1466 static const char entry64
[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1467 static const char entry32
[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1468 #undef ENTRY_TEMPLATE
1470 const char *entry_1
= TARGET_ABI64
? entry64
: entry32
;
1471 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1472 const char *entry_2
= entry_1
+ strlen (entry64
) + 1;
1473 fprintf (file
, ".visible .entry %s%s%s%s", name
, entry_1
, orig
, entry_2
);
1474 need_softstack_decl
= need_unisimt_decl
= true;
1477 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1478 function, including local var decls and copies from the arguments to
1482 nvptx_declare_function_name (FILE *file
, const char *name
, const_tree decl
)
1484 tree fntype
= TREE_TYPE (decl
);
1485 tree result_type
= TREE_TYPE (fntype
);
1487 bool force_public
= false;
1489 /* For reverse-offload 'nohost' functions: In order to be collectable in
1490 '$offload_func_table', cf. mkoffload.cc, the function has to be visible. */
1491 if (lookup_attribute ("omp target device_ancestor_nohost",
1492 DECL_ATTRIBUTES (decl
)))
1493 force_public
= true;
1494 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl
))
1495 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl
)))
1497 char *buf
= (char *) alloca (strlen (name
) + sizeof ("$impl"));
1498 sprintf (buf
, "%s$impl", name
);
1499 write_omp_entry (file
, name
, buf
);
1502 /* We construct the initial part of the function into a string
1503 stream, in order to share the prototype writing code. */
1504 std::stringstream s
;
1505 write_fn_proto (s
, true, name
, decl
, force_public
);
1508 bool return_in_mem
= write_return_type (s
, false, result_type
);
1510 argno
= write_arg_type (s
, 0, argno
, ptr_type_node
, true);
1512 /* Declare and initialize incoming arguments. */
1513 tree args
= TYPE_ARG_TYPES (fntype
);
1514 bool prototyped
= true;
1517 args
= DECL_ARGUMENTS (decl
);
1521 for (; args
!= NULL_TREE
; args
= TREE_CHAIN (args
))
1523 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
1525 argno
= write_arg_type (s
, 0, argno
, type
, prototyped
);
1528 if (stdarg_p (fntype
))
1529 argno
= write_arg_type (s
, ARG_POINTER_REGNUM
, argno
, ptr_type_node
,
1532 if (DECL_STATIC_CHAIN (decl
) || cfun
->machine
->has_chain
)
1533 write_arg_type (s
, STATIC_CHAIN_REGNUM
,
1534 DECL_STATIC_CHAIN (decl
) ? argno
: -1, ptr_type_node
,
1537 fprintf (file
, "%s", s
.str().c_str());
1539 /* Usually 'crtl->is_leaf' is computed during register allocator
1540 initialization (which is not done on NVPTX) or for pressure-sensitive
1541 optimizations. Initialize it here, except if already set. */
1543 crtl
->is_leaf
= leaf_function_p ();
1545 HOST_WIDE_INT sz
= get_frame_size ();
1546 bool need_frameptr
= sz
|| cfun
->machine
->has_chain
;
1547 int alignment
= crtl
->stack_alignment_needed
/ BITS_PER_UNIT
;
1548 if (!TARGET_SOFT_STACK
)
1550 /* Declare a local var for outgoing varargs. */
1551 if (cfun
->machine
->has_varadic
)
1552 init_frame (file
, STACK_POINTER_REGNUM
,
1553 UNITS_PER_WORD
, crtl
->outgoing_args_size
);
1555 /* Declare a local variable for the frame. Force its size to be
1556 DImode-compatible. */
1558 init_frame (file
, FRAME_POINTER_REGNUM
, alignment
,
1559 ROUND_UP (sz
, GET_MODE_SIZE (DImode
)));
1561 else if (need_frameptr
|| cfun
->machine
->has_varadic
|| cfun
->calls_alloca
1562 || (cfun
->machine
->has_simtreg
&& !crtl
->is_leaf
))
1563 init_softstack_frame (file
, alignment
, sz
);
1565 if (cfun
->machine
->has_simtreg
)
1567 unsigned HOST_WIDE_INT
&simtsz
= cfun
->machine
->simt_stack_size
;
1568 unsigned HOST_WIDE_INT
&align
= cfun
->machine
->simt_stack_align
;
1569 align
= MAX (align
, GET_MODE_SIZE (DImode
));
1570 if (!crtl
->is_leaf
|| cfun
->calls_alloca
)
1571 simtsz
= HOST_WIDE_INT_M1U
;
1572 if (simtsz
== HOST_WIDE_INT_M1U
)
1573 simtsz
= nvptx_softstack_size
;
1574 if (cfun
->machine
->has_softstack
)
1575 simtsz
+= POINTER_SIZE
/ 8;
1576 simtsz
= ROUND_UP (simtsz
, GET_MODE_SIZE (DImode
));
1577 if (align
> GET_MODE_SIZE (DImode
))
1578 simtsz
+= align
- GET_MODE_SIZE (DImode
);
1580 fprintf (file
, "\t.local.align 8 .b8 %%simtstack_ar["
1581 HOST_WIDE_INT_PRINT_DEC
"];\n", simtsz
);
1584 /* Restore the vector reduction partition register, if necessary.
1585 FIXME: Find out when and why this is necessary, and fix it. */
1586 if (cfun
->machine
->red_partition
)
1587 regno_reg_rtx
[REGNO (cfun
->machine
->red_partition
)]
1588 = cfun
->machine
->red_partition
;
1590 /* Declare the pseudos we have as ptx registers. */
1591 int maxregs
= max_reg_num ();
1592 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< maxregs
; i
++)
1594 if (regno_reg_rtx
[i
] != const0_rtx
)
1596 machine_mode mode
= PSEUDO_REGNO_MODE (i
);
1597 machine_mode split
= maybe_split_mode (mode
);
1599 if (split_mode_p (mode
))
1601 fprintf (file
, "\t.reg%s ", nvptx_ptx_type_from_mode (mode
, true));
1602 output_reg (file
, i
, split
, -2);
1603 fprintf (file
, ";\n");
1607 /* Emit axis predicates. */
1608 if (cfun
->machine
->axis_predicate
[0])
1609 nvptx_init_axis_predicate (file
,
1610 REGNO (cfun
->machine
->axis_predicate
[0]), "y");
1611 if (cfun
->machine
->axis_predicate
[1])
1612 nvptx_init_axis_predicate (file
,
1613 REGNO (cfun
->machine
->axis_predicate
[1]), "x");
1614 if (cfun
->machine
->unisimt_predicate
1615 || (cfun
->machine
->has_simtreg
&& !crtl
->is_leaf
))
1616 nvptx_init_unisimt_predicate (file
);
1617 if (cfun
->machine
->bcast_partition
|| cfun
->machine
->sync_bar
)
1618 nvptx_init_oacc_workers (file
);
1621 /* Output code for switching uniform-simt state. ENTERING indicates whether
1622 we are entering or leaving non-uniform execution region. */
1625 nvptx_output_unisimt_switch (FILE *file
, bool entering
)
1627 if (crtl
->is_leaf
&& !cfun
->machine
->unisimt_predicate
)
1629 fprintf (file
, "\t{\n");
1630 fprintf (file
, "\t\t.reg.u32 %%ustmp2;\n");
1631 fprintf (file
, "\t\tmov.u32 %%ustmp2, %d;\n", entering
? -1 : 0);
1632 if (cfun
->machine
->unisimt_outside_simt_predicate
)
1634 int pred_outside_simt
1635 = REGNO (cfun
->machine
->unisimt_outside_simt_predicate
);
1636 fprintf (file
, "\t\tmov.pred %%r%d, %d;\n", pred_outside_simt
,
1641 int loc
= REGNO (cfun
->machine
->unisimt_location
);
1642 fprintf (file
, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc
);
1644 if (cfun
->machine
->unisimt_predicate
)
1646 int master
= REGNO (cfun
->machine
->unisimt_master
);
1647 int pred
= REGNO (cfun
->machine
->unisimt_predicate
);
1648 fprintf (file
, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1649 fprintf (file
, "\t\tmov.u32 %%r%d, %s;\n",
1650 master
, entering
? "%ustmp2" : "0");
1651 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred
, master
);
1653 fprintf (file
, "\t}\n");
1656 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1657 ENTERING indicates whether we are entering or leaving non-uniform execution.
1658 PTR is the register pointing to allocated storage, it is assigned to on
1659 entering and used to restore state on leaving. SIZE and ALIGN are used only
1663 nvptx_output_softstack_switch (FILE *file
, bool entering
,
1664 rtx ptr
, rtx size
, rtx align
)
1666 gcc_assert (REG_P (ptr
) && !HARD_REGISTER_P (ptr
));
1667 if (crtl
->is_leaf
&& !cfun
->machine
->simt_stack_size
)
1669 int bits
= POINTER_SIZE
, regno
= REGNO (ptr
);
1670 fprintf (file
, "\t{\n");
1673 fprintf (file
, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1674 HOST_WIDE_INT_PRINT_DEC
";\n", bits
, regno
,
1675 cfun
->machine
->simt_stack_size
);
1676 fprintf (file
, "\t\tsub.u%d %%r%d, %%r%d, ", bits
, regno
, regno
);
1677 if (CONST_INT_P (size
))
1678 fprintf (file
, HOST_WIDE_INT_PRINT_DEC
,
1679 ROUND_UP (UINTVAL (size
), GET_MODE_SIZE (DImode
)));
1681 output_reg (file
, REGNO (size
), VOIDmode
);
1682 fputs (";\n", file
);
1683 if (!CONST_INT_P (size
) || UINTVAL (align
) > GET_MODE_SIZE (DImode
))
1685 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC
";\n",
1686 bits
, regno
, regno
, UINTVAL (align
));
1688 if (cfun
->machine
->has_softstack
)
1690 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
1693 fprintf (file
, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1694 bits
, regno
, bits
/ 8, reg_stack
);
1695 fprintf (file
, "\t\tsub.u%d %s, %%r%d, %d;\n",
1696 bits
, reg_stack
, regno
, bits
/ 8);
1700 fprintf (file
, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1701 bits
, reg_stack
, regno
, bits
/ 8);
1703 nvptx_output_set_softstack (REGNO (stack_pointer_rtx
));
1705 fprintf (file
, "\t}\n");
1708 /* Output code to enter non-uniform execution region. DEST is a register
1709 to hold a per-lane allocation given by SIZE and ALIGN. */
1712 nvptx_output_simt_enter (rtx dest
, rtx size
, rtx align
)
1714 nvptx_output_unisimt_switch (asm_out_file
, true);
1715 nvptx_output_softstack_switch (asm_out_file
, true, dest
, size
, align
);
1719 /* Output code to leave non-uniform execution region. SRC is the register
1720 holding per-lane storage previously allocated by omp_simt_enter insn. */
1723 nvptx_output_simt_exit (rtx src
)
1725 nvptx_output_unisimt_switch (asm_out_file
, false);
1726 nvptx_output_softstack_switch (asm_out_file
, false, src
, NULL_RTX
, NULL_RTX
);
1730 /* Output instruction that sets soft stack pointer in shared memory to the
1731 value in register given by SRC_REGNO. */
1734 nvptx_output_set_softstack (unsigned src_regno
)
1736 if (cfun
->machine
->has_softstack
&& !crtl
->is_leaf
)
1738 fprintf (asm_out_file
, "\tst.shared.u%d\t[%s], ",
1739 POINTER_SIZE
, reg_names
[SOFTSTACK_SLOT_REGNUM
]);
1740 output_reg (asm_out_file
, src_regno
, VOIDmode
);
1741 fprintf (asm_out_file
, ";\n");
1745 /* Output a return instruction. Also copy the return value to its outgoing
1749 nvptx_output_return (void)
1751 machine_mode mode
= (machine_mode
)cfun
->machine
->return_mode
;
1753 if (mode
!= VOIDmode
)
1754 fprintf (asm_out_file
, "\tst.param%s\t[%s_out], %s;\n",
1755 nvptx_ptx_type_from_mode (mode
, false),
1756 reg_names
[NVPTX_RETURN_REGNUM
],
1757 reg_names
[NVPTX_RETURN_REGNUM
]);
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}
1770 /* Decide whether we can make a sibling call to a function. For ptx, we
1774 nvptx_function_ok_for_sibcall (tree
, tree
)
1779 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1782 nvptx_get_drap_rtx (void)
1784 if (TARGET_SOFT_STACK
&& stack_realign_drap
)
1785 return arg_pointer_rtx
;
1789 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1790 argument to the next call. */
1793 nvptx_call_args (rtx arg
, tree fntype
)
1795 if (!cfun
->machine
->doing_call
)
1797 cfun
->machine
->doing_call
= true;
1798 cfun
->machine
->is_varadic
= false;
1799 cfun
->machine
->num_args
= 0;
1801 if (fntype
&& stdarg_p (fntype
))
1803 cfun
->machine
->is_varadic
= true;
1804 cfun
->machine
->has_varadic
= true;
1805 cfun
->machine
->num_args
++;
1809 if (REG_P (arg
) && arg
!= pc_rtx
)
1811 cfun
->machine
->num_args
++;
1812 cfun
->machine
->call_args
= alloc_EXPR_LIST (VOIDmode
, arg
,
1813 cfun
->machine
->call_args
);
1817 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1818 information we recorded. */
1821 nvptx_end_call_args (void)
1823 cfun
->machine
->doing_call
= false;
1824 free_EXPR_LIST_list (&cfun
->machine
->call_args
);
1827 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1828 track of whether calls involving static chains or varargs were seen
1829 in the current function.
1830 For libcalls, maintain a hash table of decls we have seen, and
1831 record a function decl for later when encountering a new one. */
1834 nvptx_expand_call (rtx retval
, rtx address
)
1836 rtx callee
= XEXP (address
, 0);
1837 rtx varargs
= NULL_RTX
;
1838 unsigned parallel
= 0;
1840 if (!call_insn_operand (callee
, Pmode
))
1842 callee
= force_reg (Pmode
, callee
);
1843 address
= change_address (address
, QImode
, callee
);
1846 if (GET_CODE (callee
) == SYMBOL_REF
)
1848 tree decl
= SYMBOL_REF_DECL (callee
);
1849 if (decl
!= NULL_TREE
)
1851 if (DECL_STATIC_CHAIN (decl
))
1852 cfun
->machine
->has_chain
= true;
1854 tree attr
= oacc_get_fn_attrib (decl
);
1857 tree dims
= TREE_VALUE (attr
);
1859 parallel
= GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1;
1860 for (int ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
1862 if (TREE_PURPOSE (dims
)
1863 && !integer_zerop (TREE_PURPOSE (dims
)))
1865 /* Not on this axis. */
1866 parallel
^= GOMP_DIM_MASK (ix
);
1867 dims
= TREE_CHAIN (dims
);
1873 unsigned nargs
= cfun
->machine
->num_args
;
1874 if (cfun
->machine
->is_varadic
)
1876 varargs
= gen_reg_rtx (Pmode
);
1877 emit_move_insn (varargs
, stack_pointer_rtx
);
1880 rtvec vec
= rtvec_alloc (nargs
+ 1);
1881 rtx pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
1884 rtx call
= gen_rtx_CALL (VOIDmode
, address
, const0_rtx
);
1885 rtx tmp_retval
= retval
;
1888 if (!nvptx_register_operand (retval
, GET_MODE (retval
)))
1889 tmp_retval
= gen_reg_rtx (GET_MODE (retval
));
1890 call
= gen_rtx_SET (tmp_retval
, call
);
1892 XVECEXP (pat
, 0, vec_pos
++) = call
;
1894 /* Construct the call insn, including a USE for each argument pseudo
1895 register. These will be used when printing the insn. */
1896 for (rtx arg
= cfun
->machine
->call_args
; arg
; arg
= XEXP (arg
, 1))
1897 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, XEXP (arg
, 0));
1900 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, varargs
);
1902 gcc_assert (vec_pos
= XVECLEN (pat
, 0));
1904 nvptx_emit_forking (parallel
, true);
1905 emit_call_insn (pat
);
1906 nvptx_emit_joining (parallel
, true);
1908 if (tmp_retval
!= retval
)
1909 emit_move_insn (retval
, tmp_retval
);
1912 /* Emit a comparison COMPARE, and return the new test to be used in the
1916 nvptx_expand_compare (rtx compare
)
1918 rtx pred
= gen_reg_rtx (BImode
);
1919 rtx cmp
= gen_rtx_fmt_ee (GET_CODE (compare
), BImode
,
1920 XEXP (compare
, 0), XEXP (compare
, 1));
1921 emit_insn (gen_rtx_SET (pred
, cmp
));
1922 return gen_rtx_NE (BImode
, pred
, const0_rtx
);
1925 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1928 nvptx_expand_oacc_fork (unsigned mode
)
1930 nvptx_emit_forking (GOMP_DIM_MASK (mode
), false);
1934 nvptx_expand_oacc_join (unsigned mode
)
1936 nvptx_emit_joining (GOMP_DIM_MASK (mode
), false);
1939 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1943 nvptx_gen_unpack (rtx dst0
, rtx dst1
, rtx src
)
1947 switch (GET_MODE (src
))
1950 res
= gen_unpackdisi2 (dst0
, dst1
, src
);
1953 res
= gen_unpackdfsi2 (dst0
, dst1
, src
);
1955 default: gcc_unreachable ();
1960 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1964 nvptx_gen_pack (rtx dst
, rtx src0
, rtx src1
)
1968 switch (GET_MODE (dst
))
1971 res
= gen_packsidi2 (dst
, src0
, src1
);
1974 res
= gen_packsidf2 (dst
, src0
, src1
);
1976 default: gcc_unreachable ();
1981 /* Generate an instruction or sequence to broadcast register REG
1982 across the vectors of a single warp. */
1985 nvptx_gen_shuffle (rtx dst
, rtx src
, rtx idx
, nvptx_shuffle_kind kind
)
1989 switch (GET_MODE (dst
))
1994 gcc_assert (GET_CODE (dst
) == CONCAT
);
1995 gcc_assert (GET_CODE (src
) == CONCAT
);
1996 rtx dst_real
= XEXP (dst
, 0);
1997 rtx dst_imag
= XEXP (dst
, 1);
1998 rtx src_real
= XEXP (src
, 0);
1999 rtx src_imag
= XEXP (src
, 1);
2002 emit_insn (nvptx_gen_shuffle (dst_real
, src_real
, idx
, kind
));
2003 emit_insn (nvptx_gen_shuffle (dst_imag
, src_imag
, idx
, kind
));
2009 res
= gen_nvptx_shufflesi (dst
, src
, idx
, GEN_INT (kind
));
2012 res
= gen_nvptx_shufflesf (dst
, src
, idx
, GEN_INT (kind
));
2017 rtx tmp0
= gen_reg_rtx (SImode
);
2018 rtx tmp1
= gen_reg_rtx (SImode
);
2021 emit_insn (nvptx_gen_unpack (tmp0
, tmp1
, src
));
2022 emit_insn (nvptx_gen_shuffle (tmp0
, tmp0
, idx
, kind
));
2023 emit_insn (nvptx_gen_shuffle (tmp1
, tmp1
, idx
, kind
));
2024 emit_insn (nvptx_gen_pack (dst
, tmp0
, tmp1
));
2031 rtx src0
= gen_rtx_SUBREG (SImode
, src
, 0);
2032 rtx src1
= gen_rtx_SUBREG (SImode
, src
, 4);
2033 rtx dst0
= gen_rtx_SUBREG (SImode
, dst
, 0);
2034 rtx dst1
= gen_rtx_SUBREG (SImode
, dst
, 4);
2035 rtx tmp0
= gen_reg_rtx (SImode
);
2036 rtx tmp1
= gen_reg_rtx (SImode
);
2038 emit_insn (gen_movsi (tmp0
, src0
));
2039 emit_insn (gen_movsi (tmp1
, src1
));
2040 emit_insn (nvptx_gen_shuffle (tmp0
, tmp0
, idx
, kind
));
2041 emit_insn (nvptx_gen_shuffle (tmp1
, tmp1
, idx
, kind
));
2042 emit_insn (gen_movsi (dst0
, tmp0
));
2043 emit_insn (gen_movsi (dst1
, tmp1
));
2050 rtx src0
= gen_rtx_SUBREG (DImode
, src
, 0);
2051 rtx src1
= gen_rtx_SUBREG (DImode
, src
, 8);
2052 rtx dst0
= gen_rtx_SUBREG (DImode
, dst
, 0);
2053 rtx dst1
= gen_rtx_SUBREG (DImode
, dst
, 8);
2054 rtx tmp0
= gen_reg_rtx (DImode
);
2055 rtx tmp1
= gen_reg_rtx (DImode
);
2057 emit_insn (gen_movdi (tmp0
, src0
));
2058 emit_insn (gen_movdi (tmp1
, src1
));
2059 emit_insn (nvptx_gen_shuffle (tmp0
, tmp0
, idx
, kind
));
2060 emit_insn (nvptx_gen_shuffle (tmp1
, tmp1
, idx
, kind
));
2061 emit_insn (gen_movdi (dst0
, tmp0
));
2062 emit_insn (gen_movdi (dst1
, tmp1
));
2069 rtx tmp
= gen_reg_rtx (SImode
);
2072 emit_insn (gen_sel_truesi (tmp
, src
, GEN_INT (1), const0_rtx
));
2073 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
2074 emit_insn (gen_rtx_SET (dst
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
2082 rtx tmp
= gen_reg_rtx (SImode
);
2085 emit_insn (gen_rtx_SET (tmp
, gen_rtx_fmt_e (ZERO_EXTEND
, SImode
, src
)));
2086 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
2087 emit_insn (gen_rtx_SET (dst
, gen_rtx_fmt_e (TRUNCATE
, GET_MODE (dst
),
2100 /* Generate an instruction or sequence to broadcast register REG
2101 across the vectors of a single warp. */
2104 nvptx_gen_warp_bcast (rtx reg
)
2106 return nvptx_gen_shuffle (reg
, reg
, const0_rtx
, SHUFFLE_IDX
);
2109 /* Structure used when generating a worker-level spill or fill. */
2111 struct broadcast_data_t
2113 rtx base
; /* Register holding base addr of buffer. */
2114 rtx ptr
; /* Iteration var, if needed. */
2115 unsigned offset
; /* Offset into worker buffer. */
/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };
2130 /* Generate instruction(s) to spill or fill register REG to/from the
2131 worker broadcast array. PM indicates what is to be done, REP
2132 how many loop iterations will be executed (0 for not a loop). */
2135 nvptx_gen_shared_bcast (rtx reg
, propagate_mask pm
, unsigned rep
,
2136 broadcast_data_t
*data
, bool vector
)
2139 machine_mode mode
= GET_MODE (reg
);
2145 rtx tmp
= gen_reg_rtx (SImode
);
2149 emit_insn (gen_sel_truesi (tmp
, reg
, GEN_INT (1), const0_rtx
));
2150 emit_insn (nvptx_gen_shared_bcast (tmp
, pm
, rep
, data
, vector
));
2152 emit_insn (gen_rtx_SET (reg
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
2160 rtx addr
= data
->ptr
;
2164 unsigned align
= GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
;
2166 oacc_bcast_align
= MAX (oacc_bcast_align
, align
);
2167 data
->offset
= ROUND_UP (data
->offset
, align
);
2169 gcc_assert (data
->base
!= NULL
);
2171 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (data
->offset
));
2174 addr
= gen_rtx_MEM (mode
, addr
);
2176 res
= gen_rtx_SET (addr
, reg
);
2177 else if (pm
== PM_write
)
2178 res
= gen_rtx_SET (reg
, addr
);
2184 /* We're using a ptr, increment it. */
2188 emit_insn (gen_adddi3 (data
->ptr
, data
->ptr
,
2189 GEN_INT (GET_MODE_SIZE (GET_MODE (reg
)))));
2195 data
->offset
+= rep
* GET_MODE_SIZE (GET_MODE (reg
));
2202 /* Returns true if X is a valid address for use in a memory reference. */
2205 nvptx_legitimate_address_p (machine_mode
, rtx x
, bool)
2207 enum rtx_code code
= GET_CODE (x
);
2215 if (REG_P (XEXP (x
, 0)) && CONST_INT_P (XEXP (x
, 1)))
2229 /* Machinery to output constant initializers. When beginning an
2230 initializer, we decide on a fragment size (which is visible in ptx
2231 in the type used), and then all initializer data is buffered until
2232 a fragment is filled and ready to be written out. */
2236 unsigned HOST_WIDE_INT mask
; /* Mask for storing fragment. */
2237 unsigned HOST_WIDE_INT val
; /* Current fragment value. */
2238 unsigned HOST_WIDE_INT remaining
; /* Remaining bytes to be written
2240 unsigned size
; /* Fragment size to accumulate. */
2241 unsigned offset
; /* Offset within current fragment. */
2242 bool started
; /* Whether we've output any initializer. */
2245 /* The current fragment is full, write it out. SYM may provide a
2246 symbolic reference we should output, in which case the fragment
2247 value is the addend. */
2250 output_init_frag (rtx sym
)
2252 fprintf (asm_out_file
, init_frag
.started
? ", " : " = { ");
2253 unsigned HOST_WIDE_INT val
= init_frag
.val
;
2255 init_frag
.started
= true;
2257 init_frag
.offset
= 0;
2258 init_frag
.remaining
--;
2262 bool function
= (SYMBOL_REF_DECL (sym
)
2263 && (TREE_CODE (SYMBOL_REF_DECL (sym
)) == FUNCTION_DECL
));
2265 fprintf (asm_out_file
, "generic(");
2266 output_address (VOIDmode
, sym
);
2268 fprintf (asm_out_file
, ")");
2270 fprintf (asm_out_file
, " + ");
2274 fprintf (asm_out_file
, HOST_WIDE_INT_PRINT_DEC
, val
);
2277 /* Add value VAL of size SIZE to the data we're emitting, and keep
2278 writing out chunks as they fill up. */
2281 nvptx_assemble_value (unsigned HOST_WIDE_INT val
, unsigned size
)
2284 = val
& (HOST_WIDE_INT_1U
<< (HOST_BITS_PER_WIDE_INT
- 1));
2286 /* Avoid undefined behaviour. */
2287 if (size
* BITS_PER_UNIT
< HOST_BITS_PER_WIDE_INT
)
2288 val
&= (HOST_WIDE_INT_1U
<< (size
* BITS_PER_UNIT
)) - 1;
2290 for (unsigned part
= 0; size
; size
-= part
)
2292 if (part
* BITS_PER_UNIT
== HOST_BITS_PER_WIDE_INT
)
2293 /* Avoid undefined behaviour. */
2294 val
= negative_p
? -1 : 0;
2296 val
>>= (part
* BITS_PER_UNIT
);
2297 part
= init_frag
.size
- init_frag
.offset
;
2298 part
= MIN (part
, size
);
2300 unsigned HOST_WIDE_INT partial
2301 = val
<< (init_frag
.offset
* BITS_PER_UNIT
);
2302 init_frag
.val
|= partial
& init_frag
.mask
;
2303 init_frag
.offset
+= part
;
2305 if (init_frag
.offset
== init_frag
.size
)
2306 output_init_frag (NULL
);
2310 /* Target hook for assembling integer object X of size SIZE. */
2313 nvptx_assemble_integer (rtx x
, unsigned int size
, int ARG_UNUSED (aligned_p
))
2315 HOST_WIDE_INT val
= 0;
2317 switch (GET_CODE (x
))
2320 /* Let the generic machinery figure it out, usually for a
2325 nvptx_assemble_value (INTVAL (x
), size
);
2330 gcc_assert (GET_CODE (x
) == PLUS
);
2331 val
= INTVAL (XEXP (x
, 1));
2333 gcc_assert (GET_CODE (x
) == SYMBOL_REF
);
2334 gcc_fallthrough (); /* FALLTHROUGH */
2337 gcc_assert (size
== init_frag
.size
);
2338 if (init_frag
.offset
)
2339 sorry ("cannot emit unaligned pointers in ptx assembly");
2341 nvptx_maybe_record_fnsym (x
);
2342 init_frag
.val
= val
;
2343 output_init_frag (x
);
2350 /* Output SIZE zero bytes. We ignore the FILE argument since the
2351 functions we're calling to perform the output just use
2355 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size
)
2357 /* Finish the current fragment, if it's started. */
2358 if (init_frag
.offset
)
2360 unsigned part
= init_frag
.size
- init_frag
.offset
;
2361 part
= MIN (part
, (unsigned)size
);
2363 nvptx_assemble_value (0, part
);
2366 /* If this skip doesn't terminate the initializer, write as many
2367 remaining pieces as possible directly. */
2368 if (size
< init_frag
.remaining
* init_frag
.size
)
2370 while (size
>= init_frag
.size
)
2372 size
-= init_frag
.size
;
2373 output_init_frag (NULL_RTX
);
2376 nvptx_assemble_value (0, size
);
2380 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2381 ignore the FILE arg. */
2384 nvptx_output_ascii (FILE *, const char *str
, unsigned HOST_WIDE_INT size
)
2386 for (unsigned HOST_WIDE_INT i
= 0; i
< size
; i
++)
2387 nvptx_assemble_value (str
[i
], 1);
2390 /* Return true if TYPE is a record type where the last field is an array without
2394 flexible_array_member_type_p (const_tree type
)
2396 if (TREE_CODE (type
) != RECORD_TYPE
)
2399 const_tree last_field
= NULL_TREE
;
2400 for (const_tree f
= TYPE_FIELDS (type
); f
; f
= TREE_CHAIN (f
))
2406 const_tree last_field_type
= TREE_TYPE (last_field
);
2407 if (TREE_CODE (last_field_type
) != ARRAY_TYPE
)
2410 return (! TYPE_DOMAIN (last_field_type
)
2411 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type
)));
2414 /* Emit a PTX variable decl and prepare for emission of its
2415 initializer. NAME is the symbol name and SETION the PTX data
2416 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2417 The caller has already emitted any indentation and linkage
2418 specifier. It is responsible for any initializer, terminating ;
2419 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2420 this is the opposite way round that PTX wants them! */
2423 nvptx_assemble_decl_begin (FILE *file
, const char *name
, const char *section
,
2424 const_tree type
, HOST_WIDE_INT size
, unsigned align
,
2425 bool undefined
= false)
2427 bool atype
= (TREE_CODE (type
) == ARRAY_TYPE
)
2428 && (TYPE_DOMAIN (type
) == NULL_TREE
);
2430 if (undefined
&& flexible_array_member_type_p (type
))
2436 while (TREE_CODE (type
) == ARRAY_TYPE
)
2437 type
= TREE_TYPE (type
);
2439 if (TREE_CODE (type
) == VECTOR_TYPE
2440 || TREE_CODE (type
) == COMPLEX_TYPE
)
2441 /* Neither vector nor complex types can contain the other. */
2442 type
= TREE_TYPE (type
);
2444 unsigned HOST_WIDE_INT elt_size
= int_size_in_bytes (type
);
2446 /* Largest mode we're prepared to accept. For BLKmode types we
2447 don't know if it'll contain pointer constants, so have to choose
2448 pointer size, otherwise we can choose DImode. */
2449 machine_mode elt_mode
= TYPE_MODE (type
) == BLKmode
? Pmode
: DImode
;
2451 elt_size
|= GET_MODE_SIZE (elt_mode
);
2452 elt_size
&= -elt_size
; /* Extract LSB set. */
2454 init_frag
.size
= elt_size
;
2455 /* Avoid undefined shift behavior by using '2'. */
2456 init_frag
.mask
= ((unsigned HOST_WIDE_INT
)2
2457 << (elt_size
* BITS_PER_UNIT
- 1)) - 1;
2459 init_frag
.offset
= 0;
2460 init_frag
.started
= false;
2461 /* Size might not be a multiple of elt size, if there's an
2462 initialized trailing struct array with smaller type than
2464 init_frag
.remaining
= (size
+ elt_size
- 1) / elt_size
;
2466 fprintf (file
, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED
" ",
2467 section
, align
/ BITS_PER_UNIT
,
2468 elt_size
* BITS_PER_UNIT
);
2469 assemble_name (file
, name
);
2472 /* We make everything an array, to simplify any initialization
2474 fprintf (file
, "[" HOST_WIDE_INT_PRINT_UNSIGNED
"]", init_frag
.remaining
);
2476 fprintf (file
, "[]");
2479 /* Called when the initializer for a decl has been completely output through
2480 combinations of the three functions above. */
2483 nvptx_assemble_decl_end (void)
2485 if (init_frag
.offset
)
2486 /* This can happen with a packed struct with trailing array member. */
2487 nvptx_assemble_value (0, init_frag
.size
- init_frag
.offset
);
2488 fprintf (asm_out_file
, init_frag
.started
? " };\n" : ";\n");
2491 /* Output an uninitialized common or file-scope variable. */
2494 nvptx_output_aligned_decl (FILE *file
, const char *name
,
2495 const_tree decl
, HOST_WIDE_INT size
, unsigned align
)
2497 write_var_marker (file
, true, TREE_PUBLIC (decl
), name
);
2499 /* If this is public, it is common. The nearest thing we have to
2501 fprintf (file
, "\t%s", TREE_PUBLIC (decl
) ? ".weak " : "");
2503 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2504 TREE_TYPE (decl
), size
, align
);
2505 nvptx_assemble_decl_end ();
2508 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2509 writing a constant variable EXP with NAME and SIZE and its
2510 initializer to FILE. */
2513 nvptx_asm_declare_constant_name (FILE *file
, const char *name
,
2514 const_tree exp
, HOST_WIDE_INT obj_size
)
2516 write_var_marker (file
, true, false, name
);
2518 fprintf (file
, "\t");
2520 tree type
= TREE_TYPE (exp
);
2521 nvptx_assemble_decl_begin (file
, name
, ".const", type
, obj_size
,
2525 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2526 a variable DECL with NAME to FILE. */
2529 nvptx_declare_object_name (FILE *file
, const char *name
, const_tree decl
)
2531 write_var_marker (file
, true, TREE_PUBLIC (decl
), name
);
2533 fprintf (file
, "\t%s", (!TREE_PUBLIC (decl
) ? ""
2534 : DECL_WEAK (decl
) ? ".weak " : ".visible "));
2536 tree type
= TREE_TYPE (decl
);
2537 HOST_WIDE_INT obj_size
= tree_to_shwi (DECL_SIZE_UNIT (decl
));
2538 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2539 type
, obj_size
, DECL_ALIGN (decl
));
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}
2549 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2550 declaration only for variable DECL with NAME to FILE. */
2553 nvptx_assemble_undefined_decl (FILE *file
, const char *name
, const_tree decl
)
2555 /* The middle end can place constant pool decls into the varpool as
2556 undefined. Until that is fixed, catch the problem here. */
2557 if (DECL_IN_CONSTANT_POOL (decl
))
2560 /* We support weak defintions, and hence have the right
2561 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2562 if (DECL_WEAK (decl
))
2563 error_at (DECL_SOURCE_LOCATION (decl
),
2564 "PTX does not support weak declarations"
2565 " (only weak definitions)");
2566 write_var_marker (file
, false, TREE_PUBLIC (decl
), name
);
2568 fprintf (file
, "\t.extern ");
2569 tree size
= DECL_SIZE_UNIT (decl
);
2570 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2571 TREE_TYPE (decl
), size
? tree_to_shwi (size
) : 0,
2572 DECL_ALIGN (decl
), true);
2573 nvptx_assemble_decl_end ();
2576 /* Output a pattern for a move instruction. */
2579 nvptx_output_mov_insn (rtx dst
, rtx src
)
2581 machine_mode dst_mode
= GET_MODE (dst
);
2582 machine_mode src_mode
= GET_MODE (src
);
2583 machine_mode dst_inner
= (GET_CODE (dst
) == SUBREG
2584 ? GET_MODE (XEXP (dst
, 0)) : dst_mode
);
2585 machine_mode src_inner
= (GET_CODE (src
) == SUBREG
2586 ? GET_MODE (XEXP (src
, 0)) : dst_mode
);
2589 if (GET_CODE (sym
) == CONST
)
2590 sym
= XEXP (XEXP (sym
, 0), 0);
2591 if (SYMBOL_REF_P (sym
))
2593 if (SYMBOL_DATA_AREA (sym
) != DATA_AREA_GENERIC
)
2594 return "%.\tcvta%D1%t0\t%0, %1;";
2595 nvptx_maybe_record_fnsym (sym
);
2598 if (src_inner
== dst_inner
)
2599 return "%.\tmov%t0\t%0, %1;";
2601 if (CONSTANT_P (src
))
2602 return (GET_MODE_CLASS (dst_inner
) == MODE_INT
2603 && GET_MODE_CLASS (src_inner
) != MODE_FLOAT
2604 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2606 if (GET_MODE_SIZE (dst_inner
) == GET_MODE_SIZE (src_inner
))
2608 if (GET_MODE_BITSIZE (dst_mode
) == 128
2609 && GET_MODE_BITSIZE (src_mode
) == 128)
2611 /* mov.b128 is not supported. */
2612 if (dst_inner
== V2DImode
&& src_inner
== TImode
)
2613 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2614 else if (dst_inner
== TImode
&& src_inner
== V2DImode
)
2615 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2619 return "%.\tmov.b%T0\t%0, %1;";
2622 if (GET_MODE_BITSIZE (src_inner
) == 128
2623 && GET_MODE_BITSIZE (src_mode
) == 64)
2624 return "%.\tmov.b%T0\t%0, %1;";
2626 return "%.\tcvt%t0%t1\t%0, %1;";
2629 /* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL. */
2632 nvptx_output_barrier (rtx
*mem_operand
, int memmodel
, bool pre_p
)
2634 bool post_p
= !pre_p
;
2638 case MEMMODEL_RELAXED
:
2640 case MEMMODEL_CONSUME
:
2641 case MEMMODEL_ACQUIRE
:
2642 case MEMMODEL_SYNC_ACQUIRE
:
2646 case MEMMODEL_RELEASE
:
2647 case MEMMODEL_SYNC_RELEASE
:
2651 case MEMMODEL_ACQ_REL
:
2652 case MEMMODEL_SEQ_CST
:
2653 case MEMMODEL_SYNC_SEQ_CST
:
2654 if (pre_p
|| post_p
)
2661 output_asm_insn ("%.\tmembar%B0;", mem_operand
);
2665 nvptx_output_atomic_insn (const char *asm_template
, rtx
*operands
, int mem_pos
,
2668 nvptx_output_barrier (&operands
[mem_pos
], INTVAL (operands
[memmodel_pos
]),
2670 output_asm_insn (asm_template
, operands
);
2671 nvptx_output_barrier (&operands
[mem_pos
], INTVAL (operands
[memmodel_pos
]),
2676 static void nvptx_print_operand (FILE *, rtx
, int);
2678 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2679 involves writing .param declarations and in/out copies into them. For
2680 indirect calls, also write the .callprototype. */
2683 nvptx_output_call_insn (rtx_insn
*insn
, rtx result
, rtx callee
)
2687 bool needs_tgt
= register_operand (callee
, Pmode
);
2688 rtx pat
= PATTERN (insn
);
2689 if (GET_CODE (pat
) == COND_EXEC
)
2690 pat
= COND_EXEC_CODE (pat
);
2691 int arg_end
= XVECLEN (pat
, 0);
2692 tree decl
= NULL_TREE
;
2694 fprintf (asm_out_file
, "\t{\n");
2696 fprintf (asm_out_file
, "\t\t.param%s %s_in;\n",
2697 nvptx_ptx_type_from_mode (GET_MODE (result
), false),
2698 reg_names
[NVPTX_RETURN_REGNUM
]);
2700 /* Ensure we have a ptx declaration in the output if necessary. */
2701 if (GET_CODE (callee
) == SYMBOL_REF
)
2703 decl
= SYMBOL_REF_DECL (callee
);
2705 || (DECL_EXTERNAL (decl
) && !TYPE_ARG_TYPES (TREE_TYPE (decl
))))
2706 nvptx_record_libfunc (callee
, result
, pat
);
2707 else if (DECL_EXTERNAL (decl
))
2708 nvptx_record_fndecl (decl
);
2713 ASM_GENERATE_INTERNAL_LABEL (buf
, "LCT", labelno
);
2715 ASM_OUTPUT_LABEL (asm_out_file
, buf
);
2716 std::stringstream s
;
2717 write_fn_proto_from_insn (s
, NULL
, result
, pat
);
2718 fputs (s
.str().c_str(), asm_out_file
);
2721 for (int argno
= 1; argno
< arg_end
; argno
++)
2723 rtx t
= XEXP (XVECEXP (pat
, 0, argno
), 0);
2724 machine_mode mode
= GET_MODE (t
);
2725 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
2727 /* Mode splitting has already been done. */
2728 fprintf (asm_out_file
, "\t\t.param%s %%out_arg%d;\n"
2729 "\t\tst.param%s [%%out_arg%d], ",
2730 ptx_type
, argno
, ptx_type
, argno
);
2731 output_reg (asm_out_file
, REGNO (t
), VOIDmode
);
2732 fprintf (asm_out_file
, ";\n");
2735 /* The '.' stands for the call's predicate, if any. */
2736 nvptx_print_operand (asm_out_file
, NULL_RTX
, '.');
2737 fprintf (asm_out_file
, "\t\tcall ");
2738 if (result
!= NULL_RTX
)
2739 fprintf (asm_out_file
, "(%s_in), ", reg_names
[NVPTX_RETURN_REGNUM
]);
2743 char *replaced_dots
= NULL
;
2744 const char *name
= get_fnname_from_decl (decl
);
2745 const char *replacement
= nvptx_name_replacement (name
);
2746 if (replacement
!= name
)
2750 replaced_dots
= nvptx_replace_dot (name
);
2752 name
= replaced_dots
;
2754 assemble_name (asm_out_file
, name
);
2756 XDELETE (replaced_dots
);
2759 output_address (VOIDmode
, callee
);
2761 const char *open
= "(";
2762 for (int argno
= 1; argno
< arg_end
; argno
++)
2764 fprintf (asm_out_file
, ", %s%%out_arg%d", open
, argno
);
2767 if (decl
&& DECL_STATIC_CHAIN (decl
))
2769 fprintf (asm_out_file
, ", %s%s", open
, reg_names
[STATIC_CHAIN_REGNUM
]);
2773 fprintf (asm_out_file
, ")");
2777 fprintf (asm_out_file
, ", ");
2778 assemble_name (asm_out_file
, buf
);
2780 fprintf (asm_out_file
, ";\n");
2782 if (find_reg_note (insn
, REG_NORETURN
, NULL
))
2784 /* No return functions confuse the PTX JIT, as it doesn't realize
2785 the flow control barrier they imply. It can seg fault if it
2786 encounters what looks like an unexitable loop. Emit a trailing
2787 trap and exit, which it does grok. */
2788 fprintf (asm_out_file
, "\t\ttrap; // (noreturn)\n");
2789 fprintf (asm_out_file
, "\t\texit; // (noreturn)\n");
2794 static char rval
[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2797 /* We must escape the '%' that starts RETURN_REGNUM. */
2798 sprintf (rval
, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2799 reg_names
[NVPTX_RETURN_REGNUM
]);
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  /* Only the predicate ('.') and rounding-mode ('#') punctuation.  */
  return c == '.' || c == '#';
}
2814 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2817 nvptx_print_address_operand (FILE *file
, rtx x
, machine_mode
)
2820 if (GET_CODE (x
) == CONST
)
2822 switch (GET_CODE (x
))
2826 output_address (VOIDmode
, XEXP (x
, 0));
2827 fprintf (file
, "+");
2828 output_address (VOIDmode
, off
);
2833 output_addr_const (file
, x
);
2837 gcc_assert (GET_CODE (x
) != MEM
);
2838 nvptx_print_operand (file
, x
, 0);
2843 /* Write assembly language output for the address ADDR to FILE. */
2846 nvptx_print_operand_address (FILE *file
, machine_mode mode
, rtx addr
)
2848 nvptx_print_address_operand (file
, addr
, mode
);
2851 static nvptx_data_area
2852 nvptx_mem_data_area (const_rtx x
)
2854 gcc_assert (GET_CODE (x
) == MEM
);
2856 const_rtx addr
= XEXP (x
, 0);
2857 subrtx_iterator::array_type array
;
2858 FOR_EACH_SUBRTX (iter
, array
, addr
, ALL
)
2859 if (SYMBOL_REF_P (*iter
))
2860 return SYMBOL_DATA_AREA (*iter
);
2862 return DATA_AREA_GENERIC
;
2866 nvptx_mem_maybe_shared_p (const_rtx x
)
2868 nvptx_data_area area
= nvptx_mem_data_area (x
);
2869 return area
== DATA_AREA_SHARED
|| area
== DATA_AREA_GENERIC
;
2872 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2875 . -- print the predicate for the instruction or an emptry string for an
2877 # -- print a rounding mode for the instruction
2879 A -- print a data area for a MEM
2880 c -- print an opcode suffix for a comparison operator, including a type code
2881 D -- print a data area for a MEM operand
2882 S -- print a shuffle kind specified by CONST_INT
2883 t -- print a type opcode suffix, promoting QImode to 32 bits
2884 T -- print a type size in bits
2885 u -- print a type opcode suffix without promotions.
2886 p -- print a '!' for constant 0.
2887 x -- print a destination operand that may also be a bit bucket. */
2890 nvptx_print_operand (FILE *file
, rtx x
, int code
)
2894 x
= current_insn_predicate
;
2898 if (GET_CODE (x
) == EQ
)
2900 output_reg (file
, REGNO (XEXP (x
, 0)), VOIDmode
);
2904 else if (code
== '#')
2906 fputs (".rn", file
);
2910 enum rtx_code x_code
= GET_CODE (x
);
2911 machine_mode mode
= GET_MODE (x
);
2916 if (current_output_insn
!= NULL
2917 && find_reg_note (current_output_insn
, REG_UNUSED
, x
) != NULL_RTX
)
2924 if (SYMBOL_REF_P (XEXP (x
, 0)))
2925 switch (SYMBOL_DATA_AREA (XEXP (x
, 0)))
2927 case DATA_AREA_GENERIC
:
2928 /* Assume worst-case: global. */
2929 gcc_fallthrough (); /* FALLTHROUGH. */
2930 case DATA_AREA_GLOBAL
:
2932 case DATA_AREA_SHARED
:
2933 fputs (".cta", file
);
2935 case DATA_AREA_LOCAL
:
2936 case DATA_AREA_CONST
:
2937 case DATA_AREA_PARAM
:
2942 /* There are 2 cases where membar.sys differs from membar.gl:
2943 - host accesses global memory (f.i. systemwide atomics)
2944 - 2 or more devices are setup in peer-to-peer mode, and one
2945 peer can access global memory of other peer.
2946 Neither are currently supported by openMP/OpenACC on nvptx, but
2947 that could change, so we default to membar.sys. We could support
2948 this more optimally by adding DATA_AREA_SYS and then emitting
2949 .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS. */
2950 fputs (".sys", file
);
2955 gcc_fallthrough (); /* FALLTHROUGH. */
2958 if (GET_CODE (x
) == CONST
)
2960 if (GET_CODE (x
) == PLUS
)
2963 if (GET_CODE (x
) == SYMBOL_REF
)
2964 fputs (section_for_sym (x
), file
);
2969 if (x_code
== SUBREG
)
2971 machine_mode inner_mode
= GET_MODE (SUBREG_REG (x
));
2972 if (VECTOR_MODE_P (inner_mode
)
2973 && (GET_MODE_SIZE (mode
)
2974 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode
))))
2975 mode
= GET_MODE_INNER (inner_mode
);
2976 else if (split_mode_p (inner_mode
))
2977 mode
= maybe_split_mode (inner_mode
);
2981 fprintf (file
, "%s", nvptx_ptx_type_from_mode (mode
, code
== 't'));
2987 rtx inner_x
= SUBREG_REG (x
);
2988 machine_mode inner_mode
= GET_MODE (inner_x
);
2989 machine_mode split
= maybe_split_mode (inner_mode
);
2991 output_reg (file
, REGNO (inner_x
), split
,
2993 ? GET_MODE_SIZE (inner_mode
) / 2
3000 nvptx_shuffle_kind kind
= (nvptx_shuffle_kind
) UINTVAL (x
);
3001 /* Same order as nvptx_shuffle_kind. */
3002 static const char *const kinds
[] =
3003 {".up", ".down", ".bfly", ".idx"};
3004 fputs (kinds
[kind
], file
);
3009 fprintf (file
, "%d", GET_MODE_BITSIZE (mode
));
3013 fprintf (file
, "@");
3017 fprintf (file
, "@!");
3021 if (INTVAL (x
) == 0)
3022 fprintf (file
, "!");
3026 mode
= GET_MODE (XEXP (x
, 0));
3030 fputs (".eq", file
);
3033 if (FLOAT_MODE_P (mode
))
3034 fputs (".neu", file
);
3036 fputs (".ne", file
);
3040 fputs (".le", file
);
3044 fputs (".ge", file
);
3048 fputs (".lt", file
);
3052 fputs (".gt", file
);
3055 fputs (".ne", file
);
3058 fputs (".equ", file
);
3061 fputs (".leu", file
);
3064 fputs (".geu", file
);
3067 fputs (".ltu", file
);
3070 fputs (".gtu", file
);
3073 fputs (".nan", file
);
3076 fputs (".num", file
);
3081 if (FLOAT_MODE_P (mode
)
3082 || x_code
== EQ
|| x_code
== NE
3083 || x_code
== GEU
|| x_code
== GTU
3084 || x_code
== LEU
|| x_code
== LTU
)
3085 fputs (nvptx_ptx_type_from_mode (mode
, true), file
);
3087 fprintf (file
, ".s%d", GET_MODE_BITSIZE (mode
));
3095 rtx inner_x
= SUBREG_REG (x
);
3096 machine_mode inner_mode
= GET_MODE (inner_x
);
3097 machine_mode split
= maybe_split_mode (inner_mode
);
3099 if (VECTOR_MODE_P (inner_mode
)
3100 && (GET_MODE_SIZE (mode
)
3101 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode
))))
3103 output_reg (file
, REGNO (inner_x
), VOIDmode
);
3104 fprintf (file
, ".%s", SUBREG_BYTE (x
) == 0 ? "x" : "y");
3106 else if (split_mode_p (inner_mode
)
3107 && (GET_MODE_SIZE (inner_mode
) == GET_MODE_SIZE (mode
)))
3108 output_reg (file
, REGNO (inner_x
), split
);
3110 output_reg (file
, REGNO (inner_x
), split
, SUBREG_BYTE (x
));
3115 output_reg (file
, REGNO (x
), maybe_split_mode (mode
));
3120 nvptx_print_address_operand (file
, XEXP (x
, 0), mode
);
3125 output_addr_const (file
, x
);
3131 /* We could use output_addr_const, but that can print things like
3132 "x-8", which breaks ptxas. Need to ensure it is output as
3134 nvptx_print_address_operand (file
, x
, VOIDmode
);
3139 real_to_target (vals
, CONST_DOUBLE_REAL_VALUE (x
), mode
);
3140 vals
[0] &= 0xffffffff;
3141 vals
[1] &= 0xffffffff;
3143 fprintf (file
, "0f%08lx", vals
[0]);
3145 fprintf (file
, "0d%08lx%08lx", vals
[1], vals
[0]);
3150 unsigned n
= CONST_VECTOR_NUNITS (x
);
3151 fprintf (file
, "{ ");
3152 for (unsigned i
= 0; i
< n
; ++i
)
3155 fprintf (file
, ", ");
3157 rtx elem
= CONST_VECTOR_ELT (x
, i
);
3158 output_addr_const (file
, elem
);
3160 fprintf (file
, " }");
3165 output_addr_const (file
, x
);
3170 /* Record replacement regs used to deal with subreg operands. */
3173 rtx replacement
[MAX_RECOG_OPERANDS
];
3179 /* Allocate or reuse a replacement in R and return the rtx. */
3182 get_replacement (struct reg_replace
*r
)
3184 if (r
->n_allocated
== r
->n_in_use
)
3185 r
->replacement
[r
->n_allocated
++] = gen_reg_rtx (r
->mode
);
3186 return r
->replacement
[r
->n_in_use
++];
3189 /* Clean up subreg operands. In ptx assembly, everything is typed, and
3190 the presence of subregs would break the rules for most instructions.
3191 Replace them with a suitable new register of the right size, plus
3192 conversion copyin/copyout instructions. */
3195 nvptx_reorg_subreg (void)
3197 struct reg_replace qiregs
, hiregs
, siregs
, diregs
;
3198 rtx_insn
*insn
, *next
;
3200 qiregs
.n_allocated
= 0;
3201 hiregs
.n_allocated
= 0;
3202 siregs
.n_allocated
= 0;
3203 diregs
.n_allocated
= 0;
3204 qiregs
.mode
= QImode
;
3205 hiregs
.mode
= HImode
;
3206 siregs
.mode
= SImode
;
3207 diregs
.mode
= DImode
;
3209 for (insn
= get_insns (); insn
; insn
= next
)
3211 next
= NEXT_INSN (insn
);
3212 if (!NONDEBUG_INSN_P (insn
)
3213 || asm_noperands (PATTERN (insn
)) >= 0
3214 || GET_CODE (PATTERN (insn
)) == USE
3215 || GET_CODE (PATTERN (insn
)) == CLOBBER
)
3218 qiregs
.n_in_use
= 0;
3219 hiregs
.n_in_use
= 0;
3220 siregs
.n_in_use
= 0;
3221 diregs
.n_in_use
= 0;
3222 extract_insn (insn
);
3223 enum attr_subregs_ok s_ok
= get_attr_subregs_ok (insn
);
3225 for (int i
= 0; i
< recog_data
.n_operands
; i
++)
3227 rtx op
= recog_data
.operand
[i
];
3228 if (GET_CODE (op
) != SUBREG
)
3231 rtx inner
= SUBREG_REG (op
);
3233 machine_mode outer_mode
= GET_MODE (op
);
3234 machine_mode inner_mode
= GET_MODE (inner
);
3237 && (GET_MODE_PRECISION (inner_mode
)
3238 >= GET_MODE_PRECISION (outer_mode
)))
3240 gcc_assert (SCALAR_INT_MODE_P (outer_mode
));
3241 struct reg_replace
*r
= (outer_mode
== QImode
? &qiregs
3242 : outer_mode
== HImode
? &hiregs
3243 : outer_mode
== SImode
? &siregs
3245 rtx new_reg
= get_replacement (r
);
3247 if (recog_data
.operand_type
[i
] != OP_OUT
)
3250 if (GET_MODE_PRECISION (inner_mode
)
3251 < GET_MODE_PRECISION (outer_mode
))
3256 rtx pat
= gen_rtx_SET (new_reg
,
3257 gen_rtx_fmt_e (code
, outer_mode
, inner
));
3258 emit_insn_before (pat
, insn
);
3261 if (recog_data
.operand_type
[i
] != OP_IN
)
3264 if (GET_MODE_PRECISION (inner_mode
)
3265 < GET_MODE_PRECISION (outer_mode
))
3270 rtx pat
= gen_rtx_SET (inner
,
3271 gen_rtx_fmt_e (code
, inner_mode
, new_reg
));
3272 emit_insn_after (pat
, insn
);
3274 validate_change (insn
, recog_data
.operand_loc
[i
], new_reg
, false);
3279 /* Return a SImode "master lane index" register for uniform-simt, allocating on
3283 nvptx_get_unisimt_master ()
3285 rtx
&master
= cfun
->machine
->unisimt_master
;
3286 return master
? master
: master
= gen_reg_rtx (SImode
);
3289 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
3292 nvptx_get_unisimt_predicate ()
3294 rtx
&pred
= cfun
->machine
->unisimt_predicate
;
3295 return pred
? pred
: pred
= gen_reg_rtx (BImode
);
3299 nvptx_get_unisimt_outside_simt_predicate ()
3301 rtx
&pred
= cfun
->machine
->unisimt_outside_simt_predicate
;
3302 return pred
? pred
: pred
= gen_reg_rtx (BImode
);
3305 /* Return true if given call insn references one of the functions provided by
3306 the CUDA runtime: malloc, free, vprintf. */
3309 nvptx_call_insn_is_syscall_p (rtx_insn
*insn
)
3311 rtx pat
= PATTERN (insn
);
3312 gcc_checking_assert (GET_CODE (pat
) == PARALLEL
);
3313 pat
= XVECEXP (pat
, 0, 0);
3314 if (GET_CODE (pat
) == SET
)
3315 pat
= SET_SRC (pat
);
3316 gcc_checking_assert (GET_CODE (pat
) == CALL
3317 && GET_CODE (XEXP (pat
, 0)) == MEM
);
3318 rtx addr
= XEXP (XEXP (pat
, 0), 0);
3319 if (GET_CODE (addr
) != SYMBOL_REF
)
3321 const char *name
= XSTR (addr
, 0);
3322 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
3323 references with forced assembler name refer to PTX syscalls. For vprintf,
3324 accept both normal and forced-assembler-name references. */
3325 return (!strcmp (name
, "vprintf") || !strcmp (name
, "*vprintf")
3326 || !strcmp (name
, "*malloc")
3327 || !strcmp (name
, "*free"));
3330 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
3331 propagate its value from lane MASTER to current lane. */
3334 nvptx_unisimt_handle_set (rtx set
, rtx_insn
*insn
, rtx master
)
3337 if (GET_CODE (set
) == SET
3338 && REG_P (reg
= SET_DEST (set
))
3339 && find_reg_note (insn
, REG_UNUSED
, reg
) == NULL_RTX
)
3341 emit_insn_after (nvptx_gen_shuffle (reg
, reg
, master
, SHUFFLE_IDX
),
3350 predicate_insn (rtx_insn
*insn
, rtx pred
)
3352 rtx pat
= PATTERN (insn
);
3353 pred
= gen_rtx_NE (BImode
, pred
, const0_rtx
);
3354 pat
= gen_rtx_COND_EXEC (VOIDmode
, pred
, pat
);
3355 bool changed_p
= validate_change (insn
, &PATTERN (insn
), pat
, false);
3356 gcc_assert (changed_p
);
3359 /* Adjust code for uniform-simt code generation variant by making atomics and
3360 "syscalls" conditionally executed, and inserting shuffle-based propagation
3361 for registers being set. */
3364 nvptx_reorg_uniform_simt ()
3366 rtx_insn
*insn
, *next
;
3368 for (insn
= get_insns (); insn
; insn
= next
)
3370 next
= NEXT_INSN (insn
);
3372 /* Skip NOTE, USE, etc. */
3373 if (!INSN_P (insn
) || recog_memoized (insn
) == -1)
3376 if (CALL_P (insn
) && nvptx_call_insn_is_syscall_p (insn
))
3378 /* Handle syscall. */
3380 else if (get_attr_atomic (insn
))
3382 /* Handle atomic insn. */
3387 rtx pat
= PATTERN (insn
);
3388 rtx master
= nvptx_get_unisimt_master ();
3389 bool shuffle_p
= false;
3390 switch (GET_CODE (pat
))
3393 for (int i
= 0; i
< XVECLEN (pat
, 0); i
++)
3395 |= nvptx_unisimt_handle_set (XVECEXP (pat
, 0, i
), insn
, master
);
3398 shuffle_p
|= nvptx_unisimt_handle_set (pat
, insn
, master
);
3404 if (shuffle_p
&& TARGET_PTX_6_0
)
3406 /* The shuffle is a sync, so uniformity is guaranteed. */
3412 gcc_assert (!shuffle_p
);
3413 /* Emit after the insn, to guarantee uniformity. */
3414 emit_insn_after (gen_nvptx_warpsync (), insn
);
3418 /* Emit after the insn (and before the shuffle, if there are any)
3419 to check uniformity. */
3420 emit_insn_after (gen_nvptx_uniform_warp_check (), insn
);
3424 rtx pred
= nvptx_get_unisimt_predicate ();
3425 predicate_insn (insn
, pred
);
3428 for (rtx_insn
*post
= NEXT_INSN (insn
); post
!= next
;
3429 post
= NEXT_INSN (post
))
3431 if (pred
== NULL_RTX
)
3432 pred
= nvptx_get_unisimt_outside_simt_predicate ();
3433 predicate_insn (post
, pred
);
/* Offloading function attributes.  */

struct offload_attrs
{
  unsigned mask;       /* GOMP_DIM mask of partitioned axes.  */
  int num_gangs;
  int num_workers;
  int vector_length;
};
3448 /* Define entries for cfun->machine->axis_dim. */
3450 #define MACH_VECTOR_LENGTH 0
3451 #define MACH_MAX_WORKERS 1
3453 static void populate_offload_attrs (offload_attrs
*oa
);
3456 init_axis_dim (void)
3461 populate_offload_attrs (&oa
);
3463 if (oa
.num_workers
== 0)
3464 max_workers
= PTX_CTA_SIZE
/ oa
.vector_length
;
3466 max_workers
= oa
.num_workers
;
3468 cfun
->machine
->axis_dim
[MACH_VECTOR_LENGTH
] = oa
.vector_length
;
3469 cfun
->machine
->axis_dim
[MACH_MAX_WORKERS
] = max_workers
;
3470 cfun
->machine
->axis_dim_init_p
= true;
3473 static int ATTRIBUTE_UNUSED
3474 nvptx_mach_max_workers ()
3476 if (!cfun
->machine
->axis_dim_init_p
)
3478 return cfun
->machine
->axis_dim
[MACH_MAX_WORKERS
];
3481 static int ATTRIBUTE_UNUSED
3482 nvptx_mach_vector_length ()
3484 if (!cfun
->machine
->axis_dim_init_p
)
3486 return cfun
->machine
->axis_dim
[MACH_VECTOR_LENGTH
];
3489 /* Loop structure of the function. The entire function is described as
3491 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'. */
3495 /* Parent parallel. */
3498 /* Next sibling parallel. */
3501 /* First child parallel. */
3504 /* Partitioning mask of the parallel. */
3507 /* Partitioning used within inner parallels. */
3508 unsigned inner_mask
;
3510 /* Location of parallel forked and join. The forked is the first
3511 block in the parallel and the join is the first block after of
3513 basic_block forked_block
;
3514 basic_block join_block
;
3516 rtx_insn
*forked_insn
;
3517 rtx_insn
*join_insn
;
3519 rtx_insn
*fork_insn
;
3520 rtx_insn
*joining_insn
;
3522 /* Basic blocks in this parallel, but not in child parallels. The
3523 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3525 auto_vec
<basic_block
> blocks
;
3528 parallel (parallel
*parent
, unsigned mode
);
3532 /* Constructor links the new parallel into it's parent's chain of
3535 parallel::parallel (parallel
*parent_
, unsigned mask_
)
3536 :parent (parent_
), next (0), inner (0), mask (mask_
), inner_mask (0)
3538 forked_block
= join_block
= 0;
3539 forked_insn
= join_insn
= 0;
3540 fork_insn
= joining_insn
= 0;
3544 next
= parent
->inner
;
3545 parent
->inner
= this;
3549 parallel::~parallel ()
3555 /* Map of basic blocks to insns */
3556 typedef hash_map
<basic_block
, rtx_insn
*> bb_insn_map_t
;
3558 /* A tuple of an insn of interest and the BB in which it resides. */
3559 typedef std::pair
<rtx_insn
*, basic_block
> insn_bb_t
;
3560 typedef auto_vec
<insn_bb_t
> insn_bb_vec_t
;
3562 /* Split basic blocks such that each forked and join unspecs are at
3563 the start of their basic blocks. Thus afterwards each block will
3564 have a single partitioning mode. We also do the same for return
3565 insns, as they are executed by every thread. Return the
3566 partitioning mode of the function as a whole. Populate MAP with
3567 head and tail blocks. We also clear the BB visited flag, which is
3568 used when finding partitions. */
3569 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'. */
3572 nvptx_split_blocks (bb_insn_map_t
*map
)
3574 insn_bb_vec_t worklist
;
3578 /* Locate all the reorg instructions of interest. */
3579 FOR_ALL_BB_FN (block
, cfun
)
3581 bool seen_insn
= false;
3583 /* Clear visited flag, for use by parallel locator */
3584 block
->flags
&= ~BB_VISITED
;
3586 FOR_BB_INSNS (block
, insn
)
3590 switch (recog_memoized (insn
))
3595 case CODE_FOR_nvptx_forked
:
3596 case CODE_FOR_nvptx_join
:
3599 case CODE_FOR_return
:
3600 /* We also need to split just before return insns, as
3601 that insn needs executing by all threads, but the
3602 block it is in probably does not. */
3607 /* We've found an instruction that must be at the start of
3608 a block, but isn't. Add it to the worklist. */
3609 worklist
.safe_push (insn_bb_t (insn
, block
));
3611 /* It was already the first instruction. Just add it to
3613 map
->get_or_insert (block
) = insn
;
3618 /* Split blocks on the worklist. */
3621 basic_block remap
= 0;
3622 for (ix
= 0; worklist
.iterate (ix
, &elt
); ix
++)
3624 if (remap
!= elt
->second
)
3626 block
= elt
->second
;
3630 /* Split block before insn. The insn is in the new block */
3631 edge e
= split_block (block
, PREV_INSN (elt
->first
));
3634 map
->get_or_insert (block
) = elt
->first
;
3638 /* Return true if MASK contains parallelism that requires shared
3639 memory to broadcast. */
3642 nvptx_needs_shared_bcast (unsigned mask
)
3644 bool worker
= mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
);
3645 bool large_vector
= (mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
3646 && nvptx_mach_vector_length () != PTX_WARP_SIZE
;
3648 return worker
|| large_vector
;
3651 /* BLOCK is a basic block containing a head or tail instruction.
3652 Locate the associated prehead or pretail instruction, which must be
3653 in the single predecessor block. */
3656 nvptx_discover_pre (basic_block block
, int expected
)
3658 gcc_assert (block
->preds
->length () == 1);
3659 basic_block pre_block
= (*block
->preds
)[0]->src
;
3662 for (pre_insn
= BB_END (pre_block
); !INSN_P (pre_insn
);
3663 pre_insn
= PREV_INSN (pre_insn
))
3664 gcc_assert (pre_insn
!= BB_HEAD (pre_block
));
3666 gcc_assert (recog_memoized (pre_insn
) == expected
);
3670 /* Dump this parallel and all its inner parallels. */
3671 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'. */
3674 nvptx_dump_pars (parallel
*par
, unsigned depth
)
3676 fprintf (dump_file
, "%u: mask %d head=%d, tail=%d\n",
3678 par
->forked_block
? par
->forked_block
->index
: -1,
3679 par
->join_block
? par
->join_block
->index
: -1);
3681 fprintf (dump_file
, " blocks:");
3684 for (unsigned ix
= 0; par
->blocks
.iterate (ix
, &block
); ix
++)
3685 fprintf (dump_file
, " %d", block
->index
);
3686 fprintf (dump_file
, "\n");
3688 nvptx_dump_pars (par
->inner
, depth
+ 1);
3691 nvptx_dump_pars (par
->next
, depth
);
3694 /* If BLOCK contains a fork/join marker, process it to create or
3695 terminate a loop structure. Add this block to the current loop,
3696 and then walk successor blocks. */
3697 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'. */
3700 nvptx_find_par (bb_insn_map_t
*map
, parallel
*par
, basic_block block
)
3702 if (block
->flags
& BB_VISITED
)
3704 block
->flags
|= BB_VISITED
;
3706 if (rtx_insn
**endp
= map
->get (block
))
3708 rtx_insn
*end
= *endp
;
3710 /* This is a block head or tail, or return instruction. */
3711 switch (recog_memoized (end
))
3713 case CODE_FOR_return
:
3714 /* Return instructions are in their own block, and we
3715 don't need to do anything more. */
3718 case CODE_FOR_nvptx_forked
:
3719 /* Loop head, create a new inner loop and add it into
3720 our parent's child list. */
3722 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
3725 par
= new parallel (par
, mask
);
3726 par
->forked_block
= block
;
3727 par
->forked_insn
= end
;
3728 if (nvptx_needs_shared_bcast (mask
))
3730 = nvptx_discover_pre (block
, CODE_FOR_nvptx_fork
);
3734 case CODE_FOR_nvptx_join
:
3735 /* A loop tail. Finish the current loop and return to
3738 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
3740 gcc_assert (par
->mask
== mask
);
3741 gcc_assert (par
->join_block
== NULL
);
3742 par
->join_block
= block
;
3743 par
->join_insn
= end
;
3744 if (nvptx_needs_shared_bcast (mask
))
3746 = nvptx_discover_pre (block
, CODE_FOR_nvptx_joining
);
3757 /* Add this block onto the current loop's list of blocks. */
3758 par
->blocks
.safe_push (block
);
3760 /* This must be the entry block. Create a NULL parallel. */
3761 par
= new parallel (0, 0);
3763 /* Walk successor blocks. */
3767 FOR_EACH_EDGE (e
, ei
, block
->succs
)
3768 nvptx_find_par (map
, par
, e
->dest
);
3773 /* DFS walk the CFG looking for fork & join markers. Construct
3774 loop structures as we go. MAP is a mapping of basic blocks
3775 to head & tail markers, discovered when splitting blocks. This
3776 speeds up the discovery. We rely on the BB visited flag having
3777 been cleared when splitting blocks. */
3778 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'. */
nvptx_discover_pars (bb_insn_map_t *map)
{
  basic_block block;

  /* Mark exit blocks as visited, so the DFS terminates there.  */
  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
  block->flags |= BB_VISITED;

  /* And entry block as not, so the walk starts from it.  */
  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
  block->flags &= ~BB_VISITED;

  /* Recursively discover the parallel (fork/join) structure rooted at
     the entry block.  */
  parallel *par = nvptx_find_par (map, 0, block);

  if (dump_file)
    {
      fprintf (dump_file, "\nLoops\n");
      nvptx_dump_pars (par, 0);
      fprintf (dump_file, "\n");
    }

  return par;
}
3805 /* Analyse a group of BBs within a partitioned region and create N
3806 Single-Entry-Single-Exit regions. Some of those regions will be
3807 trivial ones consisting of a single BB. The blocks of a
3808 partitioned region might form a set of disjoint graphs -- because
3809 the region encloses a differently partitioned sub region.
3811 We use the linear time algorithm described in 'Finding Regions Fast:
3812 Single Entry Single Exit and control Regions in Linear Time'
3813 Johnson, Pearson & Pingali. That algorithm deals with complete
3814 CFGs, where a back edge is inserted from END to START, and thus the
3815 problem becomes one of finding equivalent loops.
3817 In this case we have a partial CFG. We complete it by redirecting
3818 any incoming edge to the graph to be from an arbitrary external BB,
3819 and similarly redirecting any outgoing edge to be to that BB.
3820 Thus we end up with a closed graph.
3822 The algorithm works by building a spanning tree of an undirected
3823 graph and keeping track of back edges from nodes further from the
3824 root in the tree to nodes nearer to the root in the tree. In the
3825 description below, the root is up and the tree grows downwards.
3827 We avoid having to deal with degenerate back-edges to the same
3828 block, by splitting each BB into 3 -- one for input edges, one for
3829 the node itself and one for the output edges. Such back edges are
3830 referred to as 'Brackets'. Cycle equivalent nodes will have the
3831 same set of brackets.
3833 Determining bracket equivalency is done by maintaining a list of
3834 brackets in such a manner that the list length and final bracket
3835 uniquely identify the set.
3837 We use coloring to mark all BBs with cycle equivalency with the
3838 same color. This is the output of the 'Finding Regions Fast'
3839 algorithm. Notice it doesn't actually find the set of nodes within
3840 a particular region, just unordered sets of nodes that are the
3841 entries and exits of SESE regions.
3843 After determining cycle equivalency, we need to find the minimal
3844 set of SESE regions. Do this with a DFS coloring walk of the
3845 complete graph. We're either 'looking' or 'coloring'. When
3846 looking, and we're in the subgraph, we start coloring the color of
3847 the current node, and remember that node as the start of the
3848 current color's SESE region. Every time we go to a new node, we
3849 decrement the count of nodes with that color. If it reaches zero,
3850 we remember that node as the end of the current color's SESE region
3851 and return to 'looking'. Otherwise we color the node the current
3854 This way we end up with coloring the inside of non-trivial SESE
3855 regions with the color of that region. */
/* A pair of BBs.  We use this to represent SESE regions.  */
typedef std::pair<basic_block, basic_block> bb_pair_t;
typedef auto_vec<bb_pair_t> bb_pair_vec_t;

/* A node in the undirected CFG.  The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
typedef std::pair<basic_block, int> pseudo_node_t;
/* A bracket indicates an edge towards the root of the spanning tree of the
   undirected graph.  Each bracket has a color, determined
   from the current set of brackets.  */

struct bracket
{
  pseudo_node_t back; /* Back target */

  /* Current color and size of set.  ~0u means 'not yet assigned'.  */
  unsigned color;
  unsigned size;

  bracket (pseudo_node_t back_)
  : back (back_), color (~0u), size (~0u)
  {
  }

  /* Return this bracket's color, assigning a fresh one (and a fresh
     slot in COLOR_COUNTS) the first time the bracket set's LENGTH
     changes.  COLOR_COUNTS tracks how many nodes carry each color.  */
  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
  {
    if (length != size)
      {
	size = length;
	color = color_counts.length ();
	color_counts.quick_push (0);
      }
    color_counts[color]++;
    return color;
  }
};

typedef auto_vec<bracket> bracket_vec_t;
/* Basic block info for finding SESE regions.  Hangs off each BB's aux
   pointer (see BB_SET_SESE/BB_GET_SESE below).  */

struct bb_sese
{
  int node;  /* Node number in spanning tree.  */
  int parent; /* Parent node number.  */

  /* The algorithm splits each node A into Ai, A', Ao.  The incoming
     edges arrive at pseudo-node Ai and the outgoing edges leave at
     pseudo-node Ao.  We have to remember which way we arrived at a
     particular node when generating the spanning tree.  dir > 0 means
     we arrived at Ai, dir < 0 means we arrived at Ao.  */
  int dir;

  /* Lowest numbered pseudo-node reached via a backedge from this
     node, or any descendant.  */
  pseudo_node_t high;

  int color;  /* Cycle-equivalence color  */

  /* Stack of brackets for this node.  */
  bracket_vec_t brackets;

  bb_sese (unsigned node_, unsigned p, int dir_)
  :node (node_), parent (p), dir (dir_)
  {
  }
  ~bb_sese ();

  /* Push a bracket ending at BACK.  */
  void push (const pseudo_node_t &back)
  {
    if (dump_file)
      fprintf (dump_file, "Pushing backedge %d:%+d\n",
	       back.first ? back.first->index : 0, back.second);
    brackets.safe_push (bracket (back));
  }

  void append (bb_sese *child);
  void remove (const pseudo_node_t &);

  /* Set node's color from the top-of-stack bracket, given the current
     bracket-set length.  */
  void set_color (auto_vec<unsigned> &color_counts)
  {
    color = brackets.last ().get_color (color_counts, brackets.length ());
  }
};
/* Out-of-line destructor; the brackets auto_vec releases itself.  */
bb_sese::~bb_sese ()
{
}
/* Destructively append CHILD's brackets onto this node's bracket
   stack.  CHILD's own vector is either swapped away or left intact
   but logically consumed.  */

void
bb_sese::append (bb_sese *child)
{
  if (int len = child->brackets.length ())
    {
      int ix;

      if (dump_file)
	{
	  for (ix = 0; ix < len; ix++)
	    {
	      const pseudo_node_t &pseudo = child->brackets[ix].back;
	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
		       child->node, pseudo.first ? pseudo.first->index : 0,
		       pseudo.second);
	    }
	}
      if (!brackets.length ())
	/* We're empty: steal CHILD's vector wholesale.  */
	std::swap (brackets, child->brackets);
      else
	{
	  brackets.reserve (len);
	  for (ix = 0; ix < len; ix++)
	    brackets.quick_push (child->brackets[ix]);
	}
    }
}
/* Remove brackets that terminate at PSEUDO, compacting the remaining
   brackets downwards in a single pass.  */

void
bb_sese::remove (const pseudo_node_t &pseudo)
{
  unsigned removed = 0;
  int len = brackets.length ();

  for (int ix = 0; ix < len; ix++)
    {
      if (brackets[ix].back == pseudo)
	{
	  removed++;
	  if (dump_file)
	    fprintf (dump_file, "Removing backedge %d:%+d\n",
		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
	}
      else if (removed)
	/* Shift surviving entries down over the removed ones.  */
	brackets[ix - removed] = brackets[ix];
    }
  brackets.truncate (len - removed);
}
/* Accessors for BB's aux pointer, which stores its bb_sese info during
   SESE discovery.  */
#define BB_SET_SESE(B, S) ((B)->aux = (S))
#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
/* DFS walk creating SESE data structures.  Only cover nodes with
   BB_VISITED set.  Append discovered blocks to LIST.  We number in
   increments of 3 so that the above and below pseudo nodes can be
   implicitly numbered too.  Returns the next free node number.  */

static int
nvptx_sese_number (int n, int p, int dir, basic_block b,
		   auto_vec<basic_block> *list)
{
  if (BB_GET_SESE (b))
    /* Already numbered in an earlier visit.  */
    return n;

  if (dump_file)
    fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
	     b->index, n, p, dir);

  BB_SET_SESE (b, new bb_sese (n, p, dir));
  p = n;

  n += 3;
  list->quick_push (b);

  /* First walk the nodes on the 'other side' of this node, then walk
     the nodes on the same side.  */
  for (unsigned ix = 2; ix; ix--)
    {
      vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
      /* Byte offset of the far-end block within an edge, so the same
	 loop body handles both edge directions.  */
      size_t offset = (dir > 0 ? offsetof (edge_def, dest)
		       : offsetof (edge_def, src));
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target->flags & BB_VISITED)
	    n = nvptx_sese_number (n, p, dir, target, list);
	}
      dir = -dir;
    }
  return n;
}
/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
   EDGES are the outgoing edges and OFFSET is the offset to the src
   or dst block on the edges.  Maintains ME's bracket stack and 'high'
   watermark per the Johnson/Pearson/Pingali cycle-equivalence
   algorithm.  */

static void
nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
		   vec<edge, va_gc> *edges, size_t offset)
{
  edge e;
  edge_iterator ei;
  int hi_back = depth;
  pseudo_node_t node_back (nullptr, depth);
  int hi_child = depth;
  pseudo_node_t node_child (nullptr, depth);
  basic_block child = NULL;
  unsigned num_children = 0;
  /* Orientation of edges relative to how we entered ME.  */
  int usd = -dir * sese->dir;

  if (dump_file)
    fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
	     me->index, sese->node, dir);

  if (dir < 0)
    {
      /* This is the above pseudo-child.  It has the BB itself as an
	 additional child node.  */
      node_child = sese->high;
      hi_child = node_child.second;
      if (node_child.first)
	hi_child += BB_GET_SESE (node_child.first)->node;
      num_children++;
    }

  /* Examine each edge.
     - if it is a child (a) append its bracket list and (b) record
       whether it is the child with the highest reaching bracket.
     - if it is an edge to ancestor, record whether it's the highest
       reaching backlink.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);

      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
	    {
	      /* Child node.  Append its bracket list.  */
	      num_children++;
	      sese->append (t_sese);

	      /* Compare its hi value.  */
	      int t_hi = t_sese->high.second;

	      if (basic_block child_hi_block = t_sese->high.first)
		t_hi += BB_GET_SESE (child_hi_block)->node;

	      if (hi_child > t_hi)
		{
		  hi_child = t_hi;
		  node_child = t_sese->high;
		  child = target;
		}
	    }
	  else if (t_sese->node < sese->node + dir
		   && !(dir < 0 && sese->parent == t_sese->node))
	    {
	      /* Non-parental ancestor node -- a backlink.  */
	      int d = usd * t_sese->dir;
	      int back = t_sese->node + d;

	      if (hi_back > back)
		{
		  hi_back = back;
		  node_back = pseudo_node_t (target, d);
		}
	    }
	}
      else
	{ /* Fallen off graph, backlink to entry node.  */
	  hi_back = 0;
	  node_back = pseudo_node_t (nullptr, 0);
	}
    }

  /* Remove any brackets that terminate at this pseudo node.  */
  sese->remove (pseudo_node_t (me, dir));

  /* Now push any backlinks from this pseudo node.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);
      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->node < sese->node + dir
	      && !(dir < 0 && sese->parent == t_sese->node))
	    /* Non-parental ancestor node - backedge from me.  */
	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
	}
      else
	{
	  /* back edge to entry node */
	  sese->push (pseudo_node_t (nullptr, 0));
	}
    }

  /* If this node leads directly or indirectly to a no-return region of
     the graph, then fake a backedge to entry node.  */
  if (!sese->brackets.length () || !edges || !edges->length ())
    {
      hi_back = 0;
      node_back = pseudo_node_t (nullptr, 0);
      sese->push (node_back);
    }

  /* Record the highest reaching backedge from us or a descendant.  */
  sese->high = hi_back < hi_child ? node_back : node_child;

  if (num_children > 1)
    {
      /* There is more than one child -- this is a Y shaped piece of
	 spanning tree.  We have to insert a fake backedge from this
	 node to the highest ancestor reached by not-the-highest
	 reaching child.  Note that there may be multiple children
	 with backedges to the same highest node.  That's ok and we
	 insert the edge to that highest node.  */
      hi_child = depth;
      if (dir < 0 && child)
	{
	  node_child = sese->high;
	  hi_child = node_child.second;
	  if (node_child.first)
	    hi_child += BB_GET_SESE (node_child.first)->node;
	}

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target == child)
	    /* Ignore the highest child.  */
	    continue;

	  bb_sese *t_sese = BB_GET_SESE (target);
	  if (!t_sese)
	    continue;
	  if (t_sese->parent != sese->node)
	    /* Not a child.  */
	    continue;

	  /* Compare its hi value.  */
	  int t_hi = t_sese->high.second;

	  if (basic_block child_hi_block = t_sese->high.first)
	    t_hi += BB_GET_SESE (child_hi_block)->node;

	  if (hi_child > t_hi)
	    {
	      hi_child = t_hi;
	      node_child = t_sese->high;
	    }
	}

      sese->push (node_child);
    }
}
/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
   proceed to successors.  Set SESE entry and exit nodes of
   REGIONS.  COLORING < 0 means 'looking' (not inside a region).  */

static void
nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
		  basic_block block, int coloring)
{
  bb_sese *sese = BB_GET_SESE (block);

  if (block->flags & BB_VISITED)
    {
      /* If we've already encountered this block, either we must not
	 be coloring, or it must have been colored the current color.  */
      gcc_assert (coloring < 0 || (sese && coloring == sese->color));
      return;
    }

  block->flags |= BB_VISITED;

  if (sese)
    {
      if (coloring < 0 && color_counts[sese->color])
	{
	  /* Start coloring a region.  */
	  regions[sese->color].first = block;
	  coloring = sese->color;
	}

      if (!--color_counts[sese->color] && sese->color == coloring)
	{
	  /* Found final block of SESE region.  */
	  regions[sese->color].second = block;
	  coloring = -1;
	}
      else
	/* Color the node, so we can assert on revisiting the node
	   that the graph is indeed SESE.  */
	sese->color = coloring;
    }
  else
    /* Fallen off the subgraph, we cannot be coloring.  */
    gcc_assert (coloring < 0);

  /* Walk each successor block.  */
  if (block->succs && block->succs->length ())
    {
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, block->succs)
	nvptx_sese_color (color_counts, regions, e->dest, coloring);
    }
  else
    gcc_assert (coloring < 0);
}
/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
   end up with NULL entries in it.  Per-BB bb_sese structures are
   allocated during the solve and deleted before returning.  */

static void
nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
{
  basic_block block;
  int ix;

  /* First clear each BB of the whole function.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      block->flags &= ~BB_VISITED;
      BB_SET_SESE (block, 0);
    }

  /* Mark blocks in the function that are in this graph.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags |= BB_VISITED;

  /* Counts of nodes assigned to each color.  There cannot be more
     colors than blocks (and hopefully there will be fewer).  */
  auto_vec<unsigned> color_counts;
  color_counts.reserve (blocks.length ());

  /* Worklist of nodes in the spanning tree.  Again, there cannot be
     more nodes in the tree than blocks (there will be fewer if the
     CFG of blocks is disjoint).  */
  auto_vec<basic_block> spanlist;
  spanlist.reserve (blocks.length ());

  /* Make sure every block has its cycle class determined.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    {
      if (BB_GET_SESE (block))
	/* We already met this block in an earlier graph solve.  */
	continue;

      if (dump_file)
	fprintf (dump_file, "Searching graph starting at %d\n", block->index);

      /* Number the nodes reachable from block initial DFS order.  */
      int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);

      /* Now walk in reverse DFS order to find cycle equivalents.  */
      while (spanlist.length ())
	{
	  block = spanlist.pop ();
	  bb_sese *sese = BB_GET_SESE (block);

	  /* Do the pseudo node below.  */
	  nvptx_sese_pseudo (block, sese, depth, +1,
			     sese->dir > 0 ? block->succs : block->preds,
			     (sese->dir > 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	  sese->set_color (color_counts);
	  /* Do the pseudo node above.  */
	  nvptx_sese_pseudo (block, sese, depth, -1,
			     sese->dir < 0 ? block->succs : block->preds,
			     (sese->dir < 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	}
      if (dump_file)
	fprintf (dump_file, "\n");
    }

  if (dump_file)
    {
      unsigned count;
      const char *comma = "";

      fprintf (dump_file, "Found %d cycle equivalents\n",
	       color_counts.length ());
      for (ix = 0; color_counts.iterate (ix, &count); ix++)
	{
	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);

	  comma = "";
	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
	    if (BB_GET_SESE (block)->color == ix)
	      {
		block->flags |= BB_VISITED;
		fprintf (dump_file, "%s%d", comma, block->index);
		comma = ",";
	      }
	  fprintf (dump_file, "}");
	  comma = ", ";
	}
      fprintf (dump_file, "\n");
    }

  /* Now we've colored every block in the subgraph.  We now need to
     determine the minimal set of SESE regions that cover that
     subgraph.  Do this with a DFS walk of the complete function.
     During the walk we're either 'looking' or 'coloring'.  When we
     reach the last node of a particular color, we stop coloring and
     return to looking.  */

  /* There cannot be more SESE regions than colors.  */
  regions.reserve (color_counts.length ());
  for (ix = color_counts.length (); ix--;)
    regions.quick_push (bb_pair_t (0, 0));

  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags &= ~BB_VISITED;

  nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);

  if (dump_file)
    {
      const char *comma = "";
      int len = regions.length ();

      fprintf (dump_file, "SESE regions:");
      for (ix = 0; ix != len; ix++)
	{
	  basic_block from = regions[ix].first;
	  basic_block to = regions[ix].second;

	  if (from)
	    {
	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
	      if (to != from)
		fprintf (dump_file, "->%d", to->index);

	      int color = BB_GET_SESE (from)->color;

	      /* Print the blocks within the region (excluding ends).  */
	      FOR_EACH_BB_FN (block, cfun)
		{
		  bb_sese *sese = BB_GET_SESE (block);

		  if (sese && sese->color == color
		      && block != from && block != to)
		    fprintf (dump_file, ".%d", block->index);
		}
	      fprintf (dump_file, "}");
	    }
	  comma = ",";
	}
      fprintf (dump_file, "\n\n");
    }

  /* Release the per-BB SESE structures allocated by
     nvptx_sese_number.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    delete BB_GET_SESE (block);
}
/* Propagate live state at the start of a partitioned region.  IS_CALL
   indicates whether the propagation is for a (partitioned) call
   instruction.  BLOCK provides the live register information, and
   might not contain INSN.  Propagation is inserted just after INSN.  RW
   indicates whether we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for single
   register.  DATA is user data.

   Returns true if we didn't emit any instructions.

   We propagate the live register set for non-calls and the entire
   frame for calls and non-calls.  We could do better by (a)
   propagating just the live set that is used within the partitioned
   regions and (b) only propagating stack entries that are used.  The
   latter might be quite hard to determine.  */

typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);

static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
{
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;
  bool empty = true;

  /* Copy the frame array.  */
  HOST_WIDE_INT fs = get_frame_size ();
  if (fs && !is_call)
    {
      rtx tmp = gen_reg_rtx (DImode);
      rtx idx = NULL_RTX;
      rtx ptr = gen_reg_rtx (Pmode);
      rtx pred = NULL_RTX;
      rtx_code_label *label = NULL;

      empty = false;
      /* The frame size might not be DImode compatible, but the frame
	 array's declaration will be.  So it's ok to round up here.  */
      fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
      /* Detect single iteration loop.  */
      if (fs == 1)
	fs = 0;

      start_sequence ();
      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
      if (fs)
	{
	  idx = gen_reg_rtx (SImode);
	  pred = gen_reg_rtx (BImode);
	  label = gen_label_rtx ();

	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
	  /* Allow worker function to initialize anything needed.  */
	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
	  if (init)
	    emit_insn (init);
	  emit_label (label);
	  LABEL_NUSES (label)++;
	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
	}
      if (rw & PM_read)
	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
      emit_insn (fn (tmp, rw, fs, data, vector));
      if (rw & PM_write)
	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
      if (fs)
	{
	  /* Loop back-edge: continue while IDX != 0, advancing PTR by
	     one DImode word per iteration.  */
	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
	  emit_insn (gen_br_true_uni (pred, label));
	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
	  if (fini)
	    emit_insn (fini);
	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
	}
      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
      rtx cpy = get_insns ();
      end_sequence ();
      insn = emit_insn_after (cpy, insn);
    }

  if (!is_call)
    /* Copy live registers.  */
    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
      {
	rtx reg = regno_reg_rtx[ix];

	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
	  {
	    rtx bcast = fn (reg, rw, 0, data, vector);

	    insn = emit_insn_after (bcast, insn);
	    empty = false;
	  }
      }
  return empty;
}
/* Worker for nvptx_warp_propagate.  Emits a warp shuffle broadcast for
   REG on real read/write phases; loop begin/end phases need nothing.  */

static rtx
warp_prop_gen (rtx reg, propagate_mask pm,
	       unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
	       bool ARG_UNUSED (vector))
{
  if (!(pm & PM_read_write))
    return NULL_RTX;

  return nvptx_gen_warp_bcast (reg);
}
/* Propagate state that is live at start of BLOCK across the vectors
   of a single warp.  Propagation is inserted just after INSN.
   IS_CALL and return as for nvptx_propagate.  */

static bool
nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
{
  return nvptx_propagate (is_call, block, insn, PM_read_write,
			  warp_prop_gen, 0, false);
}
/* Worker for nvptx_shared_propagate.  DATA_ is a broadcast_data_t
   tracking the shared-memory buffer base/offset/pointer across the
   propagation loop.  */

static rtx
shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
		 bool vector)
{
  broadcast_data_t *data = (broadcast_data_t *)data_;

  if (pm & PM_loop_begin)
    {
      /* Starting a loop, initialize pointer.  */
      unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;

      oacc_bcast_align = MAX (oacc_bcast_align, align);
      data->offset = ROUND_UP (data->offset, align);

      data->ptr = gen_reg_rtx (Pmode);

      return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
    }
  else if (pm & PM_loop_end)
    {
      /* Loop finished; release the iteration pointer.  */
      rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
      data->ptr = NULL_RTX;
      return clobber;
    }
  else
    return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
}
/* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill).  Sequence is inserted just after
   INSN.  IS_CALL and return as for nvptx_propagate.  */

static bool
nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
			rtx_insn *insn, bool vector)
{
  broadcast_data_t data;

  data.base = gen_reg_rtx (Pmode);
  data.offset = 0;
  data.ptr = NULL_RTX;

  bool empty = nvptx_propagate (is_call, block, insn,
				pre_p ? PM_read : PM_write, shared_prop_gen,
				&data, vector);
  /* Offset advanced iff something was emitted.  */
  gcc_assert (empty == !data.offset);
  if (data.offset)
    {
      rtx bcast_sym = oacc_bcast_sym;

      /* Stuff was emitted, initialize the base pointer now.  */
      if (vector && nvptx_mach_max_workers () > 1)
	{
	  if (!cfun->machine->bcast_partition)
	    {
	      /* It would be nice to place this register in
		 DATA_AREA_SHARED.  */
	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
	    }
	  if (!cfun->machine->sync_bar)
	    cfun->machine->sync_bar = gen_reg_rtx (SImode);

	  bcast_sym = cfun->machine->bcast_partition;
	}

      rtx init = gen_rtx_SET (data.base, bcast_sym);
      emit_insn_after (init, insn);

      /* Grow the broadcast buffer: one partition per worker (plus one)
	 when vector length exceeds a warp, else a single partition.  */
      unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
      unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			   ? nvptx_mach_max_workers () + 1
			   : 1);

      oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
      oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
    }
  return empty;
}
/* Emit a CTA-level synchronization barrier.  LOCK is the barrier number,
   which is an integer or a register.  THREADS is the number of threads
   controlled by the barrier.  */

static rtx
nvptx_cta_sync (rtx lock, int threads)
{
  return gen_nvptx_barsync (lock, GEN_INT (threads));
}
#if WORKAROUND_PTXJIT_BUG
/* Return first real insn in BB, or return NULL_RTX if BB does not contain
   real insns.  */

static rtx_insn *
bb_first_real_insn (basic_block bb)
{
  rtx_insn *insn;

  /* Find first insn of from block.  */
  FOR_BB_INSNS (bb, insn)
    if (INSN_P (insn))
      return insn;

  return 0;
}
#endif
/* Return true if INSN needs neutering.  Partitioning markers and
   barriers must execute in all threads, so they are exempt.  */

static bool
needs_neutering_p (rtx_insn *insn)
{
  if (!INSN_P (insn))
    return false;

  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
    case CODE_FOR_nvptx_barsync:
      return false;
    default:
      return true;
    }
}
/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.
   Walks forward from the head of FROM (following a single fall-through
   successor if the jumps weren't found yet), asserting that neutering
   jumps and labels nest properly and no barrier sits inside a neutered
   range.  */

static void
verify_neutering_jumps (basic_block from,
			rtx_insn *vector_jump, rtx_insn *worker_jump,
			rtx_insn *vector_label, rtx_insn *worker_label)
{
  basic_block bb = from;
  rtx_insn *insn = BB_HEAD (bb);
  bool seen_worker_jump = false;
  bool seen_vector_jump = false;
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  bool worker_neutered = false;
  bool vector_neutered = false;
  while (true)
    {
      if (insn == worker_jump)
	{
	  seen_worker_jump = true;
	  worker_neutered = true;
	  gcc_assert (!vector_neutered);
	}
      else if (insn == vector_jump)
	{
	  seen_vector_jump = true;
	  vector_neutered = true;
	}
      else if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (worker_neutered);
	  worker_neutered = false;
	}
      else if (insn == vector_label)
	{
	  seen_vector_label = true;
	  gcc_assert (vector_neutered);
	  vector_neutered = false;
	}
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    /* Barriers must be executed by every thread, hence never
	       inside a neutered range.  */
	    gcc_assert (!vector_neutered && !worker_neutered);
	    break;
	  default:
	    break;
	  }

      if (insn != BB_END (bb))
	insn = NEXT_INSN (insn);
      else if (JUMP_P (insn) && single_succ_p (bb)
	       && !seen_vector_jump && !seen_worker_jump)
	{
	  /* Keep looking in the single fall-through successor.  */
	  bb = single_succ (bb);
	  insn = BB_HEAD (bb);
	}
      else
	break;
    }

  gcc_assert (!(vector_jump && !seen_vector_jump));
  gcc_assert (!(worker_jump && !seen_worker_jump));

  if (seen_vector_label || seen_worker_label)
    {
      gcc_assert (!(vector_label && !seen_vector_label));
      gcc_assert (!(worker_label && !seen_worker_label));
    }
}
/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  Walks
   backwards from the end of TO: the vector label must appear before
   (i.e. be found after, scanning backwards) the worker label, and no
   barrier may precede either within the block.  */

static void
verify_neutering_labels (basic_block to, rtx_insn *vector_label,
			 rtx_insn *worker_label)
{
  basic_block bb = to;
  rtx_insn *insn = BB_END (bb);
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  while (true)
    {
      if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (!seen_vector_label);
	}
      else if (insn == vector_label)
	seen_vector_label = true;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    gcc_assert (!seen_vector_label && !seen_worker_label);
	    break;
	  }

      if (insn != BB_HEAD (bb))
	insn = PREV_INSN (insn);
      else
	break;
    }

  gcc_assert (!(vector_label && !seen_vector_label));
  gcc_assert (!(worker_label && !seen_worker_label));
}
4795 /* Single neutering according to MASK. FROM is the incoming block and
4796 TO is the outgoing block. These may be the same block. Insert at
4799 if (tid.<axis>) goto end.
4801 and insert before ending branch of TO (if there is such an insn):
4804 <possibly-broadcast-cond>
4807 We currently only use different FROM and TO when skipping an entire
4808 loop. We could do more if we detected superblocks. */
4811 nvptx_single (unsigned mask
, basic_block from
, basic_block to
)
4813 rtx_insn
*head
= BB_HEAD (from
);
4814 rtx_insn
*tail
= BB_END (to
);
4815 unsigned skip_mask
= mask
;
4819 /* Find first insn of from block. */
4820 while (head
!= BB_END (from
) && !needs_neutering_p (head
))
4821 head
= NEXT_INSN (head
);
4826 if (!(JUMP_P (head
) && single_succ_p (from
)))
4829 basic_block jump_target
= single_succ (from
);
4830 if (!single_pred_p (jump_target
))
4834 head
= BB_HEAD (from
);
4837 /* Find last insn of to block */
4838 rtx_insn
*limit
= from
== to
? head
: BB_HEAD (to
);
4839 while (tail
!= limit
&& !INSN_P (tail
) && !LABEL_P (tail
))
4840 tail
= PREV_INSN (tail
);
4842 /* Detect if tail is a branch. */
4843 rtx tail_branch
= NULL_RTX
;
4844 rtx cond_branch
= NULL_RTX
;
4845 if (tail
&& INSN_P (tail
))
4847 tail_branch
= PATTERN (tail
);
4848 if (GET_CODE (tail_branch
) != SET
|| SET_DEST (tail_branch
) != pc_rtx
)
4849 tail_branch
= NULL_RTX
;
4852 cond_branch
= SET_SRC (tail_branch
);
4853 if (GET_CODE (cond_branch
) != IF_THEN_ELSE
)
4854 cond_branch
= NULL_RTX
;
4860 /* If this is empty, do nothing. */
4861 if (!head
|| !needs_neutering_p (head
))
4866 /* If we're only doing vector single, there's no need to
4867 emit skip code because we'll not insert anything. */
4868 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)))
4871 else if (tail_branch
)
4872 /* Block with only unconditional branch. Nothing to do. */
4876 /* Insert the vector test inside the worker test. */
4878 rtx_insn
*before
= tail
;
4879 rtx_insn
*neuter_start
= NULL
;
4880 rtx_insn
*worker_label
= NULL
, *vector_label
= NULL
;
4881 rtx_insn
*worker_jump
= NULL
, *vector_jump
= NULL
;
4882 rtx_insn
*warp_sync
= NULL
;
4883 for (mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4884 if (GOMP_DIM_MASK (mode
) & skip_mask
)
4886 rtx_code_label
*label
= gen_label_rtx ();
4887 rtx pred
= cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
];
4888 rtx_insn
**mode_jump
4889 = mode
== GOMP_DIM_VECTOR
? &vector_jump
: &worker_jump
;
4890 rtx_insn
**mode_label
4891 = mode
== GOMP_DIM_VECTOR
? &vector_label
: &worker_label
;
4895 pred
= gen_reg_rtx (BImode
);
4896 cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
] = pred
;
4900 if (mode
== GOMP_DIM_VECTOR
)
4901 br
= gen_br_true (pred
, label
);
4903 br
= gen_br_true_uni (pred
, label
);
4905 neuter_start
= emit_insn_after (br
, neuter_start
);
4907 neuter_start
= emit_insn_before (br
, head
);
4908 *mode_jump
= neuter_start
;
4910 LABEL_NUSES (label
)++;
4911 rtx_insn
*label_insn
;
4914 label_insn
= emit_label_before (label
, before
);
4915 if (mode
== GOMP_DIM_VECTOR
)
4918 warp_sync
= emit_insn_after (gen_nvptx_warpsync (),
4921 warp_sync
= emit_insn_after (gen_nvptx_uniform_warp_check (),
4924 before
= label_insn
;
4928 label_insn
= emit_label_after (label
, tail
);
4929 if (mode
== GOMP_DIM_VECTOR
)
4932 warp_sync
= emit_insn_after (gen_nvptx_warpsync (),
4935 warp_sync
= emit_insn_after (gen_nvptx_uniform_warp_check (),
4938 if ((mode
== GOMP_DIM_VECTOR
|| mode
== GOMP_DIM_WORKER
)
4939 && CALL_P (tail
) && find_reg_note (tail
, REG_NORETURN
, NULL
))
4940 emit_insn_after (gen_exit (), label_insn
);
4943 *mode_label
= label_insn
;
4946 /* Now deal with propagating the branch condition. */
4949 rtx pvar
= XEXP (XEXP (cond_branch
, 0), 0);
4951 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
4952 && nvptx_mach_vector_length () == PTX_WARP_SIZE
)
4954 /* Vector mode only, do a shuffle. */
4955 #if WORKAROUND_PTXJIT_BUG
4956 /* The branch condition %rcond is propagated like this:
4961 setp.ne.u32 %rnotvzero,%x,0;
4964 @%rnotvzero bra Lskip;
4965 setp.<op>.<type> %rcond,op1,op2;
4967 selp.u32 %rcondu32,1,0,%rcond;
4968 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4969 setp.ne.u32 %rcond,%rcondu32,0;
4971 There seems to be a bug in the ptx JIT compiler (observed at driver
4972 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4973 unless %rcond is initialized to something before 'bra Lskip'. The
4974 bug is not observed with ptxas from cuda 8.0.61.
4976 It is true that the code is non-trivial: at Lskip, %rcond is
4977 uninitialized in threads 1-31, and after the selp the same holds
4978 for %rcondu32. But shfl propagates the defined value in thread 0
4979 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4980 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4982 There is nothing in the PTX spec to suggest that this is wrong, or
4983 to explain why the extra initialization is needed. So, we classify
4984 it as a JIT bug, and the extra initialization as workaround:
4989 setp.ne.u32 %rnotvzero,%x,0;
4992 +.reg .pred %rcond2;
4993 +setp.eq.u32 %rcond2, 1, 0;
4995 @%rnotvzero bra Lskip;
4996 setp.<op>.<type> %rcond,op1,op2;
4997 +mov.pred %rcond2, %rcond;
4999 +mov.pred %rcond, %rcond2;
5000 selp.u32 %rcondu32,1,0,%rcond;
5001 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
5002 setp.ne.u32 %rcond,%rcondu32,0;
5004 rtx_insn
*label
= PREV_INSN (tail
);
5005 if (label
== warp_sync
)
5006 label
= PREV_INSN (label
);
5007 gcc_assert (label
&& LABEL_P (label
));
5008 rtx tmp
= gen_reg_rtx (BImode
);
5009 emit_insn_before (gen_movbi (tmp
, const0_rtx
),
5010 bb_first_real_insn (from
));
5011 emit_insn_before (gen_rtx_SET (tmp
, pvar
), label
);
5012 emit_insn_before (gen_rtx_SET (pvar
, tmp
), tail
);
5014 emit_insn_before (nvptx_gen_warp_bcast (pvar
), tail
);
5018 /* Includes worker mode, do spill & fill. By construction
5019 we should never have worker mode only. */
5020 broadcast_data_t data
;
5021 unsigned size
= GET_MODE_SIZE (SImode
);
5022 bool vector
= (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
) != 0;
5023 bool worker
= (GOMP_DIM_MASK (GOMP_DIM_WORKER
) == mask
) != 0;
5024 rtx barrier
= GEN_INT (0);
5027 data
.base
= oacc_bcast_sym
;
5030 bool use_partitioning_p
= (vector
&& !worker
5031 && nvptx_mach_max_workers () > 1
5032 && cfun
->machine
->bcast_partition
);
5033 if (use_partitioning_p
)
5035 data
.base
= cfun
->machine
->bcast_partition
;
5036 barrier
= cfun
->machine
->sync_bar
;
5037 threads
= nvptx_mach_vector_length ();
5039 gcc_assert (data
.base
!= NULL
);
5040 gcc_assert (barrier
);
5042 unsigned int psize
= ROUND_UP (size
, oacc_bcast_align
);
5043 unsigned int pnum
= (nvptx_mach_vector_length () > PTX_WARP_SIZE
5044 ? nvptx_mach_max_workers () + 1
5047 oacc_bcast_partition
= MAX (oacc_bcast_partition
, psize
);
5048 oacc_bcast_size
= MAX (oacc_bcast_size
, psize
* pnum
);
5051 emit_insn_before (nvptx_gen_shared_bcast (pvar
, PM_read
, 0, &data
,
5055 /* Barrier so other workers can see the write. */
5056 emit_insn_before (nvptx_cta_sync (barrier
, threads
), tail
);
5058 emit_insn_before (nvptx_gen_shared_bcast (pvar
, PM_write
, 0, &data
,
5061 /* This barrier is needed to avoid worker zero clobbering
5062 the broadcast buffer before all the other workers have
5063 had a chance to read this instance of it. */
5064 emit_insn_before (nvptx_cta_sync (barrier
, threads
), tail
);
5067 extract_insn (tail
);
5068 rtx unsp
= gen_rtx_UNSPEC (BImode
, gen_rtvec (1, pvar
),
5070 validate_change (tail
, recog_data
.operand_loc
[0], unsp
, false);
5073 bool seen_label
= verify_neutering_jumps (from
, vector_jump
, worker_jump
,
5074 vector_label
, worker_label
);
5076 verify_neutering_labels (to
, vector_label
, worker_label
);
5079 /* PAR is a parallel that is being skipped in its entirety according to
5080 MASK. Treat this as skipping a superblock starting at forked
5081 and ending at joining. */
5084 nvptx_skip_par (unsigned mask
, parallel
*par
)
5086 basic_block tail
= par
->join_block
;
5087 gcc_assert (tail
->preds
->length () == 1);
5089 basic_block pre_tail
= (*tail
->preds
)[0]->src
;
5090 gcc_assert (pre_tail
->succs
->length () == 1);
5092 nvptx_single (mask
, par
->forked_block
, pre_tail
);
5095 /* If PAR has a single inner parallel and PAR itself only contains
5096 empty entry and exit blocks, swallow the inner PAR. */
5099 nvptx_optimize_inner (parallel
*par
)
5101 parallel
*inner
= par
->inner
;
5103 /* We mustn't be the outer dummy par. */
5107 /* We must have a single inner par. */
5108 if (!inner
|| inner
->next
)
5111 /* We must only contain 2 blocks ourselves -- the head and tail of
5113 if (par
->blocks
.length () != 2)
5116 /* We must be disjoint partitioning. As we only have vector and
5117 worker partitioning, this is sufficient to guarantee the pars
5118 have adjacent partitioning. */
5119 if ((par
->mask
& inner
->mask
) & (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1))
5120 /* This indicates malformed code generation. */
5123 /* The outer forked insn should be immediately followed by the inner
5125 rtx_insn
*forked
= par
->forked_insn
;
5126 rtx_insn
*fork
= BB_END (par
->forked_block
);
5128 if (NEXT_INSN (forked
) != fork
)
5130 gcc_checking_assert (recog_memoized (fork
) == CODE_FOR_nvptx_fork
);
5132 /* The outer joining insn must immediately follow the inner join
5134 rtx_insn
*joining
= par
->joining_insn
;
5135 rtx_insn
*join
= inner
->join_insn
;
5136 if (NEXT_INSN (join
) != joining
)
5139 /* Preconditions met. Swallow the inner par. */
5141 fprintf (dump_file
, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
5142 inner
->mask
, inner
->forked_block
->index
,
5143 inner
->join_block
->index
,
5144 par
->mask
, par
->forked_block
->index
, par
->join_block
->index
);
5146 par
->mask
|= inner
->mask
& (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1);
5148 par
->blocks
.reserve (inner
->blocks
.length ());
5149 while (inner
->blocks
.length ())
5150 par
->blocks
.quick_push (inner
->blocks
.pop ());
5152 par
->inner
= inner
->inner
;
5153 inner
->inner
= NULL
;
5158 /* Process the parallel PAR and all its contained
5159 parallels. We do everything but the neutering. Return mask of
5160 partitioned modes used within this parallel. */
5163 nvptx_process_pars (parallel
*par
)
5166 nvptx_optimize_inner (par
);
5168 unsigned inner_mask
= par
->mask
;
5170 /* Do the inner parallels first. */
5173 par
->inner_mask
= nvptx_process_pars (par
->inner
);
5174 inner_mask
|= par
->inner_mask
;
5177 bool is_call
= (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
)) != 0;
5178 bool worker
= (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
));
5179 bool large_vector
= ((par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
5180 && nvptx_mach_vector_length () > PTX_WARP_SIZE
);
5182 if (worker
|| large_vector
)
5184 nvptx_shared_propagate (false, is_call
, par
->forked_block
,
5185 par
->forked_insn
, !worker
);
5187 = nvptx_shared_propagate (true, is_call
, par
->forked_block
,
5188 par
->fork_insn
, !worker
);
5190 = !is_call
&& (NEXT_INSN (par
->forked_insn
)
5191 && NEXT_INSN (par
->forked_insn
) == par
->joining_insn
);
5192 rtx barrier
= GEN_INT (0);
5195 if (!worker
&& cfun
->machine
->sync_bar
)
5197 barrier
= cfun
->machine
->sync_bar
;
5198 threads
= nvptx_mach_vector_length ();
5201 if (no_prop_p
&& empty_loop_p
)
5203 else if (no_prop_p
&& is_call
)
5207 /* Insert begin and end synchronizations. */
5208 emit_insn_before (nvptx_cta_sync (barrier
, threads
),
5210 emit_insn_before (nvptx_cta_sync (barrier
, threads
), par
->join_insn
);
5213 else if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
5214 nvptx_warp_propagate (is_call
, par
->forked_block
, par
->forked_insn
);
5216 /* Now do siblings. */
5218 inner_mask
|= nvptx_process_pars (par
->next
);
5222 /* Neuter the parallel described by PAR. We recurse in depth-first
5223 order. MODES are the partitioning of the execution and OUTER is
5224 the partitioning of the parallels we are contained in. */
5227 nvptx_neuter_pars (parallel
*par
, unsigned modes
, unsigned outer
)
5229 unsigned me
= (par
->mask
5230 & (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
5231 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
5232 unsigned skip_mask
= 0, neuter_mask
= 0;
5235 nvptx_neuter_pars (par
->inner
, modes
, outer
| me
);
5237 for (unsigned mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
5239 if ((outer
| me
) & GOMP_DIM_MASK (mode
))
5240 {} /* Mode is partitioned: no neutering. */
5241 else if (!(modes
& GOMP_DIM_MASK (mode
)))
5242 {} /* Mode is not used: nothing to do. */
5243 else if (par
->inner_mask
& GOMP_DIM_MASK (mode
)
5244 || !par
->forked_insn
)
5245 /* Partitioned in inner parallels, or we're not a partitioned
5246 at all: neuter individual blocks. */
5247 neuter_mask
|= GOMP_DIM_MASK (mode
);
5248 else if (!par
->parent
|| !par
->parent
->forked_insn
5249 || par
->parent
->inner_mask
& GOMP_DIM_MASK (mode
))
5250 /* Parent isn't a parallel or contains this paralleling: skip
5251 parallel at this level. */
5252 skip_mask
|= GOMP_DIM_MASK (mode
);
5254 {} /* Parent will skip this parallel itself. */
5263 /* Neuter whole SESE regions. */
5264 bb_pair_vec_t regions
;
5266 nvptx_find_sese (par
->blocks
, regions
);
5267 len
= regions
.length ();
5268 for (ix
= 0; ix
!= len
; ix
++)
5270 basic_block from
= regions
[ix
].first
;
5271 basic_block to
= regions
[ix
].second
;
5274 nvptx_single (neuter_mask
, from
, to
);
5281 /* Neuter each BB individually. */
5282 len
= par
->blocks
.length ();
5283 for (ix
= 0; ix
!= len
; ix
++)
5285 basic_block block
= par
->blocks
[ix
];
5287 nvptx_single (neuter_mask
, block
, block
);
5293 nvptx_skip_par (skip_mask
, par
);
5296 nvptx_neuter_pars (par
->next
, modes
, outer
);
5300 populate_offload_attrs (offload_attrs
*oa
)
5302 tree attr
= oacc_get_fn_attrib (current_function_decl
);
5303 tree dims
= TREE_VALUE (attr
);
5308 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++, dims
= TREE_CHAIN (dims
))
5310 tree t
= TREE_VALUE (dims
);
5311 int size
= (t
== NULL_TREE
) ? -1 : TREE_INT_CST_LOW (t
);
5312 tree allowed
= TREE_PURPOSE (dims
);
5314 if (size
!= 1 && !(allowed
&& integer_zerop (allowed
)))
5315 oa
->mask
|= GOMP_DIM_MASK (ix
);
5320 oa
->num_gangs
= size
;
5323 case GOMP_DIM_WORKER
:
5324 oa
->num_workers
= size
;
5327 case GOMP_DIM_VECTOR
:
5328 oa
->vector_length
= size
;
5334 #if WORKAROUND_PTXJIT_BUG_2
5335 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
5336 is needed in the nvptx target because the branches generated for
5337 parititioning are NONJUMP_INSN_P, not JUMP_P. */
5340 nvptx_pc_set (const rtx_insn
*insn
, bool strict
= true)
5343 if ((strict
&& !JUMP_P (insn
))
5344 || (!strict
&& !INSN_P (insn
)))
5346 pat
= PATTERN (insn
);
5348 /* The set is allowed to appear either as the insn pattern or
5349 the first set in a PARALLEL. */
5350 if (GET_CODE (pat
) == PARALLEL
)
5351 pat
= XVECEXP (pat
, 0, 0);
5352 if (GET_CODE (pat
) == SET
&& GET_CODE (SET_DEST (pat
)) == PC
)
5358 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
5361 nvptx_condjump_label (const rtx_insn
*insn
, bool strict
= true)
5363 rtx x
= nvptx_pc_set (insn
, strict
);
5368 if (GET_CODE (x
) == LABEL_REF
)
5370 if (GET_CODE (x
) != IF_THEN_ELSE
)
5372 if (XEXP (x
, 2) == pc_rtx
&& GET_CODE (XEXP (x
, 1)) == LABEL_REF
)
5374 if (XEXP (x
, 1) == pc_rtx
&& GET_CODE (XEXP (x
, 2)) == LABEL_REF
)
5379 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
5380 insn inbetween the branch and the label. This works around a JIT bug
5381 observed at driver version 384.111, at -O0 for sm_50. */
5384 prevent_branch_around_nothing (void)
5386 rtx_insn
*seen_label
= NULL
;
5387 for (rtx_insn
*insn
= get_insns (); insn
; insn
= NEXT_INSN (insn
))
5389 if (INSN_P (insn
) && condjump_p (insn
))
5391 seen_label
= label_ref_label (nvptx_condjump_label (insn
, false));
5395 if (seen_label
== NULL
)
5398 if (NOTE_P (insn
) || DEBUG_INSN_P (insn
))
5402 switch (recog_memoized (insn
))
5404 case CODE_FOR_nvptx_fork
:
5405 case CODE_FOR_nvptx_forked
:
5406 case CODE_FOR_nvptx_joining
:
5407 case CODE_FOR_nvptx_join
:
5411 /* Handle asm ("") and similar. */
5412 if (GET_CODE (PATTERN (insn
)) == ASM_INPUT
5413 || GET_CODE (PATTERN (insn
)) == ASM_OPERANDS
5414 || (GET_CODE (PATTERN (insn
)) == PARALLEL
5415 && asm_noperands (PATTERN (insn
)) >= 0))
5423 if (LABEL_P (insn
) && insn
== seen_label
)
5424 emit_insn_before (gen_fake_nop (), insn
);
5431 #ifdef WORKAROUND_PTXJIT_BUG_3
5432 /* Insert two membar.cta insns inbetween two subsequent bar.sync insns. This
5433 works around a hang observed at driver version 390.48 for sm_50. */
5436 workaround_barsyncs (void)
5438 bool seen_barsync
= false;
5439 for (rtx_insn
*insn
= get_insns (); insn
; insn
= NEXT_INSN (insn
))
5441 if (INSN_P (insn
) && recog_memoized (insn
) == CODE_FOR_nvptx_barsync
)
5445 emit_insn_before (gen_nvptx_membar_cta (), insn
);
5446 emit_insn_before (gen_nvptx_membar_cta (), insn
);
5449 seen_barsync
= true;
5456 if (NOTE_P (insn
) || DEBUG_INSN_P (insn
))
5458 else if (INSN_P (insn
))
5459 switch (recog_memoized (insn
))
5461 case CODE_FOR_nvptx_fork
:
5462 case CODE_FOR_nvptx_forked
:
5463 case CODE_FOR_nvptx_joining
:
5464 case CODE_FOR_nvptx_join
:
5470 seen_barsync
= false;
5476 gen_comment (const char *s
)
5478 const char *sep
= " ";
5479 size_t len
= strlen (ASM_COMMENT_START
) + strlen (sep
) + strlen (s
) + 1;
5480 char *comment
= (char *) alloca (len
);
5481 snprintf (comment
, len
, "%s%s%s", ASM_COMMENT_START
, sep
, s
);
5482 return gen_rtx_ASM_INPUT_loc (VOIDmode
, ggc_strdup (comment
),
5483 DECL_SOURCE_LOCATION (cfun
->decl
));
5486 /* Initialize all declared regs at function entry.
5487 Advantage : Fool-proof.
5488 Disadvantage: Potentially creates a lot of long live ranges and adds a lot
5492 workaround_uninit_method_1 (void)
5494 rtx_insn
*first
= get_insns ();
5495 rtx_insn
*insert_here
= NULL
;
5497 for (int ix
= LAST_VIRTUAL_REGISTER
+ 1; ix
< max_reg_num (); ix
++)
5499 rtx reg
= regno_reg_rtx
[ix
];
5501 /* Skip undeclared registers. */
5502 if (reg
== const0_rtx
)
5505 gcc_assert (CONST0_RTX (GET_MODE (reg
)));
5508 if (nvptx_comment
&& first
!= NULL
)
5509 emit_insn (gen_comment ("Start: Added by -minit-regs=1"));
5510 emit_move_insn (reg
, CONST0_RTX (GET_MODE (reg
)));
5511 rtx_insn
*inits
= get_insns ();
5514 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
5515 for (rtx_insn
*init
= inits
; init
!= NULL
; init
= NEXT_INSN (init
))
5516 fprintf (dump_file
, "Default init of reg %u inserted: insn %u\n",
5517 ix
, INSN_UID (init
));
5521 insert_here
= emit_insn_before (inits
, first
);
5525 insert_here
= emit_insn_after (inits
, insert_here
);
5528 if (nvptx_comment
&& insert_here
!= NULL
)
5529 emit_insn_after (gen_comment ("End: Added by -minit-regs=1"), insert_here
);
5532 /* Find uses of regs that are not defined on all incoming paths, and insert a
5533 corresponding def at function entry.
5535 Disadvantage: Potentially creates long live ranges.
5536 May not catch all cases. F.i. a clobber cuts a live range in
5537 the compiler and may prevent entry_lr_in from being set for a
5538 reg, but the clobber does not translate to a ptx insn, so in
5539 ptx there still may be an uninitialized ptx reg. See f.i.
5540 gcc.c-torture/compile/20020926-1.c. */
5543 workaround_uninit_method_2 (void)
5545 auto_bitmap entry_pseudo_uninit
;
5547 auto_bitmap not_pseudo
;
5548 bitmap_set_range (not_pseudo
, 0, LAST_VIRTUAL_REGISTER
);
5550 bitmap entry_lr_in
= DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun
));
5551 bitmap_and_compl (entry_pseudo_uninit
, entry_lr_in
, not_pseudo
);
5554 rtx_insn
*first
= get_insns ();
5555 rtx_insn
*insert_here
= NULL
;
5557 bitmap_iterator iterator
;
5559 EXECUTE_IF_SET_IN_BITMAP (entry_pseudo_uninit
, 0, ix
, iterator
)
5561 rtx reg
= regno_reg_rtx
[ix
];
5562 gcc_assert (CONST0_RTX (GET_MODE (reg
)));
5565 if (nvptx_comment
&& first
!= NULL
)
5566 emit_insn (gen_comment ("Start: Added by -minit-regs=2:"));
5567 emit_move_insn (reg
, CONST0_RTX (GET_MODE (reg
)));
5568 rtx_insn
*inits
= get_insns ();
5571 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
5572 for (rtx_insn
*init
= inits
; init
!= NULL
; init
= NEXT_INSN (init
))
5573 fprintf (dump_file
, "Missing init of reg %u inserted: insn %u\n",
5574 ix
, INSN_UID (init
));
5578 insert_here
= emit_insn_before (inits
, first
);
5582 insert_here
= emit_insn_after (inits
, insert_here
);
5585 if (nvptx_comment
&& insert_here
!= NULL
)
5586 emit_insn_after (gen_comment ("End: Added by -minit-regs=2"), insert_here
);
5589 /* Find uses of regs that are not defined on all incoming paths, and insert a
5590 corresponding def on those.
5591 Advantage : Doesn't create long live ranges.
5592 Disadvantage: More complex, and potentially also more defs. */
5595 workaround_uninit_method_3 (void)
5597 auto_bitmap not_pseudo
;
5598 bitmap_set_range (not_pseudo
, 0, LAST_VIRTUAL_REGISTER
);
5601 FOR_EACH_BB_FN (bb
, cfun
)
5603 if (single_pred_p (bb
))
5606 auto_bitmap bb_pseudo_uninit
;
5607 bitmap_and_compl (bb_pseudo_uninit
, DF_LIVE_IN (bb
), DF_MIR_IN (bb
));
5608 bitmap_and_compl_into (bb_pseudo_uninit
, not_pseudo
);
5610 bitmap_iterator iterator
;
5612 EXECUTE_IF_SET_IN_BITMAP (bb_pseudo_uninit
, 0, ix
, iterator
)
5614 bool have_false
= false;
5615 bool have_true
= false;
5619 FOR_EACH_EDGE (e
, ei
, bb
->preds
)
5621 if (bitmap_bit_p (DF_LIVE_OUT (e
->src
), ix
))
5626 if (have_false
^ have_true
)
5629 FOR_EACH_EDGE (e
, ei
, bb
->preds
)
5631 if (bitmap_bit_p (DF_LIVE_OUT (e
->src
), ix
))
5634 rtx reg
= regno_reg_rtx
[ix
];
5635 gcc_assert (CONST0_RTX (GET_MODE (reg
)));
5638 emit_move_insn (reg
, CONST0_RTX (GET_MODE (reg
)));
5639 rtx_insn
*inits
= get_insns ();
5642 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
5643 for (rtx_insn
*init
= inits
; init
!= NULL
;
5644 init
= NEXT_INSN (init
))
5646 "Missing init of reg %u inserted on edge: %d -> %d:"
5647 " insn %u\n", ix
, e
->src
->index
, e
->dest
->index
,
5650 insert_insn_on_edge (inits
, e
);
5656 FOR_EACH_BB_FN (bb
, cfun
)
5658 if (single_pred_p (bb
))
5663 FOR_EACH_EDGE (e
, ei
, bb
->preds
)
5665 if (e
->insns
.r
== NULL_RTX
)
5668 emit_insn (gen_comment ("Start: Added by -minit-regs=3:"));
5669 emit_insn (e
->insns
.r
);
5670 emit_insn (gen_comment ("End: Added by -minit-regs=3:"));
5671 e
->insns
.r
= get_insns ();
5676 commit_edge_insertions ();
5680 workaround_uninit (void)
5682 switch (nvptx_init_regs
)
5688 workaround_uninit_method_1 ();
5691 workaround_uninit_method_2 ();
5694 workaround_uninit_method_3 ();
5701 /* PTX-specific reorganization
5702 - Split blocks at fork and join instructions
5703 - Compute live registers
5704 - Mark now-unused registers, so function begin doesn't declare
5706 - Insert state propagation when entering partitioned mode
5707 - Insert neutering instructions when in single mode
5708 - Replace subregs with suitable sequences.
5714 /* We are freeing block_for_insn in the toplev to keep compatibility
5715 with old MDEP_REORGS that are not CFG based. Recompute it now. */
5716 compute_bb_for_insn ();
5718 thread_prologue_and_epilogue_insns ();
5720 /* Split blocks and record interesting unspecs. */
5721 bb_insn_map_t bb_insn_map
;
5723 nvptx_split_blocks (&bb_insn_map
);
5725 /* Compute live regs */
5726 df_clear_flags (DF_LR_RUN_DCE
);
5727 df_set_flags (DF_NO_INSN_RESCAN
| DF_NO_HARD_REGS
);
5728 df_live_add_problem ();
5729 df_live_set_all_dirty ();
5730 if (nvptx_init_regs
== 3)
5731 df_mir_add_problem ();
5733 regstat_init_n_sets_and_refs ();
5736 df_dump (dump_file
);
5738 /* Mark unused regs as unused. */
5739 int max_regs
= max_reg_num ();
5740 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< max_regs
; i
++)
5741 if (REG_N_SETS (i
) == 0 && REG_N_REFS (i
) == 0)
5742 regno_reg_rtx
[i
] = const0_rtx
;
5744 workaround_uninit ();
5746 /* Determine launch dimensions of the function. If it is not an
5747 offloaded function (i.e. this is a regular compiler), the
5748 function has no neutering. */
5749 tree attr
= oacc_get_fn_attrib (current_function_decl
);
5752 /* If we determined this mask before RTL expansion, we could
5753 elide emission of some levels of forks and joins. */
5756 populate_offload_attrs (&oa
);
5758 /* If there is worker neutering, there must be vector
5759 neutering. Otherwise the hardware will fail. */
5760 gcc_assert (!(oa
.mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
5761 || (oa
.mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
5763 /* Discover & process partitioned regions. */
5764 parallel
*pars
= nvptx_discover_pars (&bb_insn_map
);
5765 nvptx_process_pars (pars
);
5766 nvptx_neuter_pars (pars
, oa
.mask
, 0);
5770 /* Replace subregs. */
5771 nvptx_reorg_subreg ();
5773 if (TARGET_UNIFORM_SIMT
)
5774 nvptx_reorg_uniform_simt ();
5776 #if WORKAROUND_PTXJIT_BUG_2
5777 prevent_branch_around_nothing ();
5780 #ifdef WORKAROUND_PTXJIT_BUG_3
5781 workaround_barsyncs ();
5784 regstat_free_n_sets_and_refs ();
5786 df_finish_pass (true);
5789 /* Handle a "kernel" attribute; arguments as in
5790 struct attribute_spec.handler. */
5793 nvptx_handle_kernel_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
5794 int ARG_UNUSED (flags
), bool *no_add_attrs
)
5798 if (TREE_CODE (decl
) != FUNCTION_DECL
)
5800 error ("%qE attribute only applies to functions", name
);
5801 *no_add_attrs
= true;
5803 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl
))))
5805 error ("%qE attribute requires a void return type", name
);
5806 *no_add_attrs
= true;
5812 /* Handle a "shared" attribute; arguments as in
5813 struct attribute_spec.handler. */
5816 nvptx_handle_shared_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
5817 int ARG_UNUSED (flags
), bool *no_add_attrs
)
5821 if (TREE_CODE (decl
) != VAR_DECL
)
5823 error ("%qE attribute only applies to variables", name
);
5824 *no_add_attrs
= true;
5826 else if (!(TREE_PUBLIC (decl
) || TREE_STATIC (decl
)))
5828 error ("%qE attribute not allowed with auto storage class", name
);
5829 *no_add_attrs
= true;
5835 /* Table of valid machine attributes. */
5836 static const struct attribute_spec nvptx_attribute_table
[] =
5838 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5839 affects_type_identity, handler, exclude } */
5840 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute
,
5842 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute
,
5844 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
5847 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
5849 static HOST_WIDE_INT
5850 nvptx_vector_alignment (const_tree type
)
5852 unsigned HOST_WIDE_INT align
;
5853 tree size
= TYPE_SIZE (type
);
5855 /* Ensure align is not bigger than BIGGEST_ALIGNMENT. */
5856 if (tree_fits_uhwi_p (size
))
5858 align
= tree_to_uhwi (size
);
5859 align
= MIN (align
, BIGGEST_ALIGNMENT
);
5862 align
= BIGGEST_ALIGNMENT
;
5864 /* Ensure align is not smaller than mode alignment. */
5865 align
= MAX (align
, GET_MODE_ALIGNMENT (TYPE_MODE (type
)));
5870 /* Indicate that INSN cannot be duplicated. */
5873 nvptx_cannot_copy_insn_p (rtx_insn
*insn
)
5875 switch (recog_memoized (insn
))
5877 case CODE_FOR_nvptx_shufflesi
:
5878 case CODE_FOR_nvptx_shufflesf
:
5879 case CODE_FOR_nvptx_barsync
:
5880 case CODE_FOR_nvptx_fork
:
5881 case CODE_FOR_nvptx_forked
:
5882 case CODE_FOR_nvptx_joining
:
5883 case CODE_FOR_nvptx_join
:
5890 /* Section anchors do not work. Initialization for flag_section_anchor
5891 probes the existence of the anchoring target hooks and prevents
5892 anchoring if they don't exist. However, we may be being used with
5893 a host-side compiler that does support anchoring, and hence see
5894 the anchor flag set (as it's not recalculated). So provide an
5895 implementation denying anchoring. */
5898 nvptx_use_anchors_for_symbol_p (const_rtx
ARG_UNUSED (a
))
5903 /* Record a symbol for mkoffload to enter into the mapping table. */
5906 nvptx_record_offload_symbol (tree decl
)
5908 switch (TREE_CODE (decl
))
5911 fprintf (asm_out_file
, "//:VAR_MAP \"%s\"\n",
5912 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
5917 tree attr
= oacc_get_fn_attrib (decl
);
5918 /* OpenMP offloading does not set this attribute. */
5919 tree dims
= attr
? TREE_VALUE (attr
) : NULL_TREE
;
5921 fprintf (asm_out_file
, "//:FUNC_MAP \"%s\"",
5922 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
5924 for (; dims
; dims
= TREE_CHAIN (dims
))
5926 int size
= TREE_INT_CST_LOW (TREE_VALUE (dims
));
5928 gcc_assert (!TREE_PURPOSE (dims
));
5929 fprintf (asm_out_file
, ", %#x", size
);
5932 fprintf (asm_out_file
, "\n");
5941 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5942 at the start of a file. */
5945 nvptx_file_start (void)
5947 fputs ("// BEGIN PREAMBLE\n", asm_out_file
);
5949 fputs ("\t.version\t", asm_out_file
);
5950 fputs (ptx_version_to_string ((enum ptx_version
)ptx_version_option
),
5952 fputs ("\n", asm_out_file
);
5954 fputs ("\t.target\tsm_", asm_out_file
);
5955 fputs (sm_version_to_string ((enum ptx_isa
)ptx_isa_option
),
5957 fputs ("\n", asm_out_file
);
5959 fprintf (asm_out_file
, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode
));
5961 fputs ("// END PREAMBLE\n", asm_out_file
);
5964 /* Emit a declaration for a worker and vector-level buffer in .shared
5968 write_shared_buffer (FILE *file
, rtx sym
, unsigned align
, unsigned size
)
5970 const char *name
= XSTR (sym
, 0);
5972 write_var_marker (file
, true, false, name
);
5973 fprintf (file
, ".shared .align %d .u8 %s[%d];\n",
5977 /* Write out the function declarations we've collected and declare storage
5978 for the broadcast buffer. */
5981 nvptx_file_end (void)
5983 hash_table
<tree_hasher
>::iterator iter
;
5985 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab
, decl
, tree
, iter
)
5986 nvptx_record_fndecl (decl
);
5987 fputs (func_decls
.str().c_str(), asm_out_file
);
5989 if (oacc_bcast_size
)
5990 write_shared_buffer (asm_out_file
, oacc_bcast_sym
,
5991 oacc_bcast_align
, oacc_bcast_size
);
5993 if (worker_red_size
)
5994 write_shared_buffer (asm_out_file
, worker_red_sym
,
5995 worker_red_align
, worker_red_size
);
5997 if (vector_red_size
)
5998 write_shared_buffer (asm_out_file
, vector_red_sym
,
5999 vector_red_align
, vector_red_size
);
6001 if (gang_private_shared_size
)
6002 write_shared_buffer (asm_out_file
, gang_private_shared_sym
,
6003 gang_private_shared_align
, gang_private_shared_size
);
6005 if (need_softstack_decl
)
6007 write_var_marker (asm_out_file
, false, true, "__nvptx_stacks");
6008 /* 32 is the maximum number of warps in a block. Even though it's an
6009 external declaration, emit the array size explicitly; otherwise, it
6010 may fail at PTX JIT time if the definition is later in link order. */
6011 fprintf (asm_out_file
, ".extern .shared .u%d __nvptx_stacks[32];\n",
6014 if (need_unisimt_decl
)
6016 write_var_marker (asm_out_file
, false, true, "__nvptx_uni");
6017 fprintf (asm_out_file
, ".extern .shared .u32 __nvptx_uni[32];\n");
6021 /* Expander for the shuffle builtins. */
6024 nvptx_expand_shuffle (tree exp
, rtx target
, machine_mode mode
, int ignore
)
6029 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 0),
6030 NULL_RTX
, mode
, EXPAND_NORMAL
);
6032 src
= copy_to_mode_reg (mode
, src
);
6034 rtx idx
= expand_expr (CALL_EXPR_ARG (exp
, 1),
6035 NULL_RTX
, SImode
, EXPAND_NORMAL
);
6036 rtx op
= expand_expr (CALL_EXPR_ARG (exp
, 2),
6037 NULL_RTX
, SImode
, EXPAND_NORMAL
);
6039 if (!REG_P (idx
) && GET_CODE (idx
) != CONST_INT
)
6040 idx
= copy_to_mode_reg (SImode
, idx
);
6042 rtx pat
= nvptx_gen_shuffle (target
, src
, idx
,
6043 (nvptx_shuffle_kind
) INTVAL (op
));
6051 nvptx_output_red_partition (rtx dst
, rtx offset
)
6053 const char *zero_offset
= "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
6054 const char *with_offset
= "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
6056 if (offset
== const0_rtx
)
6057 fprintf (asm_out_file
, zero_offset
, REGNO (dst
),
6058 REGNO (cfun
->machine
->red_partition
));
6060 fprintf (asm_out_file
, with_offset
, REGNO (dst
),
6061 REGNO (cfun
->machine
->red_partition
), UINTVAL (offset
));
6066 /* Shared-memory reduction address expander. */
6069 nvptx_expand_shared_addr (tree exp
, rtx target
,
6070 machine_mode
ARG_UNUSED (mode
), int ignore
,
6076 unsigned align
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 2));
6077 unsigned offset
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 0));
6078 unsigned size
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 1));
6079 rtx addr
= worker_red_sym
;
6085 populate_offload_attrs (&oa
);
6087 unsigned int psize
= ROUND_UP (size
+ offset
, align
);
6088 unsigned int pnum
= nvptx_mach_max_workers ();
6089 vector_red_partition
= MAX (vector_red_partition
, psize
);
6090 vector_red_size
= MAX (vector_red_size
, psize
* pnum
);
6091 vector_red_align
= MAX (vector_red_align
, align
);
6093 if (cfun
->machine
->red_partition
== NULL
)
6094 cfun
->machine
->red_partition
= gen_reg_rtx (Pmode
);
6096 addr
= gen_reg_rtx (Pmode
);
6097 emit_insn (gen_nvptx_red_partition (addr
, GEN_INT (offset
)));
6101 worker_red_align
= MAX (worker_red_align
, align
);
6102 worker_red_size
= MAX (worker_red_size
, size
+ offset
);
6106 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (offset
));
6107 addr
= gen_rtx_CONST (Pmode
, addr
);
6111 emit_move_insn (target
, addr
);
6115 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
6116 not require taking the address of any object, other than the memory
6117 cell being operated on. */
6120 nvptx_expand_cmp_swap (tree exp
, rtx target
,
6121 machine_mode
ARG_UNUSED (m
), int ARG_UNUSED (ignore
))
6123 machine_mode mode
= TYPE_MODE (TREE_TYPE (exp
));
6126 target
= gen_reg_rtx (mode
);
6128 rtx mem
= expand_expr (CALL_EXPR_ARG (exp
, 0),
6129 NULL_RTX
, Pmode
, EXPAND_NORMAL
);
6130 rtx cmp
= expand_expr (CALL_EXPR_ARG (exp
, 1),
6131 NULL_RTX
, mode
, EXPAND_NORMAL
);
6132 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 2),
6133 NULL_RTX
, mode
, EXPAND_NORMAL
);
6136 mem
= gen_rtx_MEM (mode
, mem
);
6138 cmp
= copy_to_mode_reg (mode
, cmp
);
6140 src
= copy_to_mode_reg (mode
, src
);
6143 pat
= gen_atomic_compare_and_swapsi_1 (target
, mem
, cmp
, src
, const0_rtx
);
6145 pat
= gen_atomic_compare_and_swapdi_1 (target
, mem
, cmp
, src
, const0_rtx
);
/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_VECTOR_ADDR,
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MEMBAR_GL,
  NVPTX_BUILTIN_MEMBAR_CTA,
  NVPTX_BUILTIN_BAR_RED_AND,
  NVPTX_BUILTIN_BAR_RED_OR,
  NVPTX_BUILTIN_BAR_RED_POPC,
  NVPTX_BUILTIN_MAX
};
6170 /* Expander for 'bar.red' instruction builtins. */
6173 nvptx_expand_bar_red (tree exp
, rtx target
,
6174 machine_mode
ARG_UNUSED (m
), int ARG_UNUSED (ignore
))
6176 int code
= DECL_MD_FUNCTION_CODE (TREE_OPERAND (CALL_EXPR_FN (exp
), 0));
6177 machine_mode mode
= TYPE_MODE (TREE_TYPE (exp
));
6180 target
= gen_reg_rtx (mode
);
6183 rtx bar
= expand_expr (CALL_EXPR_ARG (exp
, 0),
6184 NULL_RTX
, SImode
, EXPAND_NORMAL
);
6185 rtx nthr
= expand_expr (CALL_EXPR_ARG (exp
, 1),
6186 NULL_RTX
, SImode
, EXPAND_NORMAL
);
6187 rtx cpl
= expand_expr (CALL_EXPR_ARG (exp
, 2),
6188 NULL_RTX
, SImode
, EXPAND_NORMAL
);
6189 rtx redop
= expand_expr (CALL_EXPR_ARG (exp
, 3),
6190 NULL_RTX
, SImode
, EXPAND_NORMAL
);
6191 if (CONST_INT_P (bar
))
6193 if (INTVAL (bar
) < 0 || INTVAL (bar
) > 15)
6195 error_at (EXPR_LOCATION (exp
),
6196 "barrier value must be within [0,15]");
6200 else if (!REG_P (bar
))
6201 bar
= copy_to_mode_reg (SImode
, bar
);
6203 if (!CONST_INT_P (nthr
) && !REG_P (nthr
))
6204 nthr
= copy_to_mode_reg (SImode
, nthr
);
6206 if (!CONST_INT_P (cpl
))
6208 error_at (EXPR_LOCATION (exp
),
6209 "complement argument must be constant");
6213 pred
= gen_reg_rtx (BImode
);
6215 redop
= copy_to_mode_reg (SImode
, redop
);
6216 emit_insn (gen_rtx_SET (pred
, gen_rtx_NE (BImode
, redop
, GEN_INT (0))));
6222 case NVPTX_BUILTIN_BAR_RED_AND
:
6223 dst
= gen_reg_rtx (BImode
);
6224 pat
= gen_nvptx_barred_and (dst
, bar
, nthr
, cpl
, redop
);
6226 case NVPTX_BUILTIN_BAR_RED_OR
:
6227 dst
= gen_reg_rtx (BImode
);
6228 pat
= gen_nvptx_barred_or (dst
, bar
, nthr
, cpl
, redop
);
6230 case NVPTX_BUILTIN_BAR_RED_POPC
:
6231 dst
= gen_reg_rtx (SImode
);
6232 pat
= gen_nvptx_barred_popc (dst
, bar
, nthr
, cpl
, redop
);
6238 if (GET_MODE (dst
) == BImode
)
6240 rtx tmp
= gen_reg_rtx (mode
);
6241 emit_insn (gen_rtx_SET (tmp
, gen_rtx_NE (mode
, dst
, GEN_INT (0))));
6244 emit_move_insn (target
, dst
);
6248 static GTY(()) tree nvptx_builtin_decls
[NVPTX_BUILTIN_MAX
];
6250 /* Return the NVPTX builtin for CODE. */
6253 nvptx_builtin_decl (unsigned code
, bool ARG_UNUSED (initialize_p
))
6255 if (code
>= NVPTX_BUILTIN_MAX
)
6256 return error_mark_node
;
6258 return nvptx_builtin_decls
[code
];
6261 /* Set up all builtin functions for this target. */
/* Registers every __builtin_nvptx_* function: shuffles, shared-memory
   address computation, compare-and-swap, memory barriers, and the
   bar.red reduction builtins.  Each DEF stores the decl into
   nvptx_builtin_decls under its NVPTX_BUILTIN_* code.  */
6264 nvptx_init_builtins (void)
6266 #define DEF(ID, NAME, T) \
6267 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
6268 = add_builtin_function ("__builtin_nvptx_" NAME, \
6269 build_function_type_list T, \
6270 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
/* Shorthands for the tree type nodes used in the signatures below.
   NOTE(review): an ST shorthand is referenced by the *_addr builtins
   below but its #define was lost in extraction — confirm its expansion
   against the original file.  */
6272 #define UINT unsigned_type_node
6273 #define LLUINT long_long_unsigned_type_node
6274 #define PTRVOID ptr_type_node
6275 #define VOID void_type_node
/* Warp shuffle: 32-bit and 64-bit variants.  */
6277 DEF (SHUFFLE
, "shuffle", (UINT
, UINT
, UINT
, UINT
, NULL_TREE
));
6278 DEF (SHUFFLELL
, "shufflell", (LLUINT
, LLUINT
, UINT
, UINT
, NULL_TREE
));
/* Address of a slot in the worker/vector reduction buffer.  */
6279 DEF (WORKER_ADDR
, "worker_addr",
6280 (PTRVOID
, ST
, UINT
, UINT
, NULL_TREE
));
6281 DEF (VECTOR_ADDR
, "vector_addr",
6282 (PTRVOID
, ST
, UINT
, UINT
, NULL_TREE
));
/* Atomic compare-and-swap, 32-bit and 64-bit.  */
6283 DEF (CMP_SWAP
, "cmp_swap", (UINT
, PTRVOID
, UINT
, UINT
, NULL_TREE
));
6284 DEF (CMP_SWAPLL
, "cmp_swapll", (LLUINT
, PTRVOID
, LLUINT
, LLUINT
, NULL_TREE
));
/* Memory barriers: device-wide (gl) and CTA-wide (cta).  */
6285 DEF (MEMBAR_GL
, "membar_gl", (VOID
, VOID
, NULL_TREE
));
6286 DEF (MEMBAR_CTA
, "membar_cta", (VOID
, VOID
, NULL_TREE
));
/* bar.red barrier-with-reduction builtins (and/or/popc).  */
6288 DEF (BAR_RED_AND
, "bar_red_and",
6289 (UINT
, UINT
, UINT
, UINT
, UINT
, NULL_TREE
));
6290 DEF (BAR_RED_OR
, "bar_red_or",
6291 (UINT
, UINT
, UINT
, UINT
, UINT
, NULL_TREE
));
6292 DEF (BAR_RED_POPC
, "bar_red_popc",
6293 (UINT
, UINT
, UINT
, UINT
, UINT
, NULL_TREE
));
6302 /* Expand an expression EXP that calls a built-in function,
6303 with result going to TARGET if that's convenient
6304 (and in mode MODE if that's convenient).
6305 SUBTARGET may be used as the target for computing one of EXP's operands.
6306 IGNORE is nonzero if the value is to be ignored. */
6309 nvptx_expand_builtin (tree exp
, rtx target
, rtx
ARG_UNUSED (subtarget
),
6310 machine_mode mode
, int ignore
)
6312 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
6313 switch (DECL_MD_FUNCTION_CODE (fndecl
))
6315 case NVPTX_BUILTIN_SHUFFLE
:
6316 case NVPTX_BUILTIN_SHUFFLELL
:
6317 return nvptx_expand_shuffle (exp
, target
, mode
, ignore
);
6319 case NVPTX_BUILTIN_WORKER_ADDR
:
6320 return nvptx_expand_shared_addr (exp
, target
, mode
, ignore
, false);
6322 case NVPTX_BUILTIN_VECTOR_ADDR
:
6323 return nvptx_expand_shared_addr (exp
, target
, mode
, ignore
, true);
6325 case NVPTX_BUILTIN_CMP_SWAP
:
6326 case NVPTX_BUILTIN_CMP_SWAPLL
:
6327 return nvptx_expand_cmp_swap (exp
, target
, mode
, ignore
);
6329 case NVPTX_BUILTIN_MEMBAR_GL
:
6330 emit_insn (gen_nvptx_membar_gl ());
6333 case NVPTX_BUILTIN_MEMBAR_CTA
:
6334 emit_insn (gen_nvptx_membar_cta ());
6337 case NVPTX_BUILTIN_BAR_RED_AND
:
6338 case NVPTX_BUILTIN_BAR_RED_OR
:
6339 case NVPTX_BUILTIN_BAR_RED_POPC
:
6340 return nvptx_expand_bar_red (exp
, target
, mode
, ignore
);
6342 default: gcc_unreachable ();
6346 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
6351 return PTX_WARP_SIZE
;
6354 /* Return 1 if TRAIT NAME is present in the OpenMP context's
6355 device trait set, return 0 if not present in any OpenMP context in the
6356 whole translation unit, or -1 if not present in the current OpenMP context
6357 but might be present in another OpenMP context in the same TU. */
/* Implements the OpenMP device kind/arch/isa selector hook: kind
   matches "gpu", arch matches "nvptx", and isa matches the sm_XX names
   enumerated by nvptx-sm.def against the compiled-for PTX ISA option.  */
6360 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait
,
6365 case omp_device_kind
:
6366 return strcmp (name
, "gpu") == 0;
6367 case omp_device_arch
:
6368 return strcmp (name
, "nvptx") == 0;
6369 case omp_device_isa
:
/* Expand one comparison per supported sm_XX ISA via nvptx-sm.def; the
   match returns whether that ISA is the one selected by -misa.  */
6370 #define NVPTX_SM(XX, SEP) \
6372 if (strcmp (name, "sm_" #XX) == 0) \
6373 return ptx_isa_option == PTX_ISA_SM ## XX; \
6375 #include "nvptx-sm.def"
6384 nvptx_welformed_vector_length_p (int l
)
6387 return l
% PTX_WARP_SIZE
== 0;
6391 nvptx_apply_dim_limits (int dims
[])
6393 /* Check that the vector_length is not too large. */
6394 if (dims
[GOMP_DIM_VECTOR
] > PTX_MAX_VECTOR_LENGTH
)
6395 dims
[GOMP_DIM_VECTOR
] = PTX_MAX_VECTOR_LENGTH
;
6397 /* Check that the number of workers is not too large. */
6398 if (dims
[GOMP_DIM_WORKER
] > PTX_WORKER_LENGTH
)
6399 dims
[GOMP_DIM_WORKER
] = PTX_WORKER_LENGTH
;
6401 /* Ensure that num_worker * vector_length <= cta size. */
6402 if (dims
[GOMP_DIM_WORKER
] > 0 && dims
[GOMP_DIM_VECTOR
] > 0
6403 && dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
] > PTX_CTA_SIZE
)
6404 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
6406 /* If we need a per-worker barrier ... . */
6407 if (dims
[GOMP_DIM_WORKER
] > 0 && dims
[GOMP_DIM_VECTOR
] > 0
6408 && dims
[GOMP_DIM_VECTOR
] > PTX_WARP_SIZE
)
6409 /* Don't use more barriers than available. */
6410 dims
[GOMP_DIM_WORKER
] = MIN (dims
[GOMP_DIM_WORKER
],
6411 PTX_NUM_PER_WORKER_BARRIERS
);
6414 /* Return true if FNDECL contains calls to vector-partitionable routines. */
6417 has_vector_partitionable_routine_calls_p (tree fndecl
)
6423 FOR_EACH_BB_FN (bb
, DECL_STRUCT_FUNCTION (fndecl
))
6424 for (gimple_stmt_iterator i
= gsi_start_bb (bb
); !gsi_end_p (i
);
6425 gsi_next_nondebug (&i
))
6427 gimple
*stmt
= gsi_stmt (i
);
6428 if (gimple_code (stmt
) != GIMPLE_CALL
)
6431 tree callee
= gimple_call_fndecl (stmt
);
6435 tree attrs
= oacc_get_fn_attrib (callee
);
6436 if (attrs
== NULL_TREE
)
6439 int partition_level
= oacc_fn_attrib_level (attrs
);
6440 bool seq_routine_p
= partition_level
== GOMP_DIM_MAX
;
6448 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
6449 DIMS has changed. */
6452 nvptx_goacc_validate_dims_1 (tree decl
, int dims
[], int fn_level
, unsigned used
)
6454 bool oacc_default_dims_p
= false;
6455 bool oacc_min_dims_p
= false;
6456 bool offload_region_p
= false;
6457 bool routine_p
= false;
6458 bool routine_seq_p
= false;
6459 int default_vector_length
= -1;
6461 if (decl
== NULL_TREE
)
6464 oacc_default_dims_p
= true;
6465 else if (fn_level
== -2)
6466 oacc_min_dims_p
= true;
6470 else if (fn_level
== -1)
6471 offload_region_p
= true;
6472 else if (0 <= fn_level
&& fn_level
<= GOMP_DIM_MAX
)
6475 routine_seq_p
= fn_level
== GOMP_DIM_MAX
;
6480 if (oacc_min_dims_p
)
6482 gcc_assert (dims
[GOMP_DIM_VECTOR
] == 1);
6483 gcc_assert (dims
[GOMP_DIM_WORKER
] == 1);
6484 gcc_assert (dims
[GOMP_DIM_GANG
] == 1);
6486 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
6493 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
6498 if (oacc_default_dims_p
)
6501 0 : set at runtime, f.i. -fopenacc-dims=-
6502 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
6503 gcc_assert (dims
[GOMP_DIM_VECTOR
] >= -1);
6504 gcc_assert (dims
[GOMP_DIM_WORKER
] >= -1);
6505 gcc_assert (dims
[GOMP_DIM_GANG
] >= -1);
6507 /* But -fopenacc-dims=- is not yet supported on trunk. */
6508 gcc_assert (dims
[GOMP_DIM_VECTOR
] != 0);
6509 gcc_assert (dims
[GOMP_DIM_WORKER
] != 0);
6510 gcc_assert (dims
[GOMP_DIM_GANG
] != 0);
6513 if (offload_region_p
)
6516 0 : set using variable, f.i. num_gangs (n)
6517 >= 1: set using constant, f.i. num_gangs (1). */
6518 gcc_assert (dims
[GOMP_DIM_VECTOR
] >= -1);
6519 gcc_assert (dims
[GOMP_DIM_WORKER
] >= -1);
6520 gcc_assert (dims
[GOMP_DIM_GANG
] >= -1);
6523 if (offload_region_p
)
6524 default_vector_length
= oacc_get_default_dim (GOMP_DIM_VECTOR
);
6526 /* oacc_default_dims_p. */
6527 default_vector_length
= PTX_DEFAULT_VECTOR_LENGTH
;
6529 int old_dims
[GOMP_DIM_MAX
];
6531 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
6532 old_dims
[i
] = dims
[i
];
6534 const char *vector_reason
= NULL
;
6535 if (offload_region_p
&& has_vector_partitionable_routine_calls_p (decl
))
6537 default_vector_length
= PTX_WARP_SIZE
;
6539 if (dims
[GOMP_DIM_VECTOR
] > PTX_WARP_SIZE
)
6541 vector_reason
= G_("using %<vector_length (%d)%> due to call to"
6542 " vector-partitionable routine, ignoring %d");
6543 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
6547 if (dims
[GOMP_DIM_VECTOR
] == 0)
6549 vector_reason
= G_("using %<vector_length (%d)%>, ignoring runtime setting");
6550 dims
[GOMP_DIM_VECTOR
] = default_vector_length
;
6553 if (dims
[GOMP_DIM_VECTOR
] > 0
6554 && !nvptx_welformed_vector_length_p (dims
[GOMP_DIM_VECTOR
]))
6555 dims
[GOMP_DIM_VECTOR
] = default_vector_length
;
6557 nvptx_apply_dim_limits (dims
);
6559 if (dims
[GOMP_DIM_VECTOR
] != old_dims
[GOMP_DIM_VECTOR
])
6560 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
6561 vector_reason
!= NULL
6563 : G_("using %<vector_length (%d)%>, ignoring %d"),
6564 dims
[GOMP_DIM_VECTOR
], old_dims
[GOMP_DIM_VECTOR
]);
6566 if (dims
[GOMP_DIM_WORKER
] != old_dims
[GOMP_DIM_WORKER
])
6567 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
6568 G_("using %<num_workers (%d)%>, ignoring %d"),
6569 dims
[GOMP_DIM_WORKER
], old_dims
[GOMP_DIM_WORKER
]);
6571 if (oacc_default_dims_p
)
6573 if (dims
[GOMP_DIM_VECTOR
] < 0)
6574 dims
[GOMP_DIM_VECTOR
] = default_vector_length
;
6575 if (dims
[GOMP_DIM_WORKER
] < 0)
6576 dims
[GOMP_DIM_WORKER
] = PTX_DEFAULT_RUNTIME_DIM
;
6577 if (dims
[GOMP_DIM_GANG
] < 0)
6578 dims
[GOMP_DIM_GANG
] = PTX_DEFAULT_RUNTIME_DIM
;
6579 nvptx_apply_dim_limits (dims
);
6582 if (offload_region_p
)
6584 for (i
= 0; i
< GOMP_DIM_MAX
; i
++)
6589 if ((used
& GOMP_DIM_MASK (i
)) == 0)
6590 /* Function oacc_validate_dims will apply the minimal dimension. */
6593 dims
[i
] = (i
== GOMP_DIM_VECTOR
6594 ? default_vector_length
6595 : oacc_get_default_dim (i
));
6598 nvptx_apply_dim_limits (dims
);
6602 /* Validate compute dimensions of an OpenACC offload or routine, fill
6603 in non-unity defaults. FN_LEVEL indicates the level at which a
6604 routine might spawn a loop. It is negative for non-routines. If
6605 DECL is null, we are validating the default dimensions. */
6608 nvptx_goacc_validate_dims (tree decl
, int dims
[], int fn_level
, unsigned used
)
6610 int old_dims
[GOMP_DIM_MAX
];
6613 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
6614 old_dims
[i
] = dims
[i
];
6616 nvptx_goacc_validate_dims_1 (decl
, dims
, fn_level
, used
);
6618 gcc_assert (dims
[GOMP_DIM_VECTOR
] != 0);
6619 if (dims
[GOMP_DIM_WORKER
] > 0 && dims
[GOMP_DIM_VECTOR
] > 0)
6620 gcc_assert (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
] <= PTX_CTA_SIZE
);
6622 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
6623 if (old_dims
[i
] != dims
[i
])
6629 /* Return maximum dimension size, or zero for unbounded. */
6632 nvptx_dim_limit (int axis
)
6636 case GOMP_DIM_VECTOR
:
6637 return PTX_MAX_VECTOR_LENGTH
;
6645 /* Determine whether fork & joins are needed. */
6648 nvptx_goacc_fork_join (gcall
*call
, const int dims
[],
6649 bool ARG_UNUSED (is_fork
))
6651 tree arg
= gimple_call_arg (call
, 2);
6652 unsigned axis
= TREE_INT_CST_LOW (arg
);
6654 /* We only care about worker and vector partitioning. */
6655 if (axis
< GOMP_DIM_WORKER
)
6658 /* If the size is 1, there's no partitioning. */
6659 if (dims
[axis
] == 1)
6665 /* Generate a PTX builtin function call that returns the address in
6666 the worker reduction buffer at OFFSET. TYPE is the type of the
6667 data at that location. */
6670 nvptx_get_shared_red_addr (tree type
, tree offset
, bool vector
)
6672 enum nvptx_builtins addr_dim
= NVPTX_BUILTIN_WORKER_ADDR
;
6674 addr_dim
= NVPTX_BUILTIN_VECTOR_ADDR
;
6675 machine_mode mode
= TYPE_MODE (type
);
6676 tree fndecl
= nvptx_builtin_decl (addr_dim
, true);
6677 tree size
= build_int_cst (unsigned_type_node
, GET_MODE_SIZE (mode
));
6678 tree align
= build_int_cst (unsigned_type_node
,
6679 GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
);
6680 tree call
= build_call_expr (fndecl
, 3, offset
, size
, align
);
6682 return fold_convert (build_pointer_type (type
), call
);
6685 /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
6686 will cast the variable if necessary. */
6689 nvptx_generate_vector_shuffle (location_t loc
,
6690 tree dest_var
, tree var
, unsigned shift
,
6693 unsigned fn
= NVPTX_BUILTIN_SHUFFLE
;
6694 tree_code code
= NOP_EXPR
;
6695 tree arg_type
= unsigned_type_node
;
6696 tree var_type
= TREE_TYPE (var
);
6697 tree dest_type
= var_type
;
6699 if (TREE_CODE (var_type
) == COMPLEX_TYPE
)
6700 var_type
= TREE_TYPE (var_type
);
6702 if (TREE_CODE (var_type
) == REAL_TYPE
)
6703 code
= VIEW_CONVERT_EXPR
;
6705 if (TYPE_SIZE (var_type
)
6706 == TYPE_SIZE (long_long_unsigned_type_node
))
6708 fn
= NVPTX_BUILTIN_SHUFFLELL
;
6709 arg_type
= long_long_unsigned_type_node
;
6712 tree call
= nvptx_builtin_decl (fn
, true);
6713 tree bits
= build_int_cst (unsigned_type_node
, shift
);
6714 tree kind
= build_int_cst (unsigned_type_node
, SHUFFLE_DOWN
);
6717 if (var_type
!= dest_type
)
6719 /* Do real and imaginary parts separately. */
6720 tree real
= fold_build1 (REALPART_EXPR
, var_type
, var
);
6721 real
= fold_build1 (code
, arg_type
, real
);
6722 real
= build_call_expr_loc (loc
, call
, 3, real
, bits
, kind
);
6723 real
= fold_build1 (code
, var_type
, real
);
6725 tree imag
= fold_build1 (IMAGPART_EXPR
, var_type
, var
);
6726 imag
= fold_build1 (code
, arg_type
, imag
);
6727 imag
= build_call_expr_loc (loc
, call
, 3, imag
, bits
, kind
);
6728 imag
= fold_build1 (code
, var_type
, imag
);
6730 expr
= fold_build2 (COMPLEX_EXPR
, dest_type
, real
, imag
);
6734 expr
= fold_build1 (code
, arg_type
, var
);
6735 expr
= build_call_expr_loc (loc
, call
, 3, expr
, bits
, kind
);
6736 expr
= fold_build1 (code
, dest_type
, expr
);
6739 gimplify_assign (dest_var
, expr
, seq
);
6742 /* Lazily generate the global lock var decl and return its address. */
6745 nvptx_global_lock_addr ()
6747 tree v
= global_lock_var
;
6751 tree name
= get_identifier ("__reduction_lock");
6752 tree type
= build_qualified_type (unsigned_type_node
,
6753 TYPE_QUAL_VOLATILE
);
6754 v
= build_decl (BUILTINS_LOCATION
, VAR_DECL
, name
, type
);
6755 global_lock_var
= v
;
6756 DECL_ARTIFICIAL (v
) = 1;
6757 DECL_EXTERNAL (v
) = 1;
6758 TREE_STATIC (v
) = 1;
6759 TREE_PUBLIC (v
) = 1;
6761 mark_addressable (v
);
6762 mark_decl_referenced (v
);
6765 return build_fold_addr_expr (v
);
6768 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
6769 GSI. We use a lockless scheme for nearly all case, which looks
6771 actual = initval(OP);
6774 write = guess OP myval;
6775 actual = cmp&swap (ptr, guess, write)
6776 } while (actual bit-different-to guess);
6779 This relies on a cmp&swap instruction, which is available for 32-
6780 and 64-bit types. Larger types must use a locking scheme. */
6783 nvptx_lockless_update (location_t loc
, gimple_stmt_iterator
*gsi
,
6784 tree ptr
, tree var
, tree_code op
)
6786 unsigned fn
= NVPTX_BUILTIN_CMP_SWAP
;
6787 tree_code code
= NOP_EXPR
;
6788 tree arg_type
= unsigned_type_node
;
6789 tree var_type
= TREE_TYPE (var
);
6791 if (TREE_CODE (var_type
) == COMPLEX_TYPE
6792 || TREE_CODE (var_type
) == REAL_TYPE
)
6793 code
= VIEW_CONVERT_EXPR
;
6795 if (TYPE_SIZE (var_type
) == TYPE_SIZE (long_long_unsigned_type_node
))
6797 arg_type
= long_long_unsigned_type_node
;
6798 fn
= NVPTX_BUILTIN_CMP_SWAPLL
;
6801 tree swap_fn
= nvptx_builtin_decl (fn
, true);
6803 gimple_seq init_seq
= NULL
;
6804 tree init_var
= make_ssa_name (arg_type
);
6805 tree init_expr
= omp_reduction_init_op (loc
, op
, var_type
);
6806 init_expr
= fold_build1 (code
, arg_type
, init_expr
);
6807 gimplify_assign (init_var
, init_expr
, &init_seq
);
6808 gimple
*init_end
= gimple_seq_last (init_seq
);
6810 gsi_insert_seq_before (gsi
, init_seq
, GSI_SAME_STMT
);
6812 /* Split the block just after the init stmts. */
6813 basic_block pre_bb
= gsi_bb (*gsi
);
6814 edge pre_edge
= split_block (pre_bb
, init_end
);
6815 basic_block loop_bb
= pre_edge
->dest
;
6816 pre_bb
= pre_edge
->src
;
6817 /* Reset the iterator. */
6818 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
6820 tree expect_var
= make_ssa_name (arg_type
);
6821 tree actual_var
= make_ssa_name (arg_type
);
6822 tree write_var
= make_ssa_name (arg_type
);
6824 /* Build and insert the reduction calculation. */
6825 gimple_seq red_seq
= NULL
;
6826 tree write_expr
= fold_build1 (code
, var_type
, expect_var
);
6827 write_expr
= fold_build2 (op
, var_type
, write_expr
, var
);
6828 write_expr
= fold_build1 (code
, arg_type
, write_expr
);
6829 gimplify_assign (write_var
, write_expr
, &red_seq
);
6831 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
6833 /* Build & insert the cmp&swap sequence. */
6834 gimple_seq latch_seq
= NULL
;
6835 tree swap_expr
= build_call_expr_loc (loc
, swap_fn
, 3,
6836 ptr
, expect_var
, write_var
);
6837 gimplify_assign (actual_var
, swap_expr
, &latch_seq
);
6839 gcond
*cond
= gimple_build_cond (EQ_EXPR
, actual_var
, expect_var
,
6840 NULL_TREE
, NULL_TREE
);
6841 gimple_seq_add_stmt (&latch_seq
, cond
);
6843 gimple
*latch_end
= gimple_seq_last (latch_seq
);
6844 gsi_insert_seq_before (gsi
, latch_seq
, GSI_SAME_STMT
);
6846 /* Split the block just after the latch stmts. */
6847 edge post_edge
= split_block (loop_bb
, latch_end
);
6848 basic_block post_bb
= post_edge
->dest
;
6849 loop_bb
= post_edge
->src
;
6850 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
6852 post_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
6853 post_edge
->probability
= profile_probability::even ();
6854 edge loop_edge
= make_edge (loop_bb
, loop_bb
, EDGE_FALSE_VALUE
);
6855 loop_edge
->probability
= profile_probability::even ();
6856 set_immediate_dominator (CDI_DOMINATORS
, loop_bb
, pre_bb
);
6857 set_immediate_dominator (CDI_DOMINATORS
, post_bb
, loop_bb
);
6859 gphi
*phi
= create_phi_node (expect_var
, loop_bb
);
6860 add_phi_arg (phi
, init_var
, pre_edge
, loc
);
6861 add_phi_arg (phi
, actual_var
, loop_edge
, loc
);
6863 loop
*loop
= alloc_loop ();
6864 loop
->header
= loop_bb
;
6865 loop
->latch
= loop_bb
;
6866 add_loop (loop
, loop_bb
->loop_father
);
6868 return fold_build1 (code
, var_type
, write_var
);
6871 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
6872 GSI. This is necessary for types larger than 64 bits, where there
6873 is no cmp&swap instruction to implement a lockless scheme. We use
6874 a lock variable in global memory.
6876 while (cmp&swap (&lock_var, 0, 1))
6879 accum = accum OP var;
6881 cmp&swap (&lock_var, 1, 0);
6884 A lock in global memory is necessary to force execution engine
6885 descheduling and avoid resource starvation that can occur if the
6886 lock is in .shared memory. */
6889 nvptx_lockfull_update (location_t loc
, gimple_stmt_iterator
*gsi
,
6890 tree ptr
, tree var
, tree_code op
, int level
)
6892 tree var_type
= TREE_TYPE (var
);
6893 tree swap_fn
= nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP
, true);
6894 tree uns_unlocked
= build_int_cst (unsigned_type_node
, 0);
6895 tree uns_locked
= build_int_cst (unsigned_type_node
, 1);
6897 /* Split the block just before the gsi. Insert a gimple nop to make
6899 gimple
*nop
= gimple_build_nop ();
6900 gsi_insert_before (gsi
, nop
, GSI_SAME_STMT
);
6901 basic_block entry_bb
= gsi_bb (*gsi
);
6902 edge entry_edge
= split_block (entry_bb
, nop
);
6903 basic_block lock_bb
= entry_edge
->dest
;
6904 /* Reset the iterator. */
6905 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
6907 /* Build and insert the locking sequence. */
6908 gimple_seq lock_seq
= NULL
;
6909 tree lock_var
= make_ssa_name (unsigned_type_node
);
6910 tree lock_expr
= nvptx_global_lock_addr ();
6911 lock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, lock_expr
,
6912 uns_unlocked
, uns_locked
);
6913 gimplify_assign (lock_var
, lock_expr
, &lock_seq
);
6914 gcond
*cond
= gimple_build_cond (EQ_EXPR
, lock_var
, uns_unlocked
,
6915 NULL_TREE
, NULL_TREE
);
6916 gimple_seq_add_stmt (&lock_seq
, cond
);
6917 gimple
*lock_end
= gimple_seq_last (lock_seq
);
6918 gsi_insert_seq_before (gsi
, lock_seq
, GSI_SAME_STMT
);
6920 /* Split the block just after the lock sequence. */
6921 edge locked_edge
= split_block (lock_bb
, lock_end
);
6922 basic_block update_bb
= locked_edge
->dest
;
6923 lock_bb
= locked_edge
->src
;
6924 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
6926 /* Create the lock loop ... */
6927 locked_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
6928 locked_edge
->probability
= profile_probability::even ();
6929 edge loop_edge
= make_edge (lock_bb
, lock_bb
, EDGE_FALSE_VALUE
);
6930 loop_edge
->probability
= profile_probability::even ();
6931 set_immediate_dominator (CDI_DOMINATORS
, lock_bb
, entry_bb
);
6932 set_immediate_dominator (CDI_DOMINATORS
, update_bb
, lock_bb
);
6934 /* ... and the loop structure. */
6935 loop
*lock_loop
= alloc_loop ();
6936 lock_loop
->header
= lock_bb
;
6937 lock_loop
->latch
= lock_bb
;
6938 lock_loop
->nb_iterations_estimate
= 1;
6939 lock_loop
->any_estimate
= true;
6940 add_loop (lock_loop
, entry_bb
->loop_father
);
6942 /* Build the pre-barrier. */
6943 gimple_seq red_seq
= NULL
;
6944 enum nvptx_builtins barrier_builtin
6945 = (level
== GOMP_DIM_GANG
6946 ? NVPTX_BUILTIN_MEMBAR_GL
6947 : NVPTX_BUILTIN_MEMBAR_CTA
);
6948 tree barrier_fn
= nvptx_builtin_decl (barrier_builtin
, true);
6949 tree barrier_expr
= build_call_expr_loc (loc
, barrier_fn
, 0);
6950 gimplify_stmt (&barrier_expr
, &red_seq
);
6952 /* Build the reduction calculation. */
6953 tree acc_in
= make_ssa_name (var_type
);
6954 tree ref_in
= build_simple_mem_ref (ptr
);
6955 TREE_THIS_VOLATILE (ref_in
) = 1;
6956 gimplify_assign (acc_in
, ref_in
, &red_seq
);
6958 tree acc_out
= make_ssa_name (var_type
);
6959 tree update_expr
= fold_build2 (op
, var_type
, ref_in
, var
);
6960 gimplify_assign (acc_out
, update_expr
, &red_seq
);
6962 tree ref_out
= build_simple_mem_ref (ptr
);
6963 TREE_THIS_VOLATILE (ref_out
) = 1;
6964 gimplify_assign (ref_out
, acc_out
, &red_seq
);
6966 /* Build the post-barrier. */
6967 barrier_expr
= build_call_expr_loc (loc
, barrier_fn
, 0);
6968 gimplify_stmt (&barrier_expr
, &red_seq
);
6970 /* Insert the reduction calculation. */
6971 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
6973 /* Build & insert the unlock sequence. */
6974 gimple_seq unlock_seq
= NULL
;
6975 tree unlock_expr
= nvptx_global_lock_addr ();
6976 unlock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, unlock_expr
,
6977 uns_locked
, uns_unlocked
);
6978 gimplify_and_add (unlock_expr
, &unlock_seq
);
6979 gsi_insert_seq_before (gsi
, unlock_seq
, GSI_SAME_STMT
);
6984 /* Emit a sequence to update a reduction accumlator at *PTR with the
6985 value held in VAR using operator OP. Return the updated value.
6987 TODO: optimize for atomic ops and indepedent complex ops. */
6990 nvptx_reduction_update (location_t loc
, gimple_stmt_iterator
*gsi
,
6991 tree ptr
, tree var
, tree_code op
, int level
)
6993 tree type
= TREE_TYPE (var
);
6994 tree size
= TYPE_SIZE (type
);
6996 if (size
== TYPE_SIZE (unsigned_type_node
)
6997 || size
== TYPE_SIZE (long_long_unsigned_type_node
))
6998 return nvptx_lockless_update (loc
, gsi
, ptr
, var
, op
);
7000 return nvptx_lockfull_update (loc
, gsi
, ptr
, var
, op
, level
);
7003 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
7006 nvptx_goacc_reduction_setup (gcall
*call
, offload_attrs
*oa
)
7008 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
7009 tree lhs
= gimple_call_lhs (call
);
7010 tree var
= gimple_call_arg (call
, 2);
7011 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
7012 gimple_seq seq
= NULL
;
7014 push_gimplify_context (true);
7016 if (level
!= GOMP_DIM_GANG
)
7018 /* Copy the receiver object. */
7019 tree ref_to_res
= gimple_call_arg (call
, 1);
7021 if (!integer_zerop (ref_to_res
))
7022 var
= build_simple_mem_ref (ref_to_res
);
7025 if (level
== GOMP_DIM_WORKER
7026 || (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
> PTX_WARP_SIZE
))
7028 /* Store incoming value to worker reduction buffer. */
7029 tree offset
= gimple_call_arg (call
, 5);
7030 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
7031 level
== GOMP_DIM_VECTOR
);
7032 tree ptr
= make_ssa_name (TREE_TYPE (call
));
7034 gimplify_assign (ptr
, call
, &seq
);
7035 tree ref
= build_simple_mem_ref (ptr
);
7036 TREE_THIS_VOLATILE (ref
) = 1;
7037 gimplify_assign (ref
, var
, &seq
);
7041 gimplify_assign (lhs
, var
, &seq
);
7043 pop_gimplify_context (NULL
);
7044 gsi_replace_with_seq (&gsi
, seq
, true);
7047 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
7050 nvptx_goacc_reduction_init (gcall
*call
, offload_attrs
*oa
)
7052 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
7053 tree lhs
= gimple_call_lhs (call
);
7054 tree var
= gimple_call_arg (call
, 2);
7055 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
7056 enum tree_code rcode
7057 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
7058 tree init
= omp_reduction_init_op (gimple_location (call
), rcode
,
7060 gimple_seq seq
= NULL
;
7062 push_gimplify_context (true);
7064 if (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
== PTX_WARP_SIZE
)
7066 /* Initialize vector-non-zeroes to INIT_VAL (OP). */
7067 tree tid
= make_ssa_name (integer_type_node
);
7068 tree dim_vector
= gimple_call_arg (call
, 3);
7069 gimple
*tid_call
= gimple_build_call_internal (IFN_GOACC_DIM_POS
, 1,
7071 gimple
*cond_stmt
= gimple_build_cond (NE_EXPR
, tid
, integer_zero_node
,
7072 NULL_TREE
, NULL_TREE
);
7074 gimple_call_set_lhs (tid_call
, tid
);
7075 gimple_seq_add_stmt (&seq
, tid_call
);
7076 gimple_seq_add_stmt (&seq
, cond_stmt
);
7078 /* Split the block just after the call. */
7079 edge init_edge
= split_block (gsi_bb (gsi
), call
);
7080 basic_block init_bb
= init_edge
->dest
;
7081 basic_block call_bb
= init_edge
->src
;
7083 /* Fixup flags from call_bb to init_bb. */
7084 init_edge
->flags
^= EDGE_FALLTHRU
| EDGE_TRUE_VALUE
;
7085 init_edge
->probability
= profile_probability::even ();
7087 /* Set the initialization stmts. */
7088 gimple_seq init_seq
= NULL
;
7089 tree init_var
= make_ssa_name (TREE_TYPE (var
));
7090 gimplify_assign (init_var
, init
, &init_seq
);
7091 gsi
= gsi_start_bb (init_bb
);
7092 gsi_insert_seq_before (&gsi
, init_seq
, GSI_SAME_STMT
);
7094 /* Split block just after the init stmt. */
7096 edge inited_edge
= split_block (gsi_bb (gsi
), gsi_stmt (gsi
));
7097 basic_block dst_bb
= inited_edge
->dest
;
7099 /* Create false edge from call_bb to dst_bb. */
7100 edge nop_edge
= make_edge (call_bb
, dst_bb
, EDGE_FALSE_VALUE
);
7101 nop_edge
->probability
= profile_probability::even ();
7103 /* Create phi node in dst block. */
7104 gphi
*phi
= create_phi_node (lhs
, dst_bb
);
7105 add_phi_arg (phi
, init_var
, inited_edge
, gimple_location (call
));
7106 add_phi_arg (phi
, var
, nop_edge
, gimple_location (call
));
7108 /* Reset dominator of dst bb. */
7109 set_immediate_dominator (CDI_DOMINATORS
, dst_bb
, call_bb
);
7111 /* Reset the gsi. */
7112 gsi
= gsi_for_stmt (call
);
7116 if (level
== GOMP_DIM_GANG
)
7118 /* If there's no receiver object, propagate the incoming VAR. */
7119 tree ref_to_res
= gimple_call_arg (call
, 1);
7120 if (integer_zerop (ref_to_res
))
7124 if (lhs
!= NULL_TREE
)
7125 gimplify_assign (lhs
, init
, &seq
);
7128 pop_gimplify_context (NULL
);
7129 gsi_replace_with_seq (&gsi
, seq
, true);
7132 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
7135 nvptx_goacc_reduction_fini (gcall
*call
, offload_attrs
*oa
)
7137 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
7138 tree lhs
= gimple_call_lhs (call
);
7139 tree ref_to_res
= gimple_call_arg (call
, 1);
7140 tree var
= gimple_call_arg (call
, 2);
7141 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
7143 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
7144 gimple_seq seq
= NULL
;
7147 push_gimplify_context (true);
7149 if (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
== PTX_WARP_SIZE
)
7151 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
7152 but that requires a method of emitting a unified jump at the
7154 for (int shfl
= PTX_WARP_SIZE
/ 2; shfl
> 0; shfl
= shfl
>> 1)
7156 tree other_var
= make_ssa_name (TREE_TYPE (var
));
7157 nvptx_generate_vector_shuffle (gimple_location (call
),
7158 other_var
, var
, shfl
, &seq
);
7160 r
= make_ssa_name (TREE_TYPE (var
));
7161 gimplify_assign (r
, fold_build2 (op
, TREE_TYPE (var
),
7162 var
, other_var
), &seq
);
7168 tree accum
= NULL_TREE
;
7170 if (level
== GOMP_DIM_WORKER
|| level
== GOMP_DIM_VECTOR
)
7172 /* Get reduction buffer address. */
7173 tree offset
= gimple_call_arg (call
, 5);
7174 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
7175 level
== GOMP_DIM_VECTOR
);
7176 tree ptr
= make_ssa_name (TREE_TYPE (call
));
7178 gimplify_assign (ptr
, call
, &seq
);
7181 else if (integer_zerop (ref_to_res
))
7188 /* UPDATE the accumulator. */
7189 gsi_insert_seq_before (&gsi
, seq
, GSI_SAME_STMT
);
7191 r
= nvptx_reduction_update (gimple_location (call
), &gsi
,
7192 accum
, var
, op
, level
);
7197 gimplify_assign (lhs
, r
, &seq
);
7198 pop_gimplify_context (NULL
);
7200 gsi_replace_with_seq (&gsi
, seq
, true);
7203 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
7206 nvptx_goacc_reduction_teardown (gcall
*call
, offload_attrs
*oa
)
7208 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
7209 tree lhs
= gimple_call_lhs (call
);
7210 tree var
= gimple_call_arg (call
, 2);
7211 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
7212 gimple_seq seq
= NULL
;
7214 push_gimplify_context (true);
7215 if (level
== GOMP_DIM_WORKER
7216 || (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
> PTX_WARP_SIZE
))
7218 /* Read the worker reduction buffer. */
7219 tree offset
= gimple_call_arg (call
, 5);
7220 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
7221 level
== GOMP_DIM_VECTOR
);
7222 tree ptr
= make_ssa_name (TREE_TYPE (call
));
7224 gimplify_assign (ptr
, call
, &seq
);
7225 var
= build_simple_mem_ref (ptr
);
7226 TREE_THIS_VOLATILE (var
) = 1;
7229 if (level
!= GOMP_DIM_GANG
)
7231 /* Write to the receiver object. */
7232 tree ref_to_res
= gimple_call_arg (call
, 1);
7234 if (!integer_zerop (ref_to_res
))
7235 gimplify_assign (build_simple_mem_ref (ref_to_res
), var
, &seq
);
7239 gimplify_assign (lhs
, var
, &seq
);
7241 pop_gimplify_context (NULL
);
7243 gsi_replace_with_seq (&gsi
, seq
, true);
7246 /* NVPTX reduction expander. */
7249 nvptx_goacc_reduction (gcall
*call
)
7251 unsigned code
= (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call
, 0));
7254 populate_offload_attrs (&oa
);
7258 case IFN_GOACC_REDUCTION_SETUP
:
7259 nvptx_goacc_reduction_setup (call
, &oa
);
7262 case IFN_GOACC_REDUCTION_INIT
:
7263 nvptx_goacc_reduction_init (call
, &oa
);
7266 case IFN_GOACC_REDUCTION_FINI
:
7267 nvptx_goacc_reduction_fini (call
, &oa
);
7270 case IFN_GOACC_REDUCTION_TEARDOWN
:
7271 nvptx_goacc_reduction_teardown (call
, &oa
);
7280 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
,
7281 rtx x ATTRIBUTE_UNUSED
)
7287 nvptx_scalar_mode_supported_p (scalar_mode mode
)
7289 if (nvptx_experimental
&& mode
== HFmode
&& TARGET_SM53
)
7292 return default_scalar_mode_supported_p (mode
);
7296 nvptx_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
7298 if (nvptx_experimental
&& mode
== HFmode
&& TARGET_SM53
)
7301 return default_libgcc_floating_mode_supported_p (mode
);
7305 nvptx_vector_mode_supported (machine_mode mode
)
7307 return (mode
== V2SImode
7308 || mode
== V2DImode
);
7311 /* Return the preferred mode for vectorizing scalar MODE. */
7314 nvptx_preferred_simd_mode (scalar_mode mode
)
7324 return default_preferred_simd_mode (mode
);
7329 nvptx_data_alignment (const_tree type
, unsigned int basic_align
)
7331 if (TREE_CODE (type
) == INTEGER_TYPE
)
7333 unsigned HOST_WIDE_INT size
= tree_to_uhwi (TYPE_SIZE_UNIT (type
));
7334 if (size
== GET_MODE_SIZE (TImode
))
7335 return GET_MODE_BITSIZE (maybe_split_mode (TImode
));
7341 /* Implement TARGET_MODES_TIEABLE_P. */
7344 nvptx_modes_tieable_p (machine_mode
, machine_mode
)
7349 /* Implement TARGET_HARD_REGNO_NREGS. */
7352 nvptx_hard_regno_nregs (unsigned int, machine_mode
)
7357 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
7360 nvptx_can_change_mode_class (machine_mode
, machine_mode
, reg_class_t
)
7365 /* Implement TARGET_TRULY_NOOP_TRUNCATION. */
7368 nvptx_truly_noop_truncation (poly_uint64
, poly_uint64
)
7373 /* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL. */
7376 nvptx_goacc_adjust_private_decl (location_t loc
, tree decl
, int level
)
7378 gcc_checking_assert (!lookup_attribute ("oacc gang-private",
7379 DECL_ATTRIBUTES (decl
)));
7381 /* Set "oacc gang-private" attribute for gang-private variable
7383 if (level
== GOMP_DIM_GANG
)
7385 tree id
= get_identifier ("oacc gang-private");
7386 /* For later diagnostic purposes, pass LOC as VALUE (wrapped as a
7388 tree loc_tree
= build_empty_stmt (loc
);
7389 DECL_ATTRIBUTES (decl
)
7390 = tree_cons (id
, loc_tree
, DECL_ATTRIBUTES (decl
));
7396 /* Implement TARGET_GOACC_EXPAND_VAR_DECL. */
/* Gang-private variables (tagged by nvptx_goacc_adjust_private_decl)
   are laid out sequentially in the gang-private .shared block: look up
   or assign an aligned offset in gang_private_shared_hmap, grow the
   running size/alignment accounting, and return a MEM rtx addressing
   gang_private_shared_sym + offset.  */
7399 nvptx_goacc_expand_var_decl (tree var
)
7401 /* Place "oacc gang-private" variables in shared memory. */
7402 if (tree attr
= lookup_attribute ("oacc gang-private",
7403 DECL_ATTRIBUTES (var
)))
7405 gcc_checking_assert (VAR_P (var
));
7407 unsigned int offset
, *poffset
;
/* Reuse a previously assigned offset if this VAR was seen before.  */
7408 poffset
= gang_private_shared_hmap
.get (var
);
/* First sight: round the running size up to VAR's alignment ...  */
7413 unsigned HOST_WIDE_INT align
= DECL_ALIGN (var
);
7414 gang_private_shared_size
7415 = (gang_private_shared_size
+ align
- 1) & ~(align
- 1);
7416 if (gang_private_shared_align
< align
)
7417 gang_private_shared_align
= align
;
/* ... record the offset and extend the block by VAR's size.  */
7419 offset
= gang_private_shared_size
;
7420 bool existed
= gang_private_shared_hmap
.put (var
, offset
);
7421 gcc_checking_assert (!existed
);
7422 gang_private_shared_size
+= tree_to_uhwi (DECL_SIZE_UNIT (var
));
7424 location_t loc
= EXPR_LOCATION (TREE_VALUE (attr
));
7425 #if 0 /* For some reason, this doesn't work. */
7426 if (dump_enabled_p ())
7428 dump_flags_t l_dump_flags
7429 = get_openacc_privatization_dump_flags ();
7431 const dump_user_location_t d_u_loc
7432 = dump_user_location_t::from_location_t (loc
);
7433 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
7435 # pragma GCC diagnostic push
7436 # pragma GCC diagnostic ignored "-Wformat"
7438 dump_printf_loc (l_dump_flags
, d_u_loc
,
7439 "variable %<%T%> adjusted for OpenACC"
7440 " privatization level: %qs\n",
7443 # pragma GCC diagnostic pop
7446 #else /* ..., thus emulate that, good enough for testsuite usage. */
7447 if (param_openacc_privatization
!= OPENACC_PRIVATIZATION_QUIET
)
7449 "variable %qD adjusted for OpenACC privatization level:"
7452 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
7454 /* 'dumpfile.cc:dump_loc' */
7455 fprintf (dump_file
, "%s:%d:%d: ", LOCATION_FILE (loc
),
7456 LOCATION_LINE (loc
), LOCATION_COLUMN (loc
));
7457 fprintf (dump_file
, "%s: ", "note");
7461 print_generic_expr (dump_file
, var
, TDF_SLIM
);
7463 "' adjusted for OpenACC privatization level: '%s'\n",
/* Hand back VAR's location inside the shared-memory block.  */
7468 rtx addr
= plus_constant (Pmode
, gang_private_shared_sym
, offset
);
7469 return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var
)), addr
);
7475 static GTY(()) tree nvptx_previous_fndecl
;
7478 nvptx_set_current_function (tree fndecl
)
7480 if (!fndecl
|| fndecl
== nvptx_previous_fndecl
)
7483 gang_private_shared_hmap
.empty ();
7484 nvptx_previous_fndecl
= fndecl
;
7485 vector_red_partition
= 0;
7486 oacc_bcast_partition
= 0;
7489 /* Implement TARGET_LIBC_HAS_FUNCTION. */
7492 nvptx_libc_has_function (enum function_class fn_class
, tree type
)
7494 if (fn_class
== function_sincos
)
7496 if (type
!= NULL_TREE
)
7497 /* Currently, newlib does not support sincosl. */
7498 return type
== float_type_node
|| type
== double_type_node
;
7503 return default_libc_has_function (fn_class
, type
);
7507 nvptx_mem_local_p (rtx mem
)
7509 gcc_assert (GET_CODE (mem
) == MEM
);
7511 struct address_info info
;
7512 decompose_mem_address (&info
, mem
);
7514 if (info
.base
!= NULL
&& REG_P (*info
.base
)
7515 && REGNO_PTR_FRAME_P (REGNO (*info
.base
)))
7517 if (TARGET_SOFT_STACK
)
7519 /* Frame-related doesn't mean local. */
/* Define locally, for use in NVPTX_ASM_OUTPUT_DEF.  */
#define SET_ASM_OP ".alias "

/* Define locally, for use in nvptx_asm_output_def_from_decls.  Add NVPTX_
   prefix to avoid clash with ASM_OUTPUT_DEF from nvptx.h.
   Copy of ASM_OUTPUT_DEF from defaults.h, with added terminating
   semicolon.  */
#define NVPTX_ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2)	\
  do							\
    {							\
      fprintf ((FILE), "%s", SET_ASM_OP);		\
      assemble_name (FILE, LABEL1);			\
      fprintf (FILE, ",");				\
      assemble_name (FILE, LABEL2);			\
      fprintf (FILE, ";\n");				\
    }							\
  while (0)
7547 nvptx_asm_output_def_from_decls (FILE *stream
, tree name
, tree value
)
7549 if (nvptx_alias
== 0 || !TARGET_PTX_6_3
)
7551 /* Copied from assemble_alias. */
7552 error_at (DECL_SOURCE_LOCATION (name
),
7553 "alias definitions not supported in this configuration");
7554 TREE_ASM_WRITTEN (name
) = 1;
7558 if (lookup_attribute ("weak", DECL_ATTRIBUTES (name
)))
7560 /* Prevent execution FAILs for gcc.dg/globalalias.c and
7561 gcc.dg/pr77587.c. */
7562 error_at (DECL_SOURCE_LOCATION (name
),
7563 "weak alias definitions not supported in this configuration");
7564 TREE_ASM_WRITTEN (name
) = 1;
7568 /* Ptx also doesn't support value having weak linkage, but we can't detect
7569 that here, so we'll end up with:
7570 "error: Function test with .weak scope cannot be aliased".
7571 See gcc.dg/localalias.c. */
7573 if (TREE_CODE (name
) != FUNCTION_DECL
)
7575 error_at (DECL_SOURCE_LOCATION (name
),
7576 "non-function alias definitions not supported"
7577 " in this configuration");
7578 TREE_ASM_WRITTEN (name
) = 1;
7582 if (!cgraph_node::get (name
)->referred_to_p ())
7583 /* Prevent "Internal error: reference to deleted section". */
7586 std::stringstream s
;
7587 write_fn_proto (s
, false, get_fnname_from_decl (name
), name
);
7588 fputs (s
.str ().c_str (), stream
);
7590 tree id
= DECL_ASSEMBLER_NAME (name
);
7591 NVPTX_ASM_OUTPUT_DEF (stream
, IDENTIFIER_POINTER (id
),
7592 IDENTIFIER_POINTER (value
));
7595 #undef NVPTX_ASM_OUTPUT_DEF
7598 #undef TARGET_OPTION_OVERRIDE
7599 #define TARGET_OPTION_OVERRIDE nvptx_option_override
7601 #undef TARGET_ATTRIBUTE_TABLE
7602 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
7605 #define TARGET_LRA_P hook_bool_void_false
7607 #undef TARGET_LEGITIMATE_ADDRESS_P
7608 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
7610 #undef TARGET_PROMOTE_FUNCTION_MODE
7611 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
7613 #undef TARGET_FUNCTION_ARG
7614 #define TARGET_FUNCTION_ARG nvptx_function_arg
7615 #undef TARGET_FUNCTION_INCOMING_ARG
7616 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
7617 #undef TARGET_FUNCTION_ARG_ADVANCE
7618 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
7619 #undef TARGET_FUNCTION_ARG_BOUNDARY
7620 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
7621 #undef TARGET_PASS_BY_REFERENCE
7622 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
7623 #undef TARGET_FUNCTION_VALUE_REGNO_P
7624 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
7625 #undef TARGET_FUNCTION_VALUE
7626 #define TARGET_FUNCTION_VALUE nvptx_function_value
7627 #undef TARGET_LIBCALL_VALUE
7628 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
7629 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
7630 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
7631 #undef TARGET_GET_DRAP_RTX
7632 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
7633 #undef TARGET_SPLIT_COMPLEX_ARG
7634 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
7635 #undef TARGET_RETURN_IN_MEMORY
7636 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
7637 #undef TARGET_OMIT_STRUCT_RETURN_REG
7638 #define TARGET_OMIT_STRUCT_RETURN_REG true
7639 #undef TARGET_STRICT_ARGUMENT_NAMING
7640 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
7641 #undef TARGET_CALL_ARGS
7642 #define TARGET_CALL_ARGS nvptx_call_args
7643 #undef TARGET_END_CALL_ARGS
7644 #define TARGET_END_CALL_ARGS nvptx_end_call_args
7646 #undef TARGET_ASM_FILE_START
7647 #define TARGET_ASM_FILE_START nvptx_file_start
7648 #undef TARGET_ASM_FILE_END
7649 #define TARGET_ASM_FILE_END nvptx_file_end
7650 #undef TARGET_ASM_GLOBALIZE_LABEL
7651 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
7652 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
7653 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
7654 #undef TARGET_PRINT_OPERAND
7655 #define TARGET_PRINT_OPERAND nvptx_print_operand
7656 #undef TARGET_PRINT_OPERAND_ADDRESS
7657 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
7658 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
7659 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
7660 #undef TARGET_ASM_INTEGER
7661 #define TARGET_ASM_INTEGER nvptx_assemble_integer
7662 #undef TARGET_ASM_DECL_END
7663 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
7664 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
7665 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
7666 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
7667 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
7668 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
7669 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
7671 #undef TARGET_MACHINE_DEPENDENT_REORG
7672 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
7673 #undef TARGET_NO_REGISTER_ALLOCATION
7674 #define TARGET_NO_REGISTER_ALLOCATION true
7676 #undef TARGET_ENCODE_SECTION_INFO
7677 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
7678 #undef TARGET_RECORD_OFFLOAD_SYMBOL
7679 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
7681 #undef TARGET_VECTOR_ALIGNMENT
7682 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
7684 #undef TARGET_CANNOT_COPY_INSN_P
7685 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
7687 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
7688 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
7690 #undef TARGET_INIT_BUILTINS
7691 #define TARGET_INIT_BUILTINS nvptx_init_builtins
7692 #undef TARGET_EXPAND_BUILTIN
7693 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
7694 #undef TARGET_BUILTIN_DECL
7695 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
7697 #undef TARGET_SIMT_VF
7698 #define TARGET_SIMT_VF nvptx_simt_vf
7700 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
7701 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
7703 #undef TARGET_GOACC_VALIDATE_DIMS
7704 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
7706 #undef TARGET_GOACC_DIM_LIMIT
7707 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
7709 #undef TARGET_GOACC_FORK_JOIN
7710 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
7712 #undef TARGET_GOACC_REDUCTION
7713 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
7715 #undef TARGET_CANNOT_FORCE_CONST_MEM
7716 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
7718 #undef TARGET_SCALAR_MODE_SUPPORTED_P
7719 #define TARGET_SCALAR_MODE_SUPPORTED_P nvptx_scalar_mode_supported_p
7721 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
7722 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
7723 nvptx_libgcc_floating_mode_supported_p
7725 #undef TARGET_VECTOR_MODE_SUPPORTED_P
7726 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
7728 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
7729 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
7730 nvptx_preferred_simd_mode
7732 #undef TARGET_MODES_TIEABLE_P
7733 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
7735 #undef TARGET_HARD_REGNO_NREGS
7736 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
7738 #undef TARGET_CAN_CHANGE_MODE_CLASS
7739 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
7741 #undef TARGET_TRULY_NOOP_TRUNCATION
7742 #define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation
7744 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
7745 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
7747 #undef TARGET_GOACC_ADJUST_PRIVATE_DECL
7748 #define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl
7750 #undef TARGET_GOACC_EXPAND_VAR_DECL
7751 #define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl
7753 #undef TARGET_SET_CURRENT_FUNCTION
7754 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
7756 #undef TARGET_LIBC_HAS_FUNCTION
7757 #define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function
7759 struct gcc_target targetm
= TARGET_INITIALIZER
;
7761 #include "gt-nvptx.h"