/* Target code for NVPTX.
   Copyright (C) 2014-2018 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "diagnostic.h"
#include "insn-flags.h"
#include "insn-attr.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "stor-layout.h"
#include "omp-general.h"
#include "gomp-constants.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "tree-phinodes.h"
#include "fold-const.h"

/* This file should be included last.  */
#include "target-def.h"
#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
                    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
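/* Illustrative example (editor's note, not from the original source):
   after SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED) the mach-dep bits
   of SYMBOL_REF_FLAGS (sym) hold DATA_AREA_SHARED, and
   SYMBOL_DATA_AREA (sym) recovers it later, e.g. in section_for_sym
   below when choosing between ".global", ".shared", etc.  */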
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;
/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with -fopenacc", optname);
}
/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  if (function_entry_patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
  SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

static const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
        return ".u32";
      else
        return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}
/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
        area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
        {
          if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
            {
              area = DATA_AREA_SHARED;
              if (DECL_INITIAL (decl))
                error ("static initialization of variable %q+D in %<.shared%>"
                       " memory is not supported", decl);
            }
          else
            area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
        }

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}
/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}
/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}
/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
            int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
        fprintf (file, "%s", reg_names[regno]);
      else
        fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
        fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
        fprintf (file, "}");
    }
}
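/* Example (editor's sketch; register numbers are illustrative): for a
   TImode pseudo 23 split into DImode halves, output_reg prints the
   register pair "{%r23$8,%r23$0}", while an explicit subreg offset
   prints a single half, e.g. "%r23$8".  */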
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
         it creates a block with a single successor before entering a
         partitioned region.  That is a good candidate for the end of
         an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
         predecessor for the block the join insn ends up in.  This is
         needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}
/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
        return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
        return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}
/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}
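/* Worked example (editor's note): under these rules a QImode or HImode
   scalar argument is widened to SImode, and an SFmode argument to an
   unprototyped (K&R) callee is promoted to DFmode, matching C's default
   argument promotions; return values always use the prototyped rule.  */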
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
                    const_tree, bool named)
{
  if (mode == VOIDmode || !named)
    return NULL_RTX;

  return gen_reg_rtx (mode);
}
/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
                             const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (mode == VOIDmode || !named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
                         gen_rtvec (1, GEN_INT (cum->count)),
                         UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
                            machine_mode ARG_UNUSED (mode),
                            const_tree ARG_UNUSED (type),
                            bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}
/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned int
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}
/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}
/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}
/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
                      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}
/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
                         machine_mode mode, const_tree type,
                         bool ARG_UNUSED (named))
{
  return pass_in_memory (mode, type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
                             int *ARG_UNUSED (punsignedp),
                             const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}
/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
                machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
        s << reg_names[for_reg];
      else
        s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
        {
          s << "\tld.param" << ptx_type << " ";
          if (for_reg)
            s << reg_names[for_reg];
          else
            s << "%ar" << argno;
          s << ", [%in_ar" << argno << "];\n";
        }
    }
  return argno + 1;
}
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
                tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
        {
          /* Complex types are sent as two separate args.  */
          type = TREE_TYPE (type);
          mode = TYPE_MODE (type);
          prototyped = true;
        }

      mode = promote_arg (mode, prototyped);
      if (split)
        argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}
/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}
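/* Example output (editor's sketch): for an SImode return value this
   writes "(.param.u32 %value_out) " in a prototype and
   "\t.reg.u32 %value;\n" in a function prologue, assuming
   reg_names[NVPTX_RETURN_REGNUM] is "%value" as on nvptx.  */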
/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
        return return_in_mem;

      /* Named return values can cause us to return a pointer as well
         as expect an argument for the return location.  This is
         optimization-level specific, so no caller can make use of
         this data, but more importantly for us, we must ensure it
         doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
        return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
         retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}
/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
          || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
              && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
                 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
           globalize ? " GLOBAL" : "",
           is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}
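/* Example marker (editor's note): a public definition of "arr" emits
   "// BEGIN GLOBAL VAR DEF: arr".  These comment markers are intended
   for the nvptx linker tooling, which uses them to slice up the PTX
   stream.  */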
/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
                const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
        name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
        /* These atomics skip the 'weak' parm in an actual library
           call.  We must skip it in the prototype too.  */
        not_atomic_weak_arg = 3;
        break;
      default:
        break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
        argno = write_arg_type (s, -1, argno, type, prototyped);
      else
        gcc_assert (type == boolean_type_node);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
                          rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
         as that was already done when generating the call
         sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (!*slot)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

static void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
        *slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}
/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
             align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
           POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
                  : "\tmov.u%d %s, 0;\n"),
           POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
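/* Example output (editor's sketch): init_frame (file, FRAME_POINTER_REGNUM,
   8, 16) on a 64-bit target emits approximately
       .local .align 8 .b8 %frame_ar[16];
       .reg.u64 %frame;
       cvta.local.u64 %frame, %frame_ar;
   assuming reg_names[FRAME_POINTER_REGNUM] is "%frame".  */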
/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
           bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
           bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
           bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
             bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
           bits, reg_stack, reg_frame, size);

  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
             bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
           bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}
/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
        << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
                            true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
                    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
                    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
        init_frame (file, STACK_POINTER_REGNUM,
                    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
         DImode-compatible.  */
      if (need_frameptr)
        init_frame (file, FRAME_POINTER_REGNUM, alignment,
                    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
           || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
        simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
        simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
        simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
        simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
        fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
                 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }
  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
        {
          machine_mode mode = PSEUDO_REGNO_MODE (i);
          machine_mode split = maybe_split_mode (mode);

          if (split_mode_p (mode))
            mode = split;
          fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
          output_reg (file, i, split, -2);
          fprintf (file, ";\n");
        }
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
                               REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
                               REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
}
/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
               master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}
/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
                               rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
               HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
               cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
        fprintf (file, HOST_WIDE_INT_PRINT_DEC,
                 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
        output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
        fprintf (file,
                 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
                 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
        {
          fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
                   bits, regno, bits / 8, reg_stack);
          fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
                   bits, reg_stack, regno, bits / 8);
        }
      else
        {
          fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
                   bits, reg_stack, regno, bits / 8);
        }
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}
/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  */

const char *
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
  return "";
}

/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.  */

const char *
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
  return "";
}

/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
               POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}
/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
             nvptx_ptx_type_from_mode (mode, false),
             reg_names[NVPTX_RETURN_REGNUM],
             reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  if (TARGET_SOFT_STACK && stack_realign_drap)
    return arg_pointer_rtx;
  return NULL_RTX;
}
/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
        {
          cfun->machine->is_varadic = true;
          cfun->machine->has_varadic = true;
          cfun->machine->num_args++;
        }
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
                                                  cfun->machine->call_args);
    }
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}
/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  rtx callee = XEXP (address, 0);
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
        {
          if (DECL_STATIC_CHAIN (decl))
            cfun->machine->has_chain = true;

          tree attr = oacc_get_fn_attrib (decl);
          if (attr)
            {
              tree dims = TREE_VALUE (attr);

              parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
              for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
                {
                  if (TREE_PURPOSE (dims)
                      && !integer_zerop (TREE_PURPOSE (dims)))
                    break;
                  /* Not on this axis.  */
                  parallel ^= GOMP_DIM_MASK (ix);
                  dims = TREE_CHAIN (dims);
                }
            }
        }
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
        tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos = XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}
/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
                            XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}
/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case E_DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case E_DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default:
      gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case E_DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default:
      gcc_unreachable ();
    }
  return res;
}
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case E_SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case E_DImode:
    case E_DFmode:
      {
        rtx tmp0 = gen_reg_rtx (SImode);
        rtx tmp1 = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
        emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
        emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
        emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
        res = get_insns ();
        end_sequence ();
      }
      break;
    case E_BImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
        emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
        emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
        res = get_insns ();
        end_sequence ();
      }
      break;
    case E_QImode:
    case E_HImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
        emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
        emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
                                                    tmp)));
        res = get_insns ();
        end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
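/* Editor's note: for SImode the generated insn ultimately emits a warp
   shuffle such as "shfl.idx.b32 %r_dst, %r_src, %r_idx, 31;" (exact
   syntax depends on the nvptx.md pattern and PTX ISA version); wider
   and narrower modes are handled by unpack/pack and extend/truncate
   around 32-bit shuffles, as the cases above show.  */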
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}

/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case E_BImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        if (pm & PM_read)
          emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
        emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
        if (pm & PM_write)
          emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
        res = get_insns ();
        end_sequence ();
      }
      break;

    default:
      {
        rtx addr = data->ptr;

        if (!addr)
          {
            unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

            if (align > worker_bcast_align)
              worker_bcast_align = align;
            data->offset = (data->offset + align - 1) & ~(align - 1);
            addr = data->base;
            if (data->offset)
              addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
          }

        addr = gen_rtx_MEM (mode, addr);
        if (pm == PM_read)
          res = gen_rtx_SET (addr, reg);
        else if (pm == PM_write)
          res = gen_rtx_SET (reg, addr);
        else
          gcc_unreachable ();

        if (data->ptr)
          {
            /* We're using a ptr, increment it.  */
            start_sequence ();

            emit_insn (res);
            emit_insn (gen_adddi3 (data->ptr, data->ptr,
                                   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
            res = get_insns ();
            end_sequence ();
          }
        else
          rep = 1;
        data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
        return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
                                       out.  */
  unsigned size;  /* Fragment size to accumulate.  */
  unsigned offset;  /* Offset within current fragment.  */
  bool started;   /* Whether we've output any initializer.  */
} init_frag;
/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  */

static void
output_init_frag (rtx sym)
{
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      bool function = (SYMBOL_REF_DECL (sym)
                       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
      if (!function)
        fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      if (!function)
        fprintf (asm_out_file, ")");
      if (val)
        fprintf (asm_out_file, " + ");
    }

  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}
/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;

  for (unsigned part = 0; size; size -= part)
    {
      val >>= part * BITS_PER_UNIT;
      part = init_frag.size - init_frag.offset;
      if (part > size)
        part = size;

      unsigned HOST_WIDE_INT partial
        = val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
        output_init_frag (NULL);
    }
}
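/* Worked example (editor's note): with a fragment size of 4 and a
   current offset of 2, nvptx_assemble_value (0x11223344, 4) first masks
   VAL to 32 bits, stores the low two bytes (0x3344) to complete the
   current fragment, flushes it via output_init_frag, and carries the
   high bytes (0x1122) into the start of the next fragment.  */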
/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
         CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
        sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}
/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      if (part > size)
        part = (unsigned) size;
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  */
  if (size < init_frag.remaining * init_frag.size)
    {
      while (size >= init_frag.size)
        {
          size -= init_frag.size;
          output_init_frag (NULL_RTX);
        }
      if (size)
        nvptx_assemble_value (0, size);
    }
}

/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  */

void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}
/* Return true if TYPE is a record type where the last field is an array
   without given dimension.  */

static bool
flexible_array_member_type_p (const_tree type)
{
  if (TREE_CODE (type) != RECORD_TYPE)
    return false;

  const_tree last_field = NULL_TREE;
  for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
    last_field = f;

  if (!last_field)
    return false;

  const_tree last_field_type = TREE_TYPE (last_field);
  if (TREE_CODE (last_field_type) != ARRAY_TYPE)
    return false;

  return (! TYPE_DOMAIN (last_field_type)
          || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
}
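/* Example (editor's note): this returns true for
       struct S { int n; char data[]; };
   but false when the trailing array has a known bound, such as
       struct T { int n; char data[4]; };  */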
/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round that PTX wants them!  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
                           const_tree type, HOST_WIDE_INT size, unsigned align,
                           bool undefined = false)
{
  bool atype = (TREE_CODE (type) == ARRAY_TYPE)
    && (TYPE_DOMAIN (type) == NULL_TREE);

  if (undefined && flexible_array_member_type_p (type))
    {
      size = 0;
      atype = true;
    }

  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
                    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u%d ",
           section, align / BITS_PER_UNIT,
           elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
  else if (atype)
    fprintf (file, "[]");
}
/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (init_frag.offset)
    /* This can happen with a packed struct with trailing array member.  */
    nvptx_assemble_value (0, init_frag.size - init_frag.offset);
  fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
}

/* Output an uninitialized common or file-scope variable.  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
                           const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");

  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
                             TREE_TYPE (decl), size, align);
  nvptx_assemble_decl_end ();
}
/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
                                 const_tree exp, HOST_WIDE_INT obj_size)
{
  write_var_marker (file, true, false, name);

  fprintf (file, "\t");

  tree type = TREE_TYPE (exp);
  nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
                             TYPE_ALIGN (type));
}

/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
                          : DECL_WEAK (decl) ? ".weak " : ".visible "));

  tree type = TREE_TYPE (decl);
  HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
                             type, obj_size, DECL_ALIGN (decl));
}

/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}

/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  /* We support weak definitions, and hence have the right
     ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
  if (DECL_WEAK (decl))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "PTX does not support weak declarations"
	      " (only weak definitions)");
  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  tree size = DECL_SIZE_UNIT (decl);
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl), true);
  nvptx_assemble_decl_end ();
}
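
/* As a sketch (PTX hand-written here, names hypothetical), an undefined
   "extern int x;" placed in the generic data area comes out roughly as

	.extern .global .align 4 .u32 x[1];

   using the element size and array shape chosen by
   nvptx_assemble_decl_begin above.  */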

/* Output a pattern for a move instruction.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    {
      if (GET_MODE_BITSIZE (dst_mode) == 128
	  && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
	{
	  /* mov.b128 is not supported.  */
	  if (dst_inner == V2DImode && src_inner == TImode)
	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
	  else if (dst_inner == TImode && src_inner == V2DImode)
	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
	}
      return "%.\tmov.b%T0\t%0, %1;";
    }

  return "%.\tcvt%t0%t1\t%0, %1;";
}
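
/* By way of example (illustrative, not compiler output): because
   mov.b128 does not exist, a TImode source copied to a V2DImode
   destination expands via the %L1/%H1 halves to something like

	mov.u64	%r10.x, %r20;
	mov.u64	%r10.y, %r21;

   with the register names invented here.  */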

static void nvptx_print_operand (FILE *, rtx, int);

/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  if (GET_CODE (pat) == COND_EXEC)
    pat = COND_EXEC_CODE (pat);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
	     reg_names[NVPTX_RETURN_REGNUM]);

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
	nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
	nvptx_record_fndecl (decl);
    }

  if (needs_tgt)
    {
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_fn_proto_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);
      const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
	       "\t\tst.param%s [%%out_arg%d], ",
	       ptx_type, argno, ptx_type, argno);
      output_reg (asm_out_file, REGNO (t), VOIDmode);
      fprintf (asm_out_file, ";\n");
    }

  /* The '.' stands for the call's predicate, if any.  */
  nvptx_print_operand (asm_out_file, NULL_RTX, '.');
  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);

  if (decl)
    {
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      assemble_name (asm_out_file, name);
    }
  else
    output_address (VOIDmode, callee);

  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open, reg_names[STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  if (find_reg_note (insn, REG_NORETURN, NULL))
    {
      /* No return functions confuse the PTX JIT, as it doesn't realize
	 the flow control barrier they imply.  It can seg fault if it
	 encounters what looks like an unexitable loop.  Emit a trailing
	 trap and exit, which it does grok.  */
      fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
      fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
    }

  if (result)
    {
      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];

      if (!rval[0])
	/* We must escape the '%' that starts RETURN_REGNUM.  */
	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
		 reg_names[NVPTX_RETURN_REGNUM]);
      return rval;
    }

  return "}";
}
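
/* For reference, a predicate-free direct call "int f (int)" assigning
   its result to a register would be emitted roughly as follows
   (register numbers invented):

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), f, (%out_arg1);
		ld.param.u32 %r24, [%value_in];
	}
  */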

/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  return c == '.' || c == '#';
}

/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}

/* Write assembly language output for the address ADDR to FILE.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}

/* Print an operand, X, to FILE, with an optional modifier in CODE.

   Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
        unconditional one.
   # -- print a rounding mode for the instruction

   A -- print a data area for a MEM
   c -- print an opcode suffix for a comparison operator, including a type code
   D -- print a data area for a MEM operand
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.  */

static void
nvptx_print_operand (FILE *file, rtx x, int code)
{
  if (code == '.')
    {
      x = current_insn_predicate;
      if (x)
	{
	  fputs ("@", file);
	  if (GET_CODE (x) == EQ)
	    fputs ("!", file);
	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
	}
      return;
    }
  else if (code == '#')
    {
      fputs (".rn", file);
      return;
    }

  enum rtx_code x_code = GET_CODE (x);
  machine_mode mode = GET_MODE (x);

  switch (code)
    {
    case 'A':
      x = XEXP (x, 0);
      /* FALLTHROUGH.  */

    case 'D':
      if (GET_CODE (x) == CONST)
	x = XEXP (x, 0);
      if (GET_CODE (x) == PLUS)
	x = XEXP (x, 0);

      if (GET_CODE (x) == SYMBOL_REF)
	fputs (section_for_sym (x), file);
      break;

    case 't':
    case 'u':
      if (x_code == SUBREG)
	{
	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
	  if (VECTOR_MODE_P (inner_mode)
	      && (GET_MODE_SIZE (mode)
		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	    mode = GET_MODE_INNER (inner_mode);
	  else if (split_mode_p (inner_mode))
	    mode = maybe_split_mode (inner_mode);
	  else
	    mode = inner_mode;
	}
      fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
      break;

    case 'H':
    case 'L':
      {
	rtx inner_x = SUBREG_REG (x);
	machine_mode inner_mode = GET_MODE (inner_x);
	machine_mode split = maybe_split_mode (inner_mode);

	output_reg (file, REGNO (inner_x), split,
		    (code == 'H'
		     ? GET_MODE_SIZE (inner_mode) / 2
		     : 0));
      }
      break;

    case 'S':
      {
	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
	/* Same order as nvptx_shuffle_kind.  */
	static const char *const kinds[] =
	  {".up", ".down", ".bfly", ".idx"};
	fputs (kinds[kind], file);
      }
      break;

    case 'T':
      fprintf (file, "%d", GET_MODE_BITSIZE (mode));
      break;

    case 'j':
      fprintf (file, "@");
      goto common;

    case 'J':
      fprintf (file, "@!");
      goto common;

    case 'c':
      mode = GET_MODE (XEXP (x, 0));
      switch (x_code)
	{
	case EQ:
	  fputs (".eq", file);
	  break;
	case NE:
	  if (FLOAT_MODE_P (mode))
	    fputs (".neu", file);
	  else
	    fputs (".ne", file);
	  break;
	case LE:
	  fputs (".le", file);
	  break;
	case GE:
	  fputs (".ge", file);
	  break;
	case LT:
	  fputs (".lt", file);
	  break;
	case GT:
	  fputs (".gt", file);
	  break;
	case LTGT:
	  fputs (".ne", file);
	  break;
	case UNEQ:
	  fputs (".equ", file);
	  break;
	case UNLE:
	  fputs (".leu", file);
	  break;
	case UNGE:
	  fputs (".geu", file);
	  break;
	case UNLT:
	  fputs (".ltu", file);
	  break;
	case UNGT:
	  fputs (".gtu", file);
	  break;
	case UNORDERED:
	  fputs (".nan", file);
	  break;
	case ORDERED:
	  fputs (".num", file);
	  break;
	default:
	  gcc_unreachable ();
	}
      if (FLOAT_MODE_P (mode)
	  || x_code == EQ || x_code == NE
	  || x_code == GEU || x_code == GTU
	  || x_code == LEU || x_code == LTU)
	fputs (nvptx_ptx_type_from_mode (mode, true), file);
      else
	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
      break;
    default:
    common:
      switch (x_code)
	{
	case SUBREG:
	  {
	    rtx inner_x = SUBREG_REG (x);
	    machine_mode inner_mode = GET_MODE (inner_x);
	    machine_mode split = maybe_split_mode (inner_mode);

	    if (VECTOR_MODE_P (inner_mode)
		&& (GET_MODE_SIZE (mode)
		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	      {
		output_reg (file, REGNO (inner_x), VOIDmode);
		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
	      }
	    else if (split_mode_p (inner_mode)
		     && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
	      output_reg (file, REGNO (inner_x), split);
	    else
	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
	  }
	  break;

	case REG:
	  output_reg (file, REGNO (x), maybe_split_mode (mode));
	  break;

	case MEM:
	  fputc ('[', file);
	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
	  fputc (']', file);
	  break;

	case CONST_INT:
	  output_addr_const (file, x);
	  break;

	case CONST:
	case SYMBOL_REF:
	case LABEL_REF:
	  /* We could use output_addr_const, but that can print things like
	     "x-8", which breaks ptxas.  Need to ensure it is output as
	     "x+-8".  */
	  nvptx_print_address_operand (file, x, VOIDmode);
	  break;

	case CONST_DOUBLE:
	  {
	    long vals[2];
	    real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
	    vals[0] &= 0xffffffff;
	    vals[1] &= 0xffffffff;
	    if (mode == SFmode)
	      fprintf (file, "0f%08lx", vals[0]);
	    else
	      fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
	  }
	  break;

	case CONST_VECTOR:
	  {
	    unsigned n = CONST_VECTOR_NUNITS (x);
	    fprintf (file, "{ ");
	    for (unsigned i = 0; i < n; ++i)
	      {
		if (i)
		  fprintf (file, ", ");

		rtx elem = CONST_VECTOR_ELT (x, i);
		output_addr_const (file, elem);
	      }
	    fprintf (file, " }");
	  }
	  break;

	default:
	  output_addr_const (file, x);
	}
    }
}

/* Record replacement regs used to deal with subreg operands.  */
struct reg_replace
{
  rtx replacement[MAX_RECOG_OPERANDS];
  machine_mode mode;
  int n_allocated;
  int n_in_use;
};

/* Allocate or reuse a replacement in R and return the rtx.  */

static rtx
get_replacement (struct reg_replace *r)
{
  if (r->n_allocated == r->n_in_use)
    r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
  return r->replacement[r->n_in_use++];
}

/* Clean up subreg operands.  In ptx assembly, everything is typed, and
   the presence of subregs would break the rules for most instructions.
   Replace them with a suitable new register of the right size, plus
   conversion copyin/copyout instructions.  */

static void
nvptx_reorg_subreg (void)
{
  struct reg_replace qiregs, hiregs, siregs, diregs;
  rtx_insn *insn, *next;

  qiregs.n_allocated = 0;
  hiregs.n_allocated = 0;
  siregs.n_allocated = 0;
  diregs.n_allocated = 0;
  qiregs.mode = QImode;
  hiregs.mode = HImode;
  siregs.mode = SImode;
  diregs.mode = DImode;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      if (!NONDEBUG_INSN_P (insn)
	  || asm_noperands (PATTERN (insn)) >= 0
	  || GET_CODE (PATTERN (insn)) == USE
	  || GET_CODE (PATTERN (insn)) == CLOBBER)
	continue;

      qiregs.n_in_use = 0;
      hiregs.n_in_use = 0;
      siregs.n_in_use = 0;
      diregs.n_in_use = 0;
      extract_insn (insn);
      enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);

      for (int i = 0; i < recog_data.n_operands; i++)
	{
	  rtx op = recog_data.operand[i];
	  if (GET_CODE (op) != SUBREG)
	    continue;

	  rtx inner = SUBREG_REG (op);

	  machine_mode outer_mode = GET_MODE (op);
	  machine_mode inner_mode = GET_MODE (inner);
	  if (s_ok
	      && (GET_MODE_PRECISION (inner_mode)
		  >= GET_MODE_PRECISION (outer_mode)))
	    continue;

	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
				   : outer_mode == HImode ? &hiregs
				   : outer_mode == SImode ? &siregs
				   : &diregs);
	  rtx new_reg = get_replacement (r);

	  if (recog_data.operand_type[i] != OP_OUT)
	    {
	      enum rtx_code code = TRUNCATE;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = ZERO_EXTEND;

	      rtx pat = gen_rtx_SET (new_reg,
				     gen_rtx_fmt_e (code, outer_mode, inner));
	      emit_insn_before (pat, insn);
	    }

	  if (recog_data.operand_type[i] != OP_IN)
	    {
	      enum rtx_code code = TRUNCATE;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = ZERO_EXTEND;

	      rtx pat = gen_rtx_SET (inner,
				     gen_rtx_fmt_e (code, inner_mode, new_reg));
	      emit_insn_after (pat, insn);
	    }
	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
	}
    }
}

/* Return a SImode "master lane index" register for uniform-simt, allocating on
   first use.  */

static rtx
nvptx_get_unisimt_master ()
{
  rtx &master = cfun->machine->unisimt_master;
  return master ? master : master = gen_reg_rtx (SImode);
}

/* Return a BImode "predicate" register for uniform-simt, similar to above.  */

static rtx
nvptx_get_unisimt_predicate ()
{
  rtx &pred = cfun->machine->unisimt_predicate;
  return pred ? pred : pred = gen_reg_rtx (BImode);
}

/* Return true if given call insn references one of the functions provided by
   the CUDA runtime: malloc, free, vprintf.  */

static bool
nvptx_call_insn_is_syscall_p (rtx_insn *insn)
{
  rtx pat = PATTERN (insn);
  gcc_checking_assert (GET_CODE (pat) == PARALLEL);
  pat = XVECEXP (pat, 0, 0);
  if (GET_CODE (pat) == SET)
    pat = SET_SRC (pat);
  gcc_checking_assert (GET_CODE (pat) == CALL
		       && GET_CODE (XEXP (pat, 0)) == MEM);
  rtx addr = XEXP (XEXP (pat, 0), 0);
  if (GET_CODE (addr) != SYMBOL_REF)
    return false;
  const char *name = XSTR (addr, 0);
  /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
     references with forced assembler name refer to PTX syscalls.  For vprintf,
     accept both normal and forced-assembler-name references.  */
  return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
	  || !strcmp (name, "*malloc")
	  || !strcmp (name, "*free"));
}

/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
   propagate its value from lane MASTER to current lane.  */

static void
nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
{
  rtx reg;
  if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
    emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
}

/* Adjust code for uniform-simt code generation variant by making atomics and
   "syscalls" conditionally executed, and inserting shuffle-based propagation
   for registers being set.  */

static void
nvptx_reorg_uniform_simt ()
{
  rtx_insn *insn, *next;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
	  && !(NONJUMP_INSN_P (insn)
	       && GET_CODE (PATTERN (insn)) == PARALLEL
	       && get_attr_atomic (insn)))
	continue;

      rtx pat = PATTERN (insn);
      rtx master = nvptx_get_unisimt_master ();
      for (int i = 0; i < XVECLEN (pat, 0); i++)
	nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
      rtx pred = nvptx_get_unisimt_predicate ();
      pred = gen_rtx_NE (BImode, pred, const0_rtx);
      pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
      validate_change (insn, &PATTERN (insn), pat, false);
    }
}

/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */

struct parallel
{
  /* Parent parallel.  */
  parallel *parent;

  /* Next sibling parallel.  */
  parallel *next;

  /* First child parallel.  */
  parallel *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels.  */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after
     the partition.  */
  basic_block forked_block;
  basic_block join_block;

  rtx_insn *forked_insn;
  rtx_insn *join_insn;

  rtx_insn *fork_insn;
  rtx_insn *joining_insn;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

public:
  parallel (parallel *parent, unsigned mode);
  ~parallel ();
};

/* Constructor links the new parallel into its parent's chain of
   children.  */

parallel::parallel (parallel *parent_, unsigned mask_)
  :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
{
  forked_block = join_block = 0;
  forked_insn = join_insn = 0;
  fork_insn = joining_insn = 0;

  if (parent)
    {
      next = parent->inner;
      parent->inner = this;
    }
}

parallel::~parallel ()
{
  delete inner;
  delete next;
}

/* Map of basic blocks to insns */
typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;

/* A tuple of an insn of interest and the BB in which it resides.  */
typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
typedef auto_vec<insn_bb_t> insn_bb_vec_t;

/* Split basic blocks such that the forked and join unspecs are at
   the start of their basic blocks.  Thus afterwards each block will
   have a single partitioning mode.  We also do the same for return
   insns, as they are executed by every thread.  Return the
   partitioning mode of the function as a whole.  Populate MAP with
   head and tail blocks.  We also clear the BB visited flag, which is
   used when finding partitions.  */
static void
nvptx_split_blocks (bb_insn_map_t *map)
{
  insn_bb_vec_t worklist;
  basic_block block;
  rtx_insn *insn;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      bool seen_insn = false;

      /* Clear visited flag, for use by parallel locator  */
      block->flags &= ~BB_VISITED;

      FOR_BB_INSNS (block, insn)
	{
	  if (!INSN_P (insn))
	    continue;
	  switch (recog_memoized (insn))
	    {
	    default:
	      seen_insn = true;
	      continue;
	    case CODE_FOR_nvptx_forked:
	    case CODE_FOR_nvptx_join:
	      break;

	    case CODE_FOR_return:
	      /* We also need to split just before return insns, as
		 that insn needs executing by all threads, but the
		 block it is in probably does not.  */
	      break;
	    }

	  if (seen_insn)
	    /* We've found an instruction that must be at the start of
	       a block, but isn't.  Add it to the worklist.  */
	    worklist.safe_push (insn_bb_t (insn, block));
	  else
	    /* It was already the first instruction.  Just add it to
	       the map.  */
	    map->get_or_insert (block) = insn;
	  seen_insn = true;
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  insn_bb_t *elt;
  basic_block remap = 0;
  for (ix = 0; worklist.iterate (ix, &elt); ix++)
    {
      if (remap != elt->second)
	{
	  block = elt->second;
	  remap = block;
	}

      /* Split block before insn.  The insn is in the new block  */
      edge e = split_block (block, PREV_INSN (elt->first));

      block = e->dest;
      map->get_or_insert (block) = elt->first;
    }
}

/* BLOCK is a basic block containing a head or tail instruction.
   Locate the associated prehead or pretail instruction, which must be
   in the single predecessor block.  */

static rtx_insn *
nvptx_discover_pre (basic_block block, int expected)
{
  gcc_assert (block->preds->length () == 1);
  basic_block pre_block = (*block->preds)[0]->src;
  rtx_insn *pre_insn;

  for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
       pre_insn = PREV_INSN (pre_insn))
    gcc_assert (pre_insn != BB_HEAD (pre_block));

  gcc_assert (recog_memoized (pre_insn) == expected);
  return pre_insn;
}

/* Dump this parallel and all its inner parallels.  */

static void
nvptx_dump_pars (parallel *par, unsigned depth)
{
  fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
	   depth, par->mask,
	   par->forked_block ? par->forked_block->index : -1,
	   par->join_block ? par->join_block->index : -1);

  fprintf (dump_file, "    blocks:");

  basic_block block;
  for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
    fprintf (dump_file, " %d", block->index);
  fprintf (dump_file, "\n");
  if (par->inner)
    nvptx_dump_pars (par->inner, depth + 1);

  if (par->next)
    nvptx_dump_pars (par->next, depth);
}

/* If BLOCK contains a fork/join marker, process it to create or
   terminate a loop structure.  Add this block to the current loop,
   and then walk successor blocks.  */

static parallel *
nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
{
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  if (rtx_insn **endp = map->get (block))
    {
      rtx_insn *end = *endp;

      /* This is a block head or tail, or return instruction.  */
      switch (recog_memoized (end))
	{
	case CODE_FOR_return:
	  /* Return instructions are in their own block, and we
	     don't need to do anything more.  */
	  return par;

	case CODE_FOR_nvptx_forked:
	  /* Loop head, create a new inner loop and add it into
	     our parent's child list.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    par = new parallel (par, mask);
	    par->forked_block = block;
	    par->forked_insn = end;
	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	      par->fork_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
	  }
	  break;

	case CODE_FOR_nvptx_join:
	  /* A loop tail.  Finish the current loop and return to
	     parent.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (par->mask == mask);
	    par->join_block = block;
	    par->join_insn = end;
	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	      par->joining_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
	    par = par->parent;
	  }
	  break;

	default:
	  gcc_unreachable ();
	}
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel (0, 0);

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    nvptx_find_par (map, par, e->dest);

  return par;
}

/* DFS walk the CFG looking for fork & join markers.  Construct
   loop structures as we go.  MAP is a mapping of basic blocks
   to head & tail markers, discovered when splitting blocks.  This
   speeds up the discovery.  We rely on the BB visited flag having
   been cleared when splitting blocks.  */

static parallel *
nvptx_discover_pars (bb_insn_map_t *map)
{
  basic_block block;

  /* Mark exit blocks as visited.  */
  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
  block->flags |= BB_VISITED;

  /* And entry block as not.  */
  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
  block->flags &= ~BB_VISITED;

  parallel *par = nvptx_find_par (map, 0, block);

  if (dump_file)
    {
      fprintf (dump_file, "\nLoops\n");
      nvptx_dump_pars (par, 0);
      fprintf (dump_file, "\n");
    }

  return par;
}

/* Analyse a group of BBs within a partitioned region and create N
   Single-Entry-Single-Exit regions.  Some of those regions will be
   trivial ones consisting of a single BB.  The blocks of a
   partitioned region might form a set of disjoint graphs -- because
   the region encloses a differently partitioned sub region.

   We use the linear time algorithm described in 'Finding Regions Fast:
   Single Entry Single Exit and control Regions in Linear Time'
   Johnson, Pearson & Pingali.  That algorithm deals with complete
   CFGs, where a back edge is inserted from END to START, and thus the
   problem becomes one of finding equivalent loops.

   In this case we have a partial CFG.  We complete it by redirecting
   any incoming edge to the graph to be from an arbitrary external BB,
   and similarly redirecting any outgoing edge to be to that BB.
   Thus we end up with a closed graph.

   The algorithm works by building a spanning tree of an undirected
   graph and keeping track of back edges from nodes further from the
   root in the tree to nodes nearer to the root in the tree.  In the
   description below, the root is up and the tree grows downwards.

   We avoid having to deal with degenerate back-edges to the same
   block, by splitting each BB into 3 -- one for input edges, one for
   the node itself and one for the output edges.  Such back edges are
   referred to as 'Brackets'.  Cycle equivalent nodes will have the
   same set of brackets.

   Determining bracket equivalency is done by maintaining a list of
   brackets in such a manner that the list length and final bracket
   uniquely identify the set.

   We use coloring to mark all BBs with cycle equivalency with the
   same color.  This is the output of the 'Finding Regions Fast'
   algorithm.  Notice it doesn't actually find the set of nodes within
   a particular region, just unordered sets of nodes that are the
   entries and exits of SESE regions.

   After determining cycle equivalency, we need to find the minimal
   set of SESE regions.  Do this with a DFS coloring walk of the
   complete graph.  We're either 'looking' or 'coloring'.  When
   looking, and we're in the subgraph, we start coloring the color of
   the current node, and remember that node as the start of the
   current color's SESE region.  Every time we go to a new node, we
   decrement the count of nodes with that color.  If it reaches zero,
   we remember that node as the end of the current color's SESE region
   and return to 'looking'.  Otherwise we color the node the current
   color.

   This way we end up with coloring the inside of non-trivial SESE
   regions with the color of that region.  */
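
/* As a small worked example (hypothetical CFG): in a diamond
   A->B, A->C, B->D, C->D closed by a backedge D->A, the pseudo nodes
   of A and D carry the same single bracket (the D->A backedge), so
   they receive the same cycle-equivalence color, and {A,D} delimits
   one SESE region, while B and C fall into trivial single-block
   regions of their own.  */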

/* A pair of BBs.  We use this to represent SESE regions.  */
typedef std::pair<basic_block, basic_block> bb_pair_t;
typedef auto_vec<bb_pair_t> bb_pair_vec_t;

/* A node in the undirected CFG.  The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
typedef std::pair<basic_block, int> pseudo_node_t;

/* A bracket indicates an edge towards the root of the spanning tree of the
   undirected graph.  Each bracket has a color, determined
   from the current set of brackets.  */
struct bracket
{
  pseudo_node_t back; /* Back target */

  /* Current color and size of set.  */
  unsigned color;
  unsigned size;

  bracket (pseudo_node_t back_)
  : back (back_), color (~0u), size (~0u)
  {
  }

  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
  {
    if (length != size)
      {
	size = length;
	color = color_counts.length ();
	color_counts.quick_push (0);
      }
    color_counts[color]++;
    return color;
  }
};

typedef auto_vec<bracket> bracket_vec_t;

/* Basic block info for finding SESE regions.  */

struct bb_sese
{
  int node;  /* Node number in spanning tree.  */
  int parent; /* Parent node number.  */

  /* The algorithm splits each node A into Ai, A', Ao.  The incoming
     edges arrive at pseudo-node Ai and the outgoing edges leave at
     pseudo-node Ao.  We have to remember which way we arrived at a
     particular node when generating the spanning tree.  dir > 0 means
     we arrived at Ai, dir < 0 means we arrived at Ao.  */
  int dir;

  /* Lowest numbered pseudo-node reached via a backedge from this
     node, or any descendant.  */
  pseudo_node_t high;

  int color;  /* Cycle-equivalence color  */

  /* Stack of brackets for this node.  */
  bracket_vec_t brackets;

  bb_sese (unsigned node_, unsigned p, int dir_)
  :node (node_), parent (p), dir (dir_)
  {
  }
  ~bb_sese ();

  /* Push a bracket ending at BACK.  */
  void push (const pseudo_node_t &back)
  {
    if (dump_file)
      fprintf (dump_file, "Pushing backedge %d:%+d\n",
	       back.first ? back.first->index : 0, back.second);
    brackets.safe_push (bracket (back));
  }

  void append (bb_sese *child);
  void remove (const pseudo_node_t &);

  /* Set node's color.  */
  void set_color (auto_vec<unsigned> &color_counts)
  {
    color = brackets.last ().get_color (color_counts, brackets.length ());
  }
};

bb_sese::~bb_sese ()
{
}

/* Destructively append CHILD's brackets.  */

void
bb_sese::append (bb_sese *child)
{
  if (int len = child->brackets.length ())
    {
      int ix;

      if (dump_file)
	{
	  for (ix = 0; ix < len; ix++)
	    {
	      const pseudo_node_t &pseudo = child->brackets[ix].back;
	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
		       child->node, pseudo.first ? pseudo.first->index : 0,
		       pseudo.second);
	    }
	}
      if (!brackets.length ())
	std::swap (brackets, child->brackets);
      else
	{
	  brackets.reserve (len);
	  for (ix = 0; ix < len; ix++)
	    brackets.quick_push (child->brackets[ix]);
	}
    }
}

/* Remove brackets that terminate at PSEUDO.  */

void
bb_sese::remove (const pseudo_node_t &pseudo)
{
  unsigned removed = 0;
  int len = brackets.length ();

  for (int ix = 0; ix < len; ix++)
    {
      if (brackets[ix].back == pseudo)
	{
	  if (dump_file)
	    fprintf (dump_file, "Removing backedge %d:%+d\n",
		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
	  removed++;
	}
      else if (removed)
	brackets[ix-removed] = brackets[ix];
    }
  brackets.truncate (len - removed);
}

/* Accessors for BB's aux pointer.  */
#define BB_SET_SESE(B, S) ((B)->aux = (S))
#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)

/* DFS walk creating SESE data structures.  Only cover nodes with
   BB_VISITED set.  Append discovered blocks to LIST.  We number in
   increments of 3 so that the above and below pseudo nodes can be
   implicitly numbered too.  */

static int
nvptx_sese_number (int n, int p, int dir, basic_block b,
		   auto_vec<basic_block> *list)
{
  if (BB_GET_SESE (b))
    return n;

  if (dump_file)
    fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
	     b->index, n, p, dir);

  BB_SET_SESE (b, new bb_sese (n, p, dir));
  p = n;

  n += 3;
  list->quick_push (b);

  /* First walk the nodes on the 'other side' of this node, then walk
     the nodes on the same side.  */
  for (unsigned ix = 2; ix; ix--)
    {
      vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
      size_t offset = (dir > 0 ? offsetof (edge_def, dest)
		       : offsetof (edge_def, src));
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target->flags & BB_VISITED)
	    n = nvptx_sese_number (n, p, dir, target, list);
	}
      dir = -dir;
    }
  return n;
}

/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
   EDGES are the outgoing edges and OFFSET is the offset to the src
   or dst block on the edges.  */

static void
nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
		   vec<edge, va_gc> *edges, size_t offset)
{
  edge e;
  edge_iterator ei;
  int hi_back = depth;
  pseudo_node_t node_back (0, depth);
  int hi_child = depth;
  pseudo_node_t node_child (0, depth);
  basic_block child = NULL;
  unsigned num_children = 0;
  int usd = -dir * sese->dir;

  if (dump_file)
    fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
	     me->index, sese->node, dir);

  if (dir < 0)
    {
      /* This is the above pseudo-child.  It has the BB itself as an
	 additional child node.  */
      node_child = sese->high;
      hi_child = node_child.second;
      if (node_child.first)
	hi_child += BB_GET_SESE (node_child.first)->node;
      num_children++;
    }

  /* Examine each edge.
     - if it is a child (a) append its bracket list and (b) record
       whether it is the child with the highest reaching bracket.
     - if it is an edge to ancestor, record whether it's the highest
       reaching backlink.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);

      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
	    {
	      /* Child node.  Append its bracket list.  */
	      num_children++;
	      sese->append (t_sese);

	      /* Compare its hi value.  */
	      int t_hi = t_sese->high.second;

	      if (basic_block child_hi_block = t_sese->high.first)
		t_hi += BB_GET_SESE (child_hi_block)->node;

	      if (hi_child > t_hi)
		{
		  hi_child = t_hi;
		  node_child = t_sese->high;
		  child = target;
		}
	    }
	  else if (t_sese->node < sese->node + dir
		   && !(dir < 0 && sese->parent == t_sese->node))
	    {
	      /* Non-parental ancestor node -- a backlink.  */
	      int d = usd * t_sese->dir;
	      int back = t_sese->node + d;

	      if (hi_back > back)
		{
		  hi_back = back;
		  node_back = pseudo_node_t (target, d);
		}
	    }
	}
      else
	{ /* Fallen off graph, backlink to entry node.  */
	  hi_back = 0;
	  node_back = pseudo_node_t (0, 0);
	}
    }

  /* Remove any brackets that terminate at this pseudo node.  */
  sese->remove (pseudo_node_t (me, dir));

  /* Now push any backlinks from this pseudo node.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);
      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->node < sese->node + dir
	      && !(dir < 0 && sese->parent == t_sese->node))
	    /* Non-parental ancestor node - backedge from me.  */
	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
	}
      else
	{
	  /* back edge to entry node */
	  sese->push (pseudo_node_t (0, 0));
	}
    }

  /* If this node leads directly or indirectly to a no-return region of
     the graph, then fake a backedge to entry node.  */
  if (!sese->brackets.length () || !edges || !edges->length ())
    {
      hi_back = 0;
      node_back = pseudo_node_t (0, 0);
      sese->push (node_back);
    }

  /* Record the highest reaching backedge from us or a descendant.  */
  sese->high = hi_back < hi_child ? node_back : node_child;

  if (num_children > 1)
    {
      /* There is more than one child -- this is a Y shaped piece of
	 spanning tree.  We have to insert a fake backedge from this
	 node to the highest ancestor reached by not-the-highest
	 reaching child.  Note that there may be multiple children
	 with backedges to the same highest node.  That's ok and we
	 insert the edge to that highest node.  */
      hi_child = depth;
      if (dir < 0 && child)
	{
	  node_child = sese->high;
	  hi_child = node_child.second;
	  if (node_child.first)
	    hi_child += BB_GET_SESE (node_child.first)->node;
	}

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target == child)
	    /* Ignore the highest child.  */
	    continue;

	  bb_sese *t_sese = BB_GET_SESE (target);
	  if (!t_sese)
	    continue;
	  if (t_sese->parent != sese->node)
	    /* Not a child.  */
	    continue;

	  /* Compare its hi value.  */
	  int t_hi = t_sese->high.second;

	  if (basic_block child_hi_block = t_sese->high.first)
	    t_hi += BB_GET_SESE (child_hi_block)->node;

	  if (hi_child > t_hi)
	    {
	      hi_child = t_hi;
	      node_child = t_sese->high;
	    }
	}

      sese->push (node_child);
    }
}

/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
   proceed to successors.  Set SESE entry and exit nodes of
   REGIONS.  */

static void
nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
		  basic_block block, int coloring)
{
  bb_sese *sese = BB_GET_SESE (block);

  if (block->flags & BB_VISITED)
    {
      /* If we've already encountered this block, either we must not
	 be coloring, or it must have been colored the current color.  */
      gcc_assert (coloring < 0 || (sese && coloring == sese->color));
      return;
    }

  block->flags |= BB_VISITED;

  if (sese)
    {
      if (coloring < 0)
	{
	  /* Start coloring a region.  */
	  regions[sese->color].first = block;
	  coloring = sese->color;
	}

      if (!--color_counts[sese->color] && sese->color == coloring)
	{
	  /* Found final block of SESE region.  */
	  regions[sese->color].second = block;
	  coloring = -1;
	}
      else
	/* Color the node, so we can assert on revisiting the node
	   that the graph is indeed SESE.  */
	sese->color = coloring;
    }
  else
    /* Fallen off the subgraph, we cannot be coloring.  */
    gcc_assert (coloring < 0);

  /* Walk each successor block.  */
  if (block->succs && block->succs->length ())
    {
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, block->succs)
	nvptx_sese_color (color_counts, regions, e->dest, coloring);
    }
  else
    gcc_assert (coloring < 0);
}

/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
   end up with NULL entries in it.  */

static void
nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
{
  basic_block block;
  int ix;

  /* First clear each BB of the whole function.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      block->flags &= ~BB_VISITED;
      BB_SET_SESE (block, 0);
    }

  /* Mark blocks in the function that are in this graph.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags |= BB_VISITED;

  /* Counts of nodes assigned to each color.  There cannot be more
     colors than blocks (and hopefully there will be fewer).  */
  auto_vec<unsigned> color_counts;
  color_counts.reserve (blocks.length ());

  /* Worklist of nodes in the spanning tree.  Again, there cannot be
     more nodes in the tree than blocks (there will be fewer if the
     CFG of blocks is disjoint).  */
  auto_vec<basic_block> spanlist;
  spanlist.reserve (blocks.length ());

  /* Make sure every block has its cycle class determined.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    {
      if (BB_GET_SESE (block))
	/* We already met this block in an earlier graph solve.  */
	continue;

      if (dump_file)
	fprintf (dump_file, "Searching graph starting at %d\n", block->index);

      /* Number the nodes reachable from block initial DFS order.  */
      int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);

      /* Now walk in reverse DFS order to find cycle equivalents.  */
      while (spanlist.length ())
	{
	  block = spanlist.pop ();
	  bb_sese *sese = BB_GET_SESE (block);

	  /* Do the pseudo node below.  */
	  nvptx_sese_pseudo (block, sese, depth, +1,
			     sese->dir > 0 ? block->succs : block->preds,
			     (sese->dir > 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	  sese->set_color (color_counts);
	  /* Do the pseudo node above.  */
	  nvptx_sese_pseudo (block, sese, depth, -1,
			     sese->dir < 0 ? block->succs : block->preds,
			     (sese->dir < 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	}
      if (dump_file)
	fprintf (dump_file, "\n");
    }

  if (dump_file)
    {
      unsigned count;
      const char *comma = "";

      fprintf (dump_file, "Found %d cycle equivalents\n",
	       color_counts.length ());
      for (ix = 0; color_counts.iterate (ix, &count); ix++)
	{
	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);

	  comma = "";
	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
	    if (BB_GET_SESE (block)->color == ix)
	      {
		block->flags |= BB_VISITED;
		fprintf (dump_file, "%s%d", comma, block->index);
		comma = ",";
	      }
	  fprintf (dump_file, "}");
	  comma = ", ";
	}
      fprintf (dump_file, "\n");
    }

  /* Now we've colored every block in the subgraph.  We now need to
     determine the minimal set of SESE regions that cover that
     subgraph.  Do this with a DFS walk of the complete function.
     During the walk we're either 'looking' or 'coloring'.  When we
     reach the last node of a particular color, we stop coloring and
     return to looking.  */

  /* There cannot be more SESE regions than colors.  */
  regions.reserve (color_counts.length ());
  for (ix = color_counts.length (); ix--;)
    regions.quick_push (bb_pair_t (0, 0));

  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags &= ~BB_VISITED;

  nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);

  if (dump_file)
    {
      const char *comma = "";
      int len = regions.length ();

      fprintf (dump_file, "SESE regions:");
      for (ix = 0; ix != len; ix++)
	{
	  basic_block from = regions[ix].first;
	  basic_block to = regions[ix].second;

	  if (from)
	    {
	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
	      if (to != from)
		fprintf (dump_file, "->%d", to->index);

	      int color = BB_GET_SESE (from)->color;

	      /* Print the blocks within the region (excluding ends).  */
	      FOR_EACH_BB_FN (block, cfun)
		{
		  bb_sese *sese = BB_GET_SESE (block);

		  if (sese && sese->color == color
		      && block != from && block != to)
		    fprintf (dump_file, ".%d", block->index);
		}
	      fprintf (dump_file, "}");
	    }
	  comma = ",";
	}
      fprintf (dump_file, "\n\n");
    }

  for (ix = 0; blocks.iterate (ix, &block); ix++)
    delete BB_GET_SESE (block);
}

/* Propagate live state at the start of a partitioned region.  IS_CALL
   indicates whether the propagation is for a (partitioned) call
   instruction.  BLOCK provides the live register information, and
   might not contain INSN.  Propagation is inserted just after INSN.  RW
   indicates whether we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for single
   register.  DATA is user data.

   Returns true if we didn't emit any instructions.

   We propagate the live register set for non-calls and the entire
   frame for calls and non-calls.  We could do better by (a)
   propagating just the live set that is used within the partitioned
   regions and (b) only propagating stack entries that are used.  The
   latter might be quite hard to determine.  */

typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);

static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
		 propagate_mask rw, propagator_fn fn, void *data)
{
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;
  bool empty = true;

  /* Copy the frame array.  */
  HOST_WIDE_INT fs = get_frame_size ();
  if (fs)
    {
      rtx tmp = gen_reg_rtx (DImode);
      rtx idx = NULL_RTX;
      rtx ptr = gen_reg_rtx (Pmode);
      rtx pred = NULL_RTX;
      rtx_code_label *label = NULL;

      empty = false;
      /* The frame size might not be DImode compatible, but the frame
	 array's declaration will be.  So it's ok to round up here.  */
      fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
      /* Detect single iteration loop.  */
      if (fs == 1)
	fs = 0;

      start_sequence ();
      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
      if (fs)
	{
	  idx = gen_reg_rtx (SImode);
	  pred = gen_reg_rtx (BImode);
	  label = gen_label_rtx ();

	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
	  /* Allow worker function to initialize anything needed.  */
	  rtx init = fn (tmp, PM_loop_begin, fs, data);
	  if (init)
	    emit_insn (init);
	  emit_label (label);
	  LABEL_NUSES (label)++;
	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
	}
      if (rw & PM_read)
	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
      emit_insn (fn (tmp, rw, fs, data));
      if (rw & PM_write)
	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
      if (fs)
	{
	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
	  emit_insn (gen_br_true_uni (pred, label));
	  rtx fini = fn (tmp, PM_loop_end, fs, data);
	  if (fini)
	    emit_insn (fini);
	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
	}
      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
      rtx cpy = get_insns ();
      end_sequence ();
      insn = emit_insn_after (cpy, insn);
    }

  if (!is_call)
    /* Copy live registers.  */
    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
      {
	rtx reg = regno_reg_rtx[ix];

	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
	  {
	    rtx bcast = fn (reg, rw, 0, data);

	    insn = emit_insn_after (bcast, insn);
	    empty = false;
	  }
      }
  return empty;
}
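
/* For a frame of, say, 24 bytes the emitted copy loop is roughly the
   following (PTX-flavored pseudo code, register names invented):

	mov	ptr, %frame;
	mov	idx, 3;		// 24 bytes == 3 DImode words
     loop:
	add	idx, idx, -1;
	ld	tmp, [ptr];	// when RW includes PM_read
	<FN's propagation of tmp, e.g. shuffle or shared-memory copy>
	st	[ptr], tmp;	// when RW includes PM_write
	setp	pred, idx != 0;
	add	ptr, ptr, 8;
	@pred	bra loop;
  */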

/* Worker for nvptx_vpropagate.  */

static rtx
vprop_gen (rtx reg, propagate_mask pm,
	   unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
{
  if (!(pm & PM_read_write))
    return 0;

  return nvptx_gen_vcast (reg);
}

/* Propagate state that is live at start of BLOCK across the vectors
   of a single warp.  Propagation is inserted just after INSN.
   IS_CALL and return as for nvptx_propagate.  */

static bool
nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
{
  return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
}

/* Worker for nvptx_wpropagate.  */

static rtx
wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
{
  wcast_data_t *data = (wcast_data_t *)data_;

  if (pm & PM_loop_begin)
    {
      /* Starting a loop, initialize pointer.  */
      unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;

      if (align > worker_bcast_align)
	worker_bcast_align = align;
      data->offset = (data->offset + align - 1) & ~(align - 1);

      data->ptr = gen_reg_rtx (Pmode);

      return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
    }
  else if (pm & PM_loop_end)
    {
      rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
      data->ptr = NULL_RTX;
      return clobber;
    }
  else
    return nvptx_gen_wcast (reg, pm, rep, data);
}

/* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill).  Sequence is inserted just after
   INSN.  IS_CALL and return as for nvptx_propagate.  */

static bool
nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
{
  wcast_data_t data;

  data.base = gen_reg_rtx (Pmode);
  data.offset = 0;
  data.ptr = NULL_RTX;

  bool empty = nvptx_propagate (is_call, block, insn,
				pre_p ? PM_read : PM_write, wprop_gen, &data);
  gcc_assert (empty == !data.offset);
  if (data.offset)
    {
      /* Stuff was emitted, initialize the base pointer now.  */
      rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
      emit_insn_after (init, insn);

      if (worker_bcast_size < data.offset)
	worker_bcast_size = data.offset;
    }
  return empty;
}
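
/* A minimal sketch of the resulting pairing: on the PM_read side the
   active worker stores each live value into the broadcast buffer
   ("st.shared [bcast+off], reg" in spirit), and on the PM_write side,
   after an intervening bar.sync, every worker loads it back
   ("ld.shared reg, [bcast+off]").  The offsets are the ones
   accumulated in data.offset above; the exact instructions come from
   nvptx_gen_wcast.  */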

/* Emit a worker-level synchronization barrier.  We use different
   markers for before and after synchronizations.  */

static rtx
nvptx_wsync (bool after)
{
  return gen_nvptx_barsync (GEN_INT (after));
}

#if WORKAROUND_PTXJIT_BUG
/* Return first real insn in BB, or return NULL_RTX if BB does not contain
   real insns.  */

static rtx_insn *
bb_first_real_insn (basic_block bb)
{
  rtx_insn *insn;

  /* Find first insn of from block.  */
  FOR_BB_INSNS (bb, insn)
    if (INSN_P (insn))
      return insn;

  return 0;
}
#endif

/* Return true if INSN needs neutering.  */

static bool
needs_neutering_p (rtx_insn *insn)
{
  if (!INSN_P (insn))
    return false;

  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
    case CODE_FOR_nvptx_barsync:
      return false;
    default:
      return true;
    }
}

/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.  */

static bool
verify_neutering_jumps (basic_block from,
			rtx_insn *vector_jump, rtx_insn *worker_jump,
			rtx_insn *vector_label, rtx_insn *worker_label)
{
  basic_block bb = from;
  rtx_insn *insn = BB_HEAD (bb);
  bool seen_worker_jump = false;
  bool seen_vector_jump = false;
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  bool worker_neutered = false;
  bool vector_neutered = false;
  while (true)
    {
      if (insn == worker_jump)
	{
	  seen_worker_jump = true;
	  worker_neutered = true;
	  gcc_assert (!vector_neutered);
	}
      else if (insn == vector_jump)
	{
	  seen_vector_jump = true;
	  vector_neutered = true;
	}
      else if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (worker_neutered);
	  worker_neutered = false;
	}
      else if (insn == vector_label)
	{
	  seen_vector_label = true;
	  gcc_assert (vector_neutered);
	  vector_neutered = false;
	}
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    gcc_assert (!vector_neutered && !worker_neutered);
	    break;
	  default:
	    break;
	  }

      if (insn != BB_END (bb))
	insn = NEXT_INSN (insn);
      else if (JUMP_P (insn) && single_succ_p (bb)
	       && !seen_vector_jump && !seen_worker_jump)
	{
	  bb = single_succ (bb);
	  insn = BB_HEAD (bb);
	}
      else
	break;
    }

  gcc_assert (!(vector_jump && !seen_vector_jump));
  gcc_assert (!(worker_jump && !seen_worker_jump));

  if (seen_vector_label || seen_worker_label)
    {
      gcc_assert (!(vector_label && !seen_vector_label));
      gcc_assert (!(worker_label && !seen_worker_label));
    }

  return seen_vector_label || seen_worker_label;
}

/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  */

static void
verify_neutering_labels (basic_block to, rtx_insn *vector_label,
			 rtx_insn *worker_label)
{
  basic_block bb = to;
  rtx_insn *insn = BB_END (bb);
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  while (true)
    {
      if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (!seen_vector_label);
	}
      else if (insn == vector_label)
	seen_vector_label = true;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    gcc_assert (!seen_vector_label && !seen_worker_label);
	    break;
	  }

      if (insn != BB_HEAD (bb))
	insn = PREV_INSN (insn);
      else
	break;
    }

  gcc_assert (!(vector_label && !seen_vector_label));
  gcc_assert (!(worker_label && !seen_worker_label));
}

/* Single neutering according to MASK.  FROM is the incoming block and
   TO is the outgoing block.  These may be the same block.  Insert at
   start of FROM:

     if (tid.<axis>) goto end.

   and insert before ending branch of TO (if there is such an insn):

     end:
     <possibly-broadcast-cond>
     <branch>

   We currently only use different FROM and TO when skipping an entire
   loop.  We could do more if we detected superblocks.  */

static void
nvptx_single (unsigned mask, basic_block from, basic_block to)
{
  rtx_insn *head = BB_HEAD (from);
  rtx_insn *tail = BB_END (to);
  unsigned skip_mask = mask;

  while (true)
    {
      /* Find first insn of from block.  */
      while (head != BB_END (from) && !needs_neutering_p (head))
	head = NEXT_INSN (head);

      if (from == to)
	break;

      if (!(JUMP_P (head) && single_succ_p (from)))
	break;

      basic_block jump_target = single_succ (from);
      if (!single_pred_p (jump_target))
	break;

      from = jump_target;
      head = BB_HEAD (from);
    }

  /* Find last insn of to block */
  rtx_insn *limit = from == to ? head : BB_HEAD (to);
  while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
    tail = PREV_INSN (tail);

  /* Detect if tail is a branch.  */
  rtx tail_branch = NULL_RTX;
  rtx cond_branch = NULL_RTX;
  if (tail && INSN_P (tail))
    {
      tail_branch = PATTERN (tail);
      if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
	tail_branch = NULL_RTX;
      else
	{
	  cond_branch = SET_SRC (tail_branch);
	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
	    cond_branch = NULL_RTX;
	}
    }

  if (tail == head)
    {
      /* If this is empty, do nothing.  */
      if (!head || !needs_neutering_p (head))
	return;

      if (cond_branch)
	{
	  /* If we're only doing vector single, there's no need to
	     emit skip code because we'll not insert anything.  */
	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
	    skip_mask = 0;
	}
      else if (tail_branch)
	/* Block with only unconditional branch.  Nothing to do.  */
	return;
    }

  /* Insert the vector test inside the worker test.  */
  unsigned mode;
  rtx_insn *before = tail;
  rtx_insn *neuter_start = NULL;
  rtx_insn *worker_label = NULL, *vector_label = NULL;
  rtx_insn *worker_jump = NULL, *vector_jump = NULL;
  for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    if (GOMP_DIM_MASK (mode) & skip_mask)
      {
	rtx_code_label *label = gen_label_rtx ();
	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
	rtx_insn **mode_jump
	  = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
	rtx_insn **mode_label
	  = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;

	if (!pred)
	  {
	    pred = gen_reg_rtx (BImode);
	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
	  }

	rtx br;
	if (mode == GOMP_DIM_VECTOR)
	  br = gen_br_true (pred, label);
	else
	  br = gen_br_true_uni (pred, label);
	if (neuter_start)
	  neuter_start = emit_insn_after (br, neuter_start);
	else
	  neuter_start = emit_insn_before (br, head);
	*mode_jump = neuter_start;

	LABEL_NUSES (label)++;
	rtx_insn *label_insn;
	if (tail_branch)
	  {
	    label_insn = emit_label_before (label, before);
	    before = label_insn;
	  }
	else
	  {
	    label_insn = emit_label_after (label, tail);
	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
	      emit_insn_after (gen_exit (), label_insn);
	  }

	if (mode == GOMP_DIM_VECTOR)
	  vector_label = label_insn;
	else
	  worker_label = label_insn;
      }

  /* Now deal with propagating the branch condition.  */
  if (cond_branch)
    {
      rtx pvar = XEXP (XEXP (cond_branch, 0), 0);

      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
	{
	  /* Vector mode only, do a shuffle.  */
#if WORKAROUND_PTXJIT_BUG
	  /* The branch condition %rcond is propagated like this:

		{
		    .reg .u32 %x;
		    mov.u32 %x,%tid.x;
		    setp.ne.u32 %rnotvzero,%x,0;
		}

		@%rnotvzero bra Lskip;
		setp.<op>.<type> %rcond,op1,op2;
		Lskip:
		selp.u32 %rcondu32,1,0,%rcond;
		shfl.idx.b32 %rcondu32,%rcondu32,0,31;
		setp.ne.u32 %rcond,%rcondu32,0;

	     There seems to be a bug in the ptx JIT compiler (observed at driver
	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
	     unless %rcond is initialized to something before 'bra Lskip'.  The
	     bug is not observed with ptxas from cuda 8.0.61.

	     It is true that the code is non-trivial: at Lskip, %rcond is
	     uninitialized in threads 1-31, and after the selp the same holds
	     for %rcondu32.  But shfl propagates the defined value in thread 0
	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
	     0-31, and after the setp.ne %rcond is defined in threads 0-31.

	     There is nothing in the PTX spec to suggest that this is wrong, or
	     to explain why the extra initialization is needed.  So, we classify
	     it as a JIT bug, and the extra initialization as workaround:

		{
		    .reg .u32 %x;
		    mov.u32 %x,%tid.x;
		    setp.ne.u32 %rnotvzero,%x,0;
		}

		+.reg .pred %rcond2;
		+setp.eq.u32 %rcond2, 1, 0;

		@%rnotvzero bra Lskip;
		setp.<op>.<type> %rcond,op1,op2;
		+mov.pred %rcond2, %rcond;
		Lskip:
		+mov.pred %rcond, %rcond2;
		selp.u32 %rcondu32,1,0,%rcond;
		shfl.idx.b32 %rcondu32,%rcondu32,0,31;
		setp.ne.u32 %rcond,%rcondu32,0;
	  */
	  rtx_insn *label = PREV_INSN (tail);
	  gcc_assert (label && LABEL_P (label));
	  rtx tmp = gen_reg_rtx (BImode);
	  emit_insn_before (gen_movbi (tmp, const0_rtx),
			    bb_first_real_insn (from));
	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
#endif
	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
	}
      else
	{
	  /* Includes worker mode, do spill & fill.  By construction
	     we should never have worker mode only.  */
	  wcast_data_t data;

	  data.base = worker_bcast_sym;
	  data.ptr = 0;

	  if (worker_bcast_size < GET_MODE_SIZE (SImode))
	    worker_bcast_size = GET_MODE_SIZE (SImode);

	  data.offset = 0;
	  emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
			    before);
	  /* Barrier so other workers can see the write.  */
	  emit_insn_before (nvptx_wsync (false), tail);
	  data.offset = 0;
	  emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
	  /* This barrier is needed to avoid worker zero clobbering
	     the broadcast buffer before all the other workers have
	     had a chance to read this instance of it.  */
	  emit_insn_before (nvptx_wsync (true), tail);
	}

      extract_insn (tail);
      rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
				 UNSPEC_BR_UNIFIED);
      validate_change (tail, recog_data.operand_loc[0], unsp, false);
    }

  bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
					    vector_label, worker_label);
  if (seen_label)
    verify_neutering_labels (to, vector_label, worker_label);
}
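
/* Putting the pieces together, neutering one block for both worker
   and vector axes produces a skeleton like this (labels and registers
   invented):

	@ %worker_pred	bra.uni	Lw;
	@ %vector_pred	bra	Lv;
	... partitioned code, now executed by a single thread ...
     Lv:
	<vector-level broadcast of the branch condition, if any>
     Lw:
	<worker-level spill, bar.sync, fill>
	<the trailing branch, rewritten as a unified branch>
  */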

/* PAR is a parallel that is being skipped in its entirety according to
   MASK.  Treat this as skipping a superblock starting at forked
   and ending at joining.  */

static void
nvptx_skip_par (unsigned mask, parallel *par)
{
  basic_block tail = par->join_block;
  gcc_assert (tail->preds->length () == 1);

  basic_block pre_tail = (*tail->preds)[0]->src;
  gcc_assert (pre_tail->succs->length () == 1);

  nvptx_single (mask, par->forked_block, pre_tail);
}

/* If PAR has a single inner parallel and PAR itself only contains
   empty entry and exit blocks, swallow the inner PAR.  */

static void
nvptx_optimize_inner (parallel *par)
{
  parallel *inner = par->inner;

  /* We mustn't be the outer dummy par.  */
  if (!par->mask)
    return;

  /* We must have a single inner par.  */
  if (!inner || inner->next)
    return;

  /* We must only contain 2 blocks ourselves -- the head and tail of
     the inner par.  */
  if (par->blocks.length () != 2)
    return;

  /* We must be disjoint partitioning.  As we only have vector and
     worker partitioning, this is sufficient to guarantee the pars
     have adjacent partitioning.  */
  if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
    /* This indicates malformed code generation.  */
    return;

  /* The outer forked insn should be immediately followed by the inner
     fork insn.  */
  rtx_insn *forked = par->forked_insn;
  rtx_insn *fork = BB_END (par->forked_block);

  if (NEXT_INSN (forked) != fork)
    return;
  gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);

  /* The outer joining insn must immediately follow the inner join
     insn.  */
  rtx_insn *joining = par->joining_insn;
  rtx_insn *join = inner->join_insn;
  if (NEXT_INSN (join) != joining)
    return;

  /* Preconditions met.  Swallow the inner par.  */
  if (dump_file)
    fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
	     inner->mask, inner->forked_block->index,
	     inner->join_block->index,
	     par->mask, par->forked_block->index, par->join_block->index);

  par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);

  par->blocks.reserve (inner->blocks.length ());
  while (inner->blocks.length ())
    par->blocks.quick_push (inner->blocks.pop ());

  par->inner = inner->inner;
  inner->inner = NULL;

  delete inner;
}
/* Process the parallel PAR and all its contained
   parallels.  We do everything but the neutering.  Return mask of
   partitioned modes used within this parallel.  */

static unsigned
nvptx_process_pars (parallel *par)
{
  if (nvptx_optimize)
    nvptx_optimize_inner (par);

  unsigned inner_mask = par->mask;

  /* Do the inner parallels first.  */
  if (par->inner)
    {
      par->inner_mask = nvptx_process_pars (par->inner);
      inner_mask |= par->inner_mask;
    }

  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;

  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
    {
      nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
      bool empty = nvptx_wpropagate (true, is_call,
				     par->forked_block, par->fork_insn);

      if (!empty || !is_call)
	{
	  /* Insert begin and end synchronizations.  */
	  emit_insn_before (nvptx_wsync (false), par->forked_insn);
	  emit_insn_before (nvptx_wsync (true), par->join_insn);
	}
    }
  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
    nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);

  /* Now do siblings.  */
  if (par->next)
    inner_mask |= nvptx_process_pars (par->next);
  return inner_mask;
}
/* Neuter the parallel described by PAR.  We recurse in depth-first
   order.  MODES are the partitioning of the execution and OUTER is
   the partitioning of the parallels we are contained in.  */

static void
nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
{
  unsigned me = (par->mask
		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
  unsigned skip_mask = 0, neuter_mask = 0;

  if (par->inner)
    nvptx_neuter_pars (par->inner, modes, outer | me);

  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    {
      if ((outer | me) & GOMP_DIM_MASK (mode))
	{} /* Mode is partitioned: no neutering.  */
      else if (!(modes & GOMP_DIM_MASK (mode)))
	{} /* Mode is not used: nothing to do.  */
      else if (par->inner_mask & GOMP_DIM_MASK (mode)
	       || !par->forked_insn)
	/* Partitioned in inner parallels, or we're not partitioned
	   at all: neuter individual blocks.  */
	neuter_mask |= GOMP_DIM_MASK (mode);
      else if (!par->parent || !par->parent->forked_insn
	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
	/* Parent isn't a parallel or contains this paralleling: skip
	   parallel at this level.  */
	skip_mask |= GOMP_DIM_MASK (mode);
      else
	{} /* Parent will skip this parallel itself.  */
    }

  if (neuter_mask)
    {
      int ix, len;

      if (nvptx_optimize)
	{
	  /* Neuter whole SESE regions.  */
	  bb_pair_vec_t regions;

	  nvptx_find_sese (par->blocks, regions);
	  len = regions.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block from = regions[ix].first;
	      basic_block to = regions[ix].second;

	      if (from)
		nvptx_single (neuter_mask, from, to);
	      else
		gcc_assert (!to);
	    }
	}
      else
	{
	  /* Neuter each BB individually.  */
	  len = par->blocks.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block block = par->blocks[ix];

	      nvptx_single (neuter_mask, block, block);
	    }
	}
    }

  if (skip_mask)
    nvptx_skip_par (skip_mask, par);

  if (par->next)
    nvptx_neuter_pars (par->next, modes, outer);
}
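
/* As an illustration of the classification above: a worker loop whose
   body contains a vector loop leaves its own blocks vector-single, and
   because the inner parallel does the vector partitioning, those
   blocks are neutered block-by-block (NEUTER_MASK); a parallel that
   must run single for some mode but has no enclosing parallel able to
   do the skipping is instead branched around wholesale via
   nvptx_skip_par (SKIP_MASK).  */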
#if WORKAROUND_PTXJIT_BUG_2
/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
   is needed in the nvptx target because the branches generated for
   partitioning are NONJUMP_INSN_P, not JUMP_P.  */

static rtx
nvptx_pc_set (const rtx_insn *insn, bool strict = true)
{
  rtx pat;
  if ((strict && !JUMP_P (insn))
      || (!strict && !INSN_P (insn)))
    return NULL_RTX;
  pat = PATTERN (insn);

  /* The set is allowed to appear either as the insn pattern or
     the first set in a PARALLEL.  */
  if (GET_CODE (pat) == PARALLEL)
    pat = XVECEXP (pat, 0, 0);
  if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
    return pat;

  return NULL_RTX;
}

/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT.  */

static rtx
nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
{
  rtx x = nvptx_pc_set (insn, strict);

  if (!x)
    return NULL_RTX;

  x = SET_SRC (x);
  if (GET_CODE (x) == LABEL_REF)
    return x;

  if (GET_CODE (x) != IF_THEN_ELSE)
    return NULL_RTX;

  if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
    return XEXP (x, 1);

  if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
    return XEXP (x, 2);

  return NULL_RTX;
}

/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
   insn in between the branch and the label.  This works around a JIT bug
   observed at driver version 384.111, at -O0 for sm_50.  */

static void
prevent_branch_around_nothing (void)
{
  rtx_insn *seen_label = NULL;
  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
    {
      if (INSN_P (insn) && condjump_p (insn))
	{
	  seen_label = label_ref_label (nvptx_condjump_label (insn, false));
	  continue;
	}

      if (seen_label == NULL)
	continue;

      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
	continue;

      if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_fork:
	  case CODE_FOR_nvptx_forked:
	  case CODE_FOR_nvptx_joining:
	  case CODE_FOR_nvptx_join:
	    continue;
	  default:
	    seen_label = NULL;
	    continue;
	  }

      if (LABEL_P (insn) && insn == seen_label)
	emit_insn_before (gen_fake_nop (), insn);

      seen_label = NULL;
    }
}
#endif
#ifdef WORKAROUND_PTXJIT_BUG_3
/* Insert two membar.cta insns in between two subsequent bar.sync insns.  This
   works around a hang observed at driver version 390.48 for sm_50.  */

static void
workaround_barsyncs (void)
{
  bool seen_barsync = false;
  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
    {
      if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
	{
	  if (seen_barsync)
	    {
	      emit_insn_before (gen_nvptx_membar_cta (), insn);
	      emit_insn_before (gen_nvptx_membar_cta (), insn);
	    }

	  seen_barsync = true;
	  continue;
	}

      if (!seen_barsync)
	continue;

      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
	continue;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_fork:
	  case CODE_FOR_nvptx_forked:
	  case CODE_FOR_nvptx_joining:
	  case CODE_FOR_nvptx_join:
	    continue;
	  default:
	    break;
	  }

      seen_barsync = false;
    }
}
#endif
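
/* For example (illustrative PTX), two back-to-back barriers

     bar.sync 0;
     bar.sync 0;

   are rewritten by the loop above as

     bar.sync 0;
     membar.cta;
     membar.cta;
     bar.sync 0;

   which avoids the observed hang.  */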
/* PTX-specific reorganization
   - Split blocks at fork and join instructions
   - Compute live registers
   - Mark now-unused registers, so function begin doesn't declare
     unused registers.
   - Insert state propagation when entering partitioned mode
   - Insert neutering instructions when in single mode
   - Replace subregs with suitable sequences.
*/

static void
nvptx_reorg (void)
{
  /* We are freeing block_for_insn in the toplev to keep compatibility
     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
  compute_bb_for_insn ();

  thread_prologue_and_epilogue_insns ();

  /* Split blocks and record interesting unspecs.  */
  bb_insn_map_t bb_insn_map;

  nvptx_split_blocks (&bb_insn_map);

  /* Compute live regs.  */
  df_clear_flags (DF_LR_RUN_DCE);
  df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
  df_live_add_problem ();
  df_live_set_all_dirty ();
  df_analyze ();
  regstat_init_n_sets_and_refs ();

  if (dump_file)
    df_dump (dump_file);

  /* Mark unused regs as unused.  */
  int max_regs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
    if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
      regno_reg_rtx[i] = const0_rtx;

  /* Determine launch dimensions of the function.  If it is not an
     offloaded function (i.e. this is a regular compiler), the
     function has no neutering.  */
  tree attr = oacc_get_fn_attrib (current_function_decl);
  if (attr)
    {
      /* If we determined this mask before RTL expansion, we could
	 elide emission of some levels of forks and joins.  */
      unsigned mask = 0;
      tree dims = TREE_VALUE (attr);
      unsigned ix;

      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
	{
	  int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
	  tree allowed = TREE_PURPOSE (dims);

	  if (size != 1 && !(allowed && integer_zerop (allowed)))
	    mask |= GOMP_DIM_MASK (ix);
	}
      /* If there is worker neutering, there must be vector
	 neutering.  Otherwise the hardware will fail.  */
      gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
		  || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

      /* Discover & process partitioned regions.  */
      parallel *pars = nvptx_discover_pars (&bb_insn_map);
      nvptx_process_pars (pars);
      nvptx_neuter_pars (pars, mask, 0);
      delete pars;
    }

  /* Replace subregs.  */
  nvptx_reorg_subreg ();

  if (TARGET_UNIFORM_SIMT)
    nvptx_reorg_uniform_simt ();

#if WORKAROUND_PTXJIT_BUG_2
  prevent_branch_around_nothing ();
#endif

#ifdef WORKAROUND_PTXJIT_BUG_3
  workaround_barsyncs ();
#endif

  regstat_free_n_sets_and_refs ();

  df_finish_pass (true);
}
/* Handle a "kernel" attribute; arguments as in
   struct attribute_spec.handler.  */

static tree
nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
			       int ARG_UNUSED (flags), bool *no_add_attrs)
{
  tree decl = *node;

  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute only applies to functions", name);
      *no_add_attrs = true;
    }
  else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
    {
      error ("%qE attribute requires a void return type", name);
      *no_add_attrs = true;
    }

  return NULL_TREE;
}
/* Handle a "shared" attribute; arguments as in
   struct attribute_spec.handler.  */

static tree
nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
			       int ARG_UNUSED (flags), bool *no_add_attrs)
{
  tree decl = *node;

  if (TREE_CODE (decl) != VAR_DECL)
    {
      error ("%qE attribute only applies to variables", name);
      *no_add_attrs = true;
    }
  else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
    {
      error ("%qE attribute not allowed with auto storage class", name);
      *no_add_attrs = true;
    }

  return NULL_TREE;
}
/* Table of valid machine attributes.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
    NULL },
  { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
    NULL },
  { NULL, 0, 0, false, false, false, false, NULL, NULL }
};
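
/* In user code these attributes are used roughly as follows
   (illustrative; not part of this file):

     __attribute__((kernel)) void entry (int *in, int *out);
     static int scratch __attribute__((shared));

   "kernel" marks a function as a PTX entry point and requires a void
   return type; "shared" places a variable with static storage in the
   .shared memory space, as checked by the handlers above.  */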
/* Limit vector alignments to BIGGEST_ALIGNMENT.  */

static HOST_WIDE_INT
nvptx_vector_alignment (const_tree type)
{
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));

  return MIN (align, BIGGEST_ALIGNMENT);
}
/* Indicate that INSN cannot be duplicated.  */

static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      return true;
    default:
      return false;
    }
}
/* Section anchors do not work.  Initialization for flag_section_anchor
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be being used with
   a host-side compiler that does support anchoring, and hence see
   the anchor flag set (as it's not recalculated).  So provide an
   implementation denying anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  return false;
}
/* Record a symbol for mkoffload to enter into the mapping table.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = oacc_get_fn_attrib (decl);
	/* OpenMP offloading does not set this attribute.  */
	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;

	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	for (; dims; dims = TREE_CHAIN (dims))
	  {
	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

	    gcc_assert (!TREE_PURPOSE (dims));
	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}
/* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
   at the start of a file.  */

static void
nvptx_file_start (void)
{
  fputs ("// BEGIN PREAMBLE\n", asm_out_file);
  fputs ("\t.version\t3.1\n", asm_out_file);
  if (TARGET_SM35)
    fputs ("\t.target\tsm_35\n", asm_out_file);
  else
    fputs ("\t.target\tsm_30\n", asm_out_file);
  fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
  fputs ("// END PREAMBLE\n", asm_out_file);
}
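
/* For a 64-bit sm_35 configuration the code above emits, for example:

     // BEGIN PREAMBLE
	     .version	3.1
	     .target	sm_35
	     .address_size 64
     // END PREAMBLE
*/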
/* Emit a declaration for a worker-level buffer in .shared memory.  */

static void
write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
{
  const char *name = XSTR (sym, 0);

  write_var_marker (file, true, false, name);
  fprintf (file, ".shared .align %d .u8 %s[%d];\n",
	   align, name, size);
}
/* Write out the function declarations we've collected and declare storage
   for the broadcast buffer.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl);
  fputs (func_decls.str().c_str(), asm_out_file);

  if (worker_bcast_size)
    write_worker_buffer (asm_out_file, worker_bcast_sym,
			 worker_bcast_align, worker_bcast_size);

  if (worker_red_size)
    write_worker_buffer (asm_out_file, worker_red_sym,
			 worker_red_align, worker_red_size);

  if (need_softstack_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
      /* 32 is the maximum number of warps in a block.  Even though it's an
	 external declaration, emit the array size explicitly; otherwise, it
	 may fail at PTX JIT time if the definition is later in link order.  */
      fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
	       POINTER_SIZE);
    }
  if (need_unisimt_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_uni");
      fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
    }
}
/* Expander for the shuffle builtins.  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  if (ignore)
    return target;

  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
			NULL_RTX, SImode, EXPAND_NORMAL);

  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  rtx pat = nvptx_gen_shuffle (target, src, idx,
			       (nvptx_shuffle_kind) INTVAL (op));
  emit_insn (pat);

  return target;
}
/* Worker reduction address expander.  */

static rtx
nvptx_expand_worker_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  if (align > worker_red_align)
    worker_red_align = align;

  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  if (size + offset > worker_red_size)
    worker_red_size = size + offset;

  rtx addr = worker_red_sym;
  if (offset)
    {
      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
      addr = gen_rtx_CONST (Pmode, addr);
    }

  emit_move_insn (target, addr);

  return target;
}
/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}
/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MAX
};

static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];

/* Return the NVPTX builtin for CODE.  */

static tree
nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
  if (code >= NVPTX_BUILTIN_MAX)
    return error_mark_node;

  return nvptx_builtin_decls[code];
}
/* Set up all builtin functions for this target.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
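
/* These md builtins are primarily consumed by the OpenACC lowering
   code later in this file.  A direct call would look roughly like this
   (illustrative only; the last shuffle argument is an
   nvptx_shuffle_kind, and worker_addr takes offset, size and
   alignment):

     unsigned u = __builtin_nvptx_shuffle (v, 16, SHUFFLE_DOWN);
     void *p = __builtin_nvptx_worker_addr (0, 4, 4);
     unsigned old = __builtin_nvptx_cmp_swap (p, expected, desired);  */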
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_worker_addr (exp, target, mode, ignore);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}
/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */

static int
nvptx_simt_vf ()
{
  return PTX_VECTOR_LENGTH;
}
/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  If
   DECL is null, we are validating the default dimensions.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
  bool changed = false;

  /* The vector size must be 32, unless this is a SEQ routine.  */
  if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
      && dims[GOMP_DIM_VECTOR] >= 0
      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
    {
      if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
	warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		    dims[GOMP_DIM_VECTOR]
		    ? G_("using vector_length (%d), ignoring %d")
		    : G_("using vector_length (%d), ignoring runtime setting"),
		    PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      changed = true;
    }

  /* Check the num workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    {
      warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		  "using num_workers (%d), ignoring %d",
		  PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      changed = true;
    }

  if (!decl)
    {
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      if (dims[GOMP_DIM_WORKER] < 0)
	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
      if (dims[GOMP_DIM_GANG] < 0)
	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
      changed = true;
    }

  return changed;
}
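
/* For example, an OpenACC offload region requesting vector_length(64)
   and num_workers(64) is diagnosed by the code above and clamped to
   vector_length(32) and num_workers(32); when validating the default
   geometry (DECL is null), unspecified worker and gang sizes are
   deferred to the runtime via PTX_DEFAULT_RUNTIME_DIM.  */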
/* Return maximum dimension size, or zero for unbounded.  */

static int
nvptx_dim_limit (int axis)
{
  switch (axis)
    {
    case GOMP_DIM_VECTOR:
      return PTX_VECTOR_LENGTH;

    default:
      break;
    }
  return 0;
}
/* Determine whether fork & joins are needed.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
		       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}
/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  */

static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}
/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}
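
/* For instance, a _Complex float VAR is shuffled as two separate
   32-bit SHFL.DOWNs (real and imaginary parts, recombined with
   COMPLEX_EXPR), while a plain double is viewed as a 64-bit integer
   via VIEW_CONVERT_EXPR and goes through the shufflell builtin.  */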
/* Lazily generate the global lock var decl and return its address.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      tree type = build_qualified_type (unsigned_type_node,
					TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}
/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval (OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  post_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}
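
/* For example (a sketch of the GIMPLE shape built above, assuming a
   32-bit float reduction with OP == PLUS_EXPR; names are
   illustrative):

     init = VIEW_CONVERT_EXPR<unsigned> (0.0);
   loop:
     expect = PHI <init (pre), actual (loop)>;
     write = VIEW_CONVERT_EXPR<unsigned>
	       (VIEW_CONVERT_EXPR<float> (expect) + var);
     actual = __builtin_nvptx_cmp_swap (ptr, expect, write);
     if (actual != expect) goto loop;

   and the value returned to the caller is the float view of WRITE.  */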
/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  locked_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}
/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
			tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}
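
/* Thus a 32-bit type such as float or int and a 64-bit type such as
   double both take the lockless path, while e.g. a 16-byte
   _Complex double has no cmp&swap of its size and takes the
   global-lock path.  */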
/* NVPTX implementation of GOACC_REDUCTION_SETUP.  */

static void
nvptx_goacc_reduction_setup (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER)
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}
/* NVPTX implementation of GOACC_REDUCTION_INIT.  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize non-zero vector lanes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
      init_edge->probability = profile_probability::even ();

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
      nop_edge->probability = profile_probability::even ();

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}
/* NVPTX implementation of GOACC_REDUCTION_FINI.  */

static void
nvptx_goacc_reduction_fini (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree ref_to_res = gimple_call_arg (call, 1);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code op
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  gimple_seq seq = NULL;
  tree r = NULL_TREE;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
	 but that requires a method of emitting a unified jump at the
	 gimple level.  */
      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
	{
	  tree other_var = make_ssa_name (TREE_TYPE (var));
	  nvptx_generate_vector_shuffle (gimple_location (call),
					 other_var, var, shfl, &seq);

	  r = make_ssa_name (TREE_TYPE (var));
	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
					   var, other_var), &seq);
	  var = r;
	}
    }
  else
    {
      tree accum = NULL_TREE;

      if (level == GOMP_DIM_WORKER)
	{
	  /* Get reduction buffer address.  */
	  tree offset = gimple_call_arg (call, 5);
	  tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
	  tree ptr = make_ssa_name (TREE_TYPE (call));

	  gimplify_assign (ptr, call, &seq);
	  accum = ptr;
	}
      else if (integer_zerop (ref_to_res))
	r = var;
      else
	accum = ref_to_res;

      if (accum)
	{
	  /* UPDATE the accumulator.  */
	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
	  seq = NULL;
	  r = nvptx_reduction_update (gimple_location (call), &gsi,
				      accum, var, op);
	}
    }

  if (lhs)
    gimplify_assign (lhs, r, &seq);
  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
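
/* With PTX_VECTOR_LENGTH == 32, the vector case above emits a binary
   reduction tree over shuffle offsets 16, 8, 4, 2, 1, e.g.:

     other = shfl.down (var, 16); var = var OP other;
     other = shfl.down (var, 8);  var = var OP other;
     ...
     other = shfl.down (var, 1);  var = var OP other;

   after which lane 0 of the warp holds the combined value of all 32
   lanes.  */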
/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER)
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
/* NVPTX reduction expander.  */

static void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}
static bool
nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
			      rtx x ATTRIBUTE_UNUSED)
{
  return true;
}
static bool
nvptx_vector_mode_supported (machine_mode mode)
{
  return (mode == V2SImode
	  || mode == V2DImode);
}
/* Return the preferred mode for vectorizing scalar MODE.  */

static machine_mode
nvptx_preferred_simd_mode (scalar_mode mode)
{
  switch (mode)
    {
    case E_DImode:
      return V2DImode;
    case E_SImode:
      return V2SImode;

    default:
      return default_preferred_simd_mode (mode);
    }
}
static unsigned int
nvptx_data_alignment (const_tree type, unsigned int basic_align)
{
  if (TREE_CODE (type) == INTEGER_TYPE)
    {
      unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
      if (size == GET_MODE_SIZE (TImode))
	return GET_MODE_BITSIZE (maybe_split_mode (TImode));
    }

  return basic_align;
}
/* Implement TARGET_MODES_TIEABLE_P.  */

static bool
nvptx_modes_tieable_p (machine_mode, machine_mode)
{
  return false;
}

/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
nvptx_hard_regno_nregs (unsigned int, machine_mode)
{
  return 1;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
{
  return false;
}
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
    nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"