1 /* Target code for NVPTX.
2 Copyright (C) 2014-2018 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
26 #include "coretypes.h"
40 #include "diagnostic.h"
42 #include "insn-flags.h"
44 #include "insn-attr.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
58 #include "stor-layout.h"
60 #include "omp-general.h"
62 #include "gomp-constants.h"
64 #include "internal-fn.h"
65 #include "gimple-iterator.h"
66 #include "stringpool.h"
69 #include "tree-ssa-operands.h"
70 #include "tree-ssanames.h"
72 #include "tree-phinodes.h"
74 #include "fold-const.h"
77 /* This file should be included last. */
78 #include "target-def.h"
80 #define WORKAROUND_PTXJIT_BUG 1
/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  The area is
   stored in the machine-dependent bits; the 3-bit mask is wide enough
   for DATA_AREA_MAX distinct values.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  The accumulated text is flushed at end
   of compilation.  */
static std::stringstream func_decls;
105 struct declared_libfunc_hasher
: ggc_cache_ptr_hash
<rtx_def
>
107 static hashval_t
hash (rtx x
) { return htab_hash_pointer (x
); }
108 static bool equal (rtx a
, rtx b
) { return a
== b
; }
112 hash_table
<declared_libfunc_hasher
> *declared_libfuncs_htab
;
114 struct tree_hasher
: ggc_cache_ptr_hash
<tree_node
>
116 static hashval_t
hash (tree t
) { return htab_hash_pointer (t
); }
117 static bool equal (tree a
, tree b
) { return a
== b
; }
120 static GTY((cache
)) hash_table
<tree_hasher
> *declared_fndecls_htab
;
121 static GTY((cache
)) hash_table
<tree_hasher
> *needed_fndecls_htab
;
123 /* Buffer needed to broadcast across workers. This is used for both
124 worker-neutering and worker broadcasting. It is shared by all
125 functions emitted. The buffer is placed in shared memory. It'd be
126 nice if PTX supported common blocks, because then this could be
127 shared across TUs (taking the largest size). */
128 static unsigned worker_bcast_size
;
129 static unsigned worker_bcast_align
;
130 static GTY(()) rtx worker_bcast_sym
;
132 /* Buffer needed for worker reductions. This has to be distinct from
133 the worker broadcast array, as both may be live concurrently. */
134 static unsigned worker_red_size
;
135 static unsigned worker_red_align
;
136 static GTY(()) rtx worker_red_sym
;
138 /* Global lock variable, needed for 128bit worker & gang reductions. */
139 static GTY(()) tree global_lock_var
;
141 /* True if any function references __nvptx_stacks. */
142 static bool need_softstack_decl
;
144 /* True if any function references __nvptx_uni. */
145 static bool need_unisimt_decl
;
147 /* Allocate a new, cleared machine_function structure. */
149 static struct machine_function
*
150 nvptx_init_machine_status (void)
152 struct machine_function
*p
= ggc_cleared_alloc
<machine_function
> ();
153 p
->return_mode
= VOIDmode
;
157 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
158 and -fopenacc is also enabled. */
161 diagnose_openacc_conflict (bool optval
, const char *optname
)
163 if (flag_openacc
&& optval
)
164 error ("option %s is not supported together with -fopenacc", optname
);
167 /* Implement TARGET_OPTION_OVERRIDE. */
170 nvptx_option_override (void)
172 init_machine_status
= nvptx_init_machine_status
;
174 /* Set toplevel_reorder, unless explicitly disabled. We need
175 reordering so that we emit necessary assembler decls of
176 undeclared variables. */
177 if (!global_options_set
.x_flag_toplevel_reorder
)
178 flag_toplevel_reorder
= 1;
180 debug_nonbind_markers_p
= 0;
182 /* Set flag_no_common, unless explicitly disabled. We fake common
183 using .weak, and that's not entirely accurate, so avoid it
185 if (!global_options_set
.x_flag_no_common
)
188 /* The patch area requires nops, which we don't have. */
189 if (function_entry_patch_area_size
> 0)
190 sorry ("not generating patch area, nops not supported");
192 /* Assumes that it will see only hard registers. */
193 flag_var_tracking
= 0;
195 if (nvptx_optimize
< 0)
196 nvptx_optimize
= optimize
> 0;
198 declared_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
199 needed_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
200 declared_libfuncs_htab
201 = hash_table
<declared_libfunc_hasher
>::create_ggc (17);
203 worker_bcast_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__worker_bcast");
204 SET_SYMBOL_DATA_AREA (worker_bcast_sym
, DATA_AREA_SHARED
);
205 worker_bcast_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
207 worker_red_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__worker_red");
208 SET_SYMBOL_DATA_AREA (worker_red_sym
, DATA_AREA_SHARED
);
209 worker_red_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
211 diagnose_openacc_conflict (TARGET_GOMP
, "-mgomp");
212 diagnose_openacc_conflict (TARGET_SOFT_STACK
, "-msoft-stack");
213 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT
, "-muniform-simt");
216 target_flags
|= MASK_SOFT_STACK
| MASK_UNIFORM_SIMT
;
219 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
220 deal with ptx ideosyncracies. */
223 nvptx_ptx_type_from_mode (machine_mode mode
, bool promote
)
258 /* Encode the PTX data area that DECL (which might not actually be a
259 _DECL) should reside in. */
262 nvptx_encode_section_info (tree decl
, rtx rtl
, int first
)
264 default_encode_section_info (decl
, rtl
, first
);
265 if (first
&& MEM_P (rtl
))
267 nvptx_data_area area
= DATA_AREA_GENERIC
;
269 if (TREE_CONSTANT (decl
))
270 area
= DATA_AREA_CONST
;
271 else if (TREE_CODE (decl
) == VAR_DECL
)
273 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl
)))
275 area
= DATA_AREA_SHARED
;
276 if (DECL_INITIAL (decl
))
277 error ("static initialization of variable %q+D in %<.shared%>"
278 " memory is not supported", decl
);
281 area
= TREE_READONLY (decl
) ? DATA_AREA_CONST
: DATA_AREA_GLOBAL
;
284 SET_SYMBOL_DATA_AREA (XEXP (rtl
, 0), area
);
288 /* Return the PTX name of the data area in which SYM should be
289 placed. The symbol must have already been processed by
290 nvptx_encode_seciton_info, or equivalent. */
293 section_for_sym (rtx sym
)
295 nvptx_data_area area
= SYMBOL_DATA_AREA (sym
);
296 /* Same order as nvptx_data_area enum. */
297 static char const *const areas
[] =
298 {"", ".global", ".shared", ".local", ".const", ".param"};
303 /* Similarly for a decl. */
306 section_for_decl (const_tree decl
)
308 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree
, decl
)), 0));
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  Any other NAME is returned unchanged.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}
332 /* If MODE should be treated as two registers of an inner mode, return
333 that inner mode. Otherwise return VOIDmode. */
336 maybe_split_mode (machine_mode mode
)
338 if (COMPLEX_MODE_P (mode
))
339 return GET_MODE_INNER (mode
);
347 /* Return true if mode should be treated as two registers. */
350 split_mode_p (machine_mode mode
)
352 return maybe_split_mode (mode
) != VOIDmode
;
355 /* Output a register, subreg, or register pair (with optional
356 enclosing braces). */
359 output_reg (FILE *file
, unsigned regno
, machine_mode inner_mode
,
360 int subreg_offset
= -1)
362 if (inner_mode
== VOIDmode
)
364 if (HARD_REGISTER_NUM_P (regno
))
365 fprintf (file
, "%s", reg_names
[regno
]);
367 fprintf (file
, "%%r%d", regno
);
369 else if (subreg_offset
>= 0)
371 output_reg (file
, regno
, VOIDmode
);
372 fprintf (file
, "$%d", subreg_offset
);
376 if (subreg_offset
== -1)
378 output_reg (file
, regno
, inner_mode
, GET_MODE_SIZE (inner_mode
));
380 output_reg (file
, regno
, inner_mode
, 0);
381 if (subreg_offset
== -1)
386 /* Emit forking instructions for MASK. */
389 nvptx_emit_forking (unsigned mask
, bool is_call
)
391 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
392 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
395 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
397 /* Emit fork at all levels. This helps form SESE regions, as
398 it creates a block with a single successor before entering a
399 partitooned region. That is a good candidate for the end of
402 emit_insn (gen_nvptx_fork (op
));
403 emit_insn (gen_nvptx_forked (op
));
407 /* Emit joining instructions for MASK. */
410 nvptx_emit_joining (unsigned mask
, bool is_call
)
412 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
413 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
416 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
418 /* Emit joining for all non-call pars to ensure there's a single
419 predecessor for the block the join insn ends up in. This is
420 needed for skipping entire loops. */
422 emit_insn (gen_nvptx_joining (op
));
423 emit_insn (gen_nvptx_join (op
));
428 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
429 returned in memory. Integer and floating types supported by the
430 machine are passed in registers, everything else is passed in
431 memory. Complex types are split. */
434 pass_in_memory (machine_mode mode
, const_tree type
, bool for_return
)
438 if (AGGREGATE_TYPE_P (type
))
440 if (TREE_CODE (type
) == VECTOR_TYPE
)
444 if (!for_return
&& COMPLEX_MODE_P (mode
))
445 /* Complex types are passed as two underlying args. */
446 mode
= GET_MODE_INNER (mode
);
448 if (GET_MODE_CLASS (mode
) != MODE_INT
449 && GET_MODE_CLASS (mode
) != MODE_FLOAT
)
452 if (GET_MODE_SIZE (mode
) > UNITS_PER_WORD
)
458 /* A non-memory argument of mode MODE is being passed, determine the mode it
459 should be promoted to. This is also used for determining return
463 promote_arg (machine_mode mode
, bool prototyped
)
465 if (!prototyped
&& mode
== SFmode
)
466 /* K&R float promotion for unprototyped functions. */
468 else if (GET_MODE_SIZE (mode
) < GET_MODE_SIZE (SImode
))
474 /* A non-memory return type of MODE is being returned. Determine the
475 mode it should be promoted to. */
478 promote_return (machine_mode mode
)
480 return promote_arg (mode
, true);
483 /* Implement TARGET_FUNCTION_ARG. */
486 nvptx_function_arg (cumulative_args_t
ARG_UNUSED (cum_v
), machine_mode mode
,
487 const_tree
, bool named
)
489 if (mode
== VOIDmode
|| !named
)
492 return gen_reg_rtx (mode
);
495 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
498 nvptx_function_incoming_arg (cumulative_args_t cum_v
, machine_mode mode
,
499 const_tree
, bool named
)
501 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
503 if (mode
== VOIDmode
|| !named
)
506 /* No need to deal with split modes here, the only case that can
507 happen is complex modes and those are dealt with by
508 TARGET_SPLIT_COMPLEX_ARG. */
509 return gen_rtx_UNSPEC (mode
,
510 gen_rtvec (1, GEN_INT (cum
->count
)),
514 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
517 nvptx_function_arg_advance (cumulative_args_t cum_v
,
518 machine_mode
ARG_UNUSED (mode
),
519 const_tree
ARG_UNUSED (type
),
520 bool ARG_UNUSED (named
))
522 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
527 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
529 For nvptx This is only used for varadic args. The type has already
530 been promoted and/or converted to invisible reference. */
533 nvptx_function_arg_boundary (machine_mode mode
, const_tree
ARG_UNUSED (type
))
535 return GET_MODE_ALIGNMENT (mode
);
538 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
540 For nvptx, we know how to handle functions declared as stdarg: by
541 passing an extra pointer to the unnamed arguments. However, the
542 Fortran frontend can produce a different situation, where a
543 function pointer is declared with no arguments, but the actual
544 function and calls to it take more arguments. In that case, we
545 want to ensure the call matches the definition of the function. */
548 nvptx_strict_argument_naming (cumulative_args_t cum_v
)
550 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
552 return cum
->fntype
== NULL_TREE
|| stdarg_p (cum
->fntype
);
555 /* Implement TARGET_LIBCALL_VALUE. */
558 nvptx_libcall_value (machine_mode mode
, const_rtx
)
560 if (!cfun
|| !cfun
->machine
->doing_call
)
561 /* Pretend to return in a hard reg for early uses before pseudos can be
563 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
565 return gen_reg_rtx (mode
);
568 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
569 where function FUNC returns or receives a value of data type TYPE. */
572 nvptx_function_value (const_tree type
, const_tree
ARG_UNUSED (func
),
575 machine_mode mode
= promote_return (TYPE_MODE (type
));
580 cfun
->machine
->return_mode
= mode
;
581 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
584 return nvptx_libcall_value (mode
, NULL_RTX
);
587 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
590 nvptx_function_value_regno_p (const unsigned int regno
)
592 return regno
== NVPTX_RETURN_REGNUM
;
595 /* Types with a mode other than those supported by the machine are passed by
596 reference in memory. */
599 nvptx_pass_by_reference (cumulative_args_t
ARG_UNUSED (cum
),
600 machine_mode mode
, const_tree type
,
601 bool ARG_UNUSED (named
))
603 return pass_in_memory (mode
, type
, false);
606 /* Implement TARGET_RETURN_IN_MEMORY. */
609 nvptx_return_in_memory (const_tree type
, const_tree
)
611 return pass_in_memory (TYPE_MODE (type
), type
, true);
614 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
617 nvptx_promote_function_mode (const_tree type
, machine_mode mode
,
618 int *ARG_UNUSED (punsignedp
),
619 const_tree funtype
, int for_return
)
621 return promote_arg (mode
, for_return
|| !type
|| TYPE_ARG_TYPES (funtype
));
624 /* Helper for write_arg. Emit a single PTX argument of MODE, either
625 in a prototype, or as copy in a function prologue. ARGNO is the
626 index of this argument in the PTX function. FOR_REG is negative,
627 if we're emitting the PTX prototype. It is zero if we're copying
628 to an argument register and it is greater than zero if we're
629 copying to a specific hard register. */
632 write_arg_mode (std::stringstream
&s
, int for_reg
, int argno
,
635 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
639 /* Writing PTX prototype. */
640 s
<< (argno
? ", " : " (");
641 s
<< ".param" << ptx_type
<< " %in_ar" << argno
;
645 s
<< "\t.reg" << ptx_type
<< " ";
647 s
<< reg_names
[for_reg
];
653 s
<< "\tld.param" << ptx_type
<< " ";
655 s
<< reg_names
[for_reg
];
658 s
<< ", [%in_ar" << argno
<< "];\n";
664 /* Process function parameter TYPE to emit one or more PTX
665 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
666 is true, if this is a prototyped function, rather than an old-style
667 C declaration. Returns the next argument number to use.
669 The promotion behavior here must match the regular GCC function
670 parameter marshalling machinery. */
673 write_arg_type (std::stringstream
&s
, int for_reg
, int argno
,
674 tree type
, bool prototyped
)
676 machine_mode mode
= TYPE_MODE (type
);
678 if (mode
== VOIDmode
)
681 if (pass_in_memory (mode
, type
, false))
685 bool split
= TREE_CODE (type
) == COMPLEX_TYPE
;
689 /* Complex types are sent as two separate args. */
690 type
= TREE_TYPE (type
);
691 mode
= TYPE_MODE (type
);
695 mode
= promote_arg (mode
, prototyped
);
697 argno
= write_arg_mode (s
, for_reg
, argno
, mode
);
700 return write_arg_mode (s
, for_reg
, argno
, mode
);
703 /* Emit a PTX return as a prototype or function prologue declaration
707 write_return_mode (std::stringstream
&s
, bool for_proto
, machine_mode mode
)
709 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
710 const char *pfx
= "\t.reg";
711 const char *sfx
= ";\n";
714 pfx
= "(.param", sfx
= "_out) ";
716 s
<< pfx
<< ptx_type
<< " " << reg_names
[NVPTX_RETURN_REGNUM
] << sfx
;
719 /* Process a function return TYPE to emit a PTX return as a prototype
720 or function prologue declaration. Returns true if return is via an
721 additional pointer parameter. The promotion behavior here must
722 match the regular GCC function return mashalling. */
725 write_return_type (std::stringstream
&s
, bool for_proto
, tree type
)
727 machine_mode mode
= TYPE_MODE (type
);
729 if (mode
== VOIDmode
)
732 bool return_in_mem
= pass_in_memory (mode
, type
, true);
737 return return_in_mem
;
739 /* Named return values can cause us to return a pointer as well
740 as expect an argument for the return location. This is
741 optimization-level specific, so no caller can make use of
742 this data, but more importantly for us, we must ensure it
743 doesn't change the PTX prototype. */
744 mode
= (machine_mode
) cfun
->machine
->return_mode
;
746 if (mode
== VOIDmode
)
747 return return_in_mem
;
749 /* Clear return_mode to inhibit copy of retval to non-existent
751 cfun
->machine
->return_mode
= VOIDmode
;
754 mode
= promote_return (mode
);
756 write_return_mode (s
, for_proto
, mode
);
758 return return_in_mem
;
761 /* Look for attributes in ATTRS that would indicate we must write a function
762 as a .entry kernel rather than a .func. Return true if one is found. */
765 write_as_kernel (tree attrs
)
767 return (lookup_attribute ("kernel", attrs
) != NULL_TREE
768 || (lookup_attribute ("omp target entrypoint", attrs
) != NULL_TREE
769 && lookup_attribute ("oacc function", attrs
) != NULL_TREE
));
770 /* For OpenMP target regions, the corresponding kernel entry is emitted from
771 write_omp_entry as a separate function. */
/* Emit a linker marker for a function decl or defn.  NOTE(review):
   the surrounding stream operations were lost in extraction;
   reconstructed from the upstream sources -- confirm.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}
787 /* Emit a linker marker for a variable decl or defn. */
790 write_var_marker (FILE *file
, bool is_defn
, bool globalize
, const char *name
)
792 fprintf (file
, "\n// BEGIN%s VAR %s: ",
793 globalize
? " GLOBAL" : "",
794 is_defn
? "DEF" : "DECL");
795 assemble_name_raw (file
, name
);
799 /* Write a .func or .kernel declaration or definition along with
800 a helper comment for use by ld. S is the stream to write to, DECL
801 the decl for the function with name NAME. For definitions, emit
802 a declaration too. */
805 write_fn_proto (std::stringstream
&s
, bool is_defn
,
806 const char *name
, const_tree decl
)
809 /* Emit a declaration. The PTX assembler gets upset without it. */
810 name
= write_fn_proto (s
, false, name
, decl
);
813 /* Avoid repeating the name replacement. */
814 name
= nvptx_name_replacement (name
);
819 write_fn_marker (s
, is_defn
, TREE_PUBLIC (decl
), name
);
821 /* PTX declaration. */
822 if (DECL_EXTERNAL (decl
))
824 else if (TREE_PUBLIC (decl
))
825 s
<< (DECL_WEAK (decl
) ? ".weak " : ".visible ");
826 s
<< (write_as_kernel (DECL_ATTRIBUTES (decl
)) ? ".entry " : ".func ");
828 tree fntype
= TREE_TYPE (decl
);
829 tree result_type
= TREE_TYPE (fntype
);
831 /* atomic_compare_exchange_$n builtins have an exceptional calling
833 int not_atomic_weak_arg
= -1;
834 if (DECL_BUILT_IN_CLASS (decl
) == BUILT_IN_NORMAL
)
835 switch (DECL_FUNCTION_CODE (decl
))
837 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1
:
838 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2
:
839 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4
:
840 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8
:
841 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16
:
842 /* These atomics skip the 'weak' parm in an actual library
843 call. We must skip it in the prototype too. */
844 not_atomic_weak_arg
= 3;
851 /* Declare the result. */
852 bool return_in_mem
= write_return_type (s
, true, result_type
);
858 /* Emit argument list. */
860 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
863 NULL in TYPE_ARG_TYPES, for old-style functions
864 NULL in DECL_ARGUMENTS, for builtin functions without another
866 So we have to pick the best one we have. */
867 tree args
= TYPE_ARG_TYPES (fntype
);
868 bool prototyped
= true;
871 args
= DECL_ARGUMENTS (decl
);
875 for (; args
; args
= TREE_CHAIN (args
), not_atomic_weak_arg
--)
877 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
879 if (not_atomic_weak_arg
)
880 argno
= write_arg_type (s
, -1, argno
, type
, prototyped
);
882 gcc_assert (type
== boolean_type_node
);
885 if (stdarg_p (fntype
))
886 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
888 if (DECL_STATIC_CHAIN (decl
))
889 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
891 if (!argno
&& strcmp (name
, "main") == 0)
893 argno
= write_arg_type (s
, -1, argno
, integer_type_node
, true);
894 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
900 s
<< (is_defn
? "\n" : ";\n");
905 /* Construct a function declaration from a call insn. This can be
906 necessary for two reasons - either we have an indirect call which
907 requires a .callprototype declaration, or we have a libcall
908 generated by emit_library_call for which no decl exists. */
911 write_fn_proto_from_insn (std::stringstream
&s
, const char *name
,
916 s
<< "\t.callprototype ";
921 name
= nvptx_name_replacement (name
);
922 write_fn_marker (s
, false, true, name
);
923 s
<< "\t.extern .func ";
926 if (result
!= NULL_RTX
)
927 write_return_mode (s
, true, GET_MODE (result
));
931 int arg_end
= XVECLEN (pat
, 0);
932 for (int i
= 1; i
< arg_end
; i
++)
934 /* We don't have to deal with mode splitting & promotion here,
935 as that was already done when generating the call
937 machine_mode mode
= GET_MODE (XEXP (XVECEXP (pat
, 0, i
), 0));
939 write_arg_mode (s
, -1, i
- 1, mode
);
946 /* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
947 table and and write a ptx prototype. These are emitted at end of
951 nvptx_record_fndecl (tree decl
)
953 tree
*slot
= declared_fndecls_htab
->find_slot (decl
, INSERT
);
957 const char *name
= get_fnname_from_decl (decl
);
958 write_fn_proto (func_decls
, false, name
, decl
);
962 /* Record a libcall or unprototyped external function. CALLEE is the
963 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
964 declaration for it. */
967 nvptx_record_libfunc (rtx callee
, rtx retval
, rtx pat
)
969 rtx
*slot
= declared_libfuncs_htab
->find_slot (callee
, INSERT
);
974 const char *name
= XSTR (callee
, 0);
975 write_fn_proto_from_insn (func_decls
, name
, retval
, pat
);
979 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
980 is prototyped, record it now. Otherwise record it as needed at end
981 of compilation, when we might have more information about it. */
984 nvptx_record_needed_fndecl (tree decl
)
986 if (TYPE_ARG_TYPES (TREE_TYPE (decl
)) == NULL_TREE
)
988 tree
*slot
= needed_fndecls_htab
->find_slot (decl
, INSERT
);
993 nvptx_record_fndecl (decl
);
996 /* SYM is a SYMBOL_REF. If it refers to an external function, record
1000 nvptx_maybe_record_fnsym (rtx sym
)
1002 tree decl
= SYMBOL_REF_DECL (sym
);
1004 if (decl
&& TREE_CODE (decl
) == FUNCTION_DECL
&& DECL_EXTERNAL (decl
))
1005 nvptx_record_needed_fndecl (decl
);
1008 /* Emit a local array to hold some part of a conventional stack frame
1009 and initialize REGNO to point to it. If the size is zero, it'll
1010 never be valid to dereference, so we can simply initialize to
1014 init_frame (FILE *file
, int regno
, unsigned align
, unsigned size
)
1017 fprintf (file
, "\t.local .align %d .b8 %s_ar[%u];\n",
1018 align
, reg_names
[regno
], size
);
1019 fprintf (file
, "\t.reg.u%d %s;\n",
1020 POINTER_SIZE
, reg_names
[regno
]);
1021 fprintf (file
, (size
? "\tcvta.local.u%d %s, %s_ar;\n"
1022 : "\tmov.u%d %s, 0;\n"),
1023 POINTER_SIZE
, reg_names
[regno
], reg_names
[regno
]);
1026 /* Emit soft stack frame setup sequence. */
1029 init_softstack_frame (FILE *file
, unsigned alignment
, HOST_WIDE_INT size
)
1031 /* Maintain 64-bit stack alignment. */
1032 unsigned keep_align
= BIGGEST_ALIGNMENT
/ BITS_PER_UNIT
;
1033 size
= ROUND_UP (size
, keep_align
);
1034 int bits
= POINTER_SIZE
;
1035 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
1036 const char *reg_frame
= reg_names
[FRAME_POINTER_REGNUM
];
1037 const char *reg_sspslot
= reg_names
[SOFTSTACK_SLOT_REGNUM
];
1038 const char *reg_sspprev
= reg_names
[SOFTSTACK_PREV_REGNUM
];
1039 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_stack
);
1040 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_frame
);
1041 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspslot
);
1042 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspprev
);
1043 fprintf (file
, "\t{\n");
1044 fprintf (file
, "\t\t.reg.u32 %%fstmp0;\n");
1045 fprintf (file
, "\t\t.reg.u%d %%fstmp1;\n", bits
);
1046 fprintf (file
, "\t\t.reg.u%d %%fstmp2;\n", bits
);
1047 fprintf (file
, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1048 fprintf (file
, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1049 bits
== 64 ? ".wide" : ".lo", bits
/ 8);
1050 fprintf (file
, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits
);
1052 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1053 fprintf (file
, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits
, reg_sspslot
);
1055 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1056 fprintf (file
, "\t\tld.shared.u%d %s, [%s];\n",
1057 bits
, reg_sspprev
, reg_sspslot
);
1059 /* Initialize %frame = %sspprev - size. */
1060 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1061 bits
, reg_frame
, reg_sspprev
, size
);
1063 /* Apply alignment, if larger than 64. */
1064 if (alignment
> keep_align
)
1065 fprintf (file
, "\t\tand.b%d %s, %s, %d;\n",
1066 bits
, reg_frame
, reg_frame
, -alignment
);
1068 size
= crtl
->outgoing_args_size
;
1069 gcc_assert (size
% keep_align
== 0);
1071 /* Initialize %stack. */
1072 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1073 bits
, reg_stack
, reg_frame
, size
);
1076 fprintf (file
, "\t\tst.shared.u%d [%s], %s;\n",
1077 bits
, reg_sspslot
, reg_stack
);
1078 fprintf (file
, "\t}\n");
1079 cfun
->machine
->has_softstack
= true;
1080 need_softstack_decl
= true;
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
1096 /* Emit code to initialize predicate and master lane index registers for
1097 -muniform-simt code generation variant. */
1100 nvptx_init_unisimt_predicate (FILE *file
)
1102 cfun
->machine
->unisimt_location
= gen_reg_rtx (Pmode
);
1103 int loc
= REGNO (cfun
->machine
->unisimt_location
);
1104 int bits
= POINTER_SIZE
;
1105 fprintf (file
, "\t.reg.u%d %%r%d;\n", bits
, loc
);
1106 fprintf (file
, "\t{\n");
1107 fprintf (file
, "\t\t.reg.u32 %%ustmp0;\n");
1108 fprintf (file
, "\t\t.reg.u%d %%ustmp1;\n", bits
);
1109 fprintf (file
, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1110 fprintf (file
, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1111 bits
== 64 ? ".wide" : ".lo");
1112 fprintf (file
, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits
, loc
);
1113 fprintf (file
, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits
, loc
, loc
);
1114 if (cfun
->machine
->unisimt_predicate
)
1116 int master
= REGNO (cfun
->machine
->unisimt_master
);
1117 int pred
= REGNO (cfun
->machine
->unisimt_predicate
);
1118 fprintf (file
, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master
, loc
);
1119 fprintf (file
, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1120 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1121 fprintf (file
, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master
, master
);
1122 /* Compute predicate as 'tid.x == master'. */
1123 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred
, master
);
1125 fprintf (file
, "\t}\n");
1126 need_unisimt_decl
= true;
1129 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1131 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1132 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1134 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1135 __nvptx_uni[tid.y] = 0;
1136 gomp_nvptx_main (ORIG, arg);
1138 ORIG itself should not be emitted as a PTX .entry function. */
1141 write_omp_entry (FILE *file
, const char *name
, const char *orig
)
1143 static bool gomp_nvptx_main_declared
;
1144 if (!gomp_nvptx_main_declared
)
1146 gomp_nvptx_main_declared
= true;
1147 write_fn_marker (func_decls
, false, true, "gomp_nvptx_main");
1148 func_decls
<< ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1149 << " %in_ar1, .param.u" << POINTER_SIZE
<< " %in_ar2);\n";
1151 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1152 #define NTID_Y "%ntid.y"
1153 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1154 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1157 .reg.u" PS " %R<4>;\n\
1158 mov.u32 %r0, %tid.y;\n\
1159 mov.u32 %r1, " NTID_Y ";\n\
1160 mov.u32 %r2, %ctaid.x;\n\
1161 cvt.u" PS ".u32 %R1, %r0;\n\
1162 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1163 mov.u" PS " %R0, __nvptx_stacks;\n\
1164 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1165 ld.param.u" PS " %R2, [%stack];\n\
1166 ld.param.u" PS " %R3, [%sz];\n\
1167 add.u" PS " %R2, %R2, %R3;\n\
1168 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1169 st.shared.u" PS " [%R0], %R2;\n\
1170 mov.u" PS " %R0, __nvptx_uni;\n\
1171 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1173 st.shared.u32 [%R0], %r0;\n\
1174 mov.u" PS " %R0, \0;\n\
1175 ld.param.u" PS " %R1, [%arg];\n\
1177 .param.u" PS " %P<2>;\n\
1178 st.param.u" PS " [%P0], %R0;\n\
1179 st.param.u" PS " [%P1], %R1;\n\
1180 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1184 static const char entry64
[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1185 static const char entry32
[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1186 #undef ENTRY_TEMPLATE
1188 const char *entry_1
= TARGET_ABI64
? entry64
: entry32
;
1189 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1190 const char *entry_2
= entry_1
+ strlen (entry64
) + 1;
1191 fprintf (file
, ".visible .entry %s%s%s%s", name
, entry_1
, orig
, entry_2
);
1192 need_softstack_decl
= need_unisimt_decl
= true;
1195 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1196 function, including local var decls and copies from the arguments to
1200 nvptx_declare_function_name (FILE *file
, const char *name
, const_tree decl
)
1202 tree fntype
= TREE_TYPE (decl
);
1203 tree result_type
= TREE_TYPE (fntype
);
1206 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl
))
1207 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl
)))
1209 char *buf
= (char *) alloca (strlen (name
) + sizeof ("$impl"));
1210 sprintf (buf
, "%s$impl", name
);
1211 write_omp_entry (file
, name
, buf
);
1214 /* We construct the initial part of the function into a string
1215 stream, in order to share the prototype writing code. */
1216 std::stringstream s
;
1217 write_fn_proto (s
, true, name
, decl
);
1220 bool return_in_mem
= write_return_type (s
, false, result_type
);
1222 argno
= write_arg_type (s
, 0, argno
, ptr_type_node
, true);
1224 /* Declare and initialize incoming arguments. */
1225 tree args
= TYPE_ARG_TYPES (fntype
);
1226 bool prototyped
= true;
1229 args
= DECL_ARGUMENTS (decl
);
1233 for (; args
!= NULL_TREE
; args
= TREE_CHAIN (args
))
1235 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
1237 argno
= write_arg_type (s
, 0, argno
, type
, prototyped
);
1240 if (stdarg_p (fntype
))
1241 argno
= write_arg_type (s
, ARG_POINTER_REGNUM
, argno
, ptr_type_node
,
1244 if (DECL_STATIC_CHAIN (decl
) || cfun
->machine
->has_chain
)
1245 write_arg_type (s
, STATIC_CHAIN_REGNUM
,
1246 DECL_STATIC_CHAIN (decl
) ? argno
: -1, ptr_type_node
,
1249 fprintf (file
, "%s", s
.str().c_str());
1251 /* Usually 'crtl->is_leaf' is computed during register allocator
1252 initialization (which is not done on NVPTX) or for pressure-sensitive
1253 optimizations. Initialize it here, except if already set. */
1255 crtl
->is_leaf
= leaf_function_p ();
1257 HOST_WIDE_INT sz
= get_frame_size ();
1258 bool need_frameptr
= sz
|| cfun
->machine
->has_chain
;
1259 int alignment
= crtl
->stack_alignment_needed
/ BITS_PER_UNIT
;
1260 if (!TARGET_SOFT_STACK
)
1262 /* Declare a local var for outgoing varargs. */
1263 if (cfun
->machine
->has_varadic
)
1264 init_frame (file
, STACK_POINTER_REGNUM
,
1265 UNITS_PER_WORD
, crtl
->outgoing_args_size
);
1267 /* Declare a local variable for the frame. Force its size to be
1268 DImode-compatible. */
1270 init_frame (file
, FRAME_POINTER_REGNUM
, alignment
,
1271 ROUND_UP (sz
, GET_MODE_SIZE (DImode
)));
1273 else if (need_frameptr
|| cfun
->machine
->has_varadic
|| cfun
->calls_alloca
1274 || (cfun
->machine
->has_simtreg
&& !crtl
->is_leaf
))
1275 init_softstack_frame (file
, alignment
, sz
);
1277 if (cfun
->machine
->has_simtreg
)
1279 unsigned HOST_WIDE_INT
&simtsz
= cfun
->machine
->simt_stack_size
;
1280 unsigned HOST_WIDE_INT
&align
= cfun
->machine
->simt_stack_align
;
1281 align
= MAX (align
, GET_MODE_SIZE (DImode
));
1282 if (!crtl
->is_leaf
|| cfun
->calls_alloca
)
1283 simtsz
= HOST_WIDE_INT_M1U
;
1284 if (simtsz
== HOST_WIDE_INT_M1U
)
1285 simtsz
= nvptx_softstack_size
;
1286 if (cfun
->machine
->has_softstack
)
1287 simtsz
+= POINTER_SIZE
/ 8;
1288 simtsz
= ROUND_UP (simtsz
, GET_MODE_SIZE (DImode
));
1289 if (align
> GET_MODE_SIZE (DImode
))
1290 simtsz
+= align
- GET_MODE_SIZE (DImode
);
1292 fprintf (file
, "\t.local.align 8 .b8 %%simtstack_ar["
1293 HOST_WIDE_INT_PRINT_DEC
"];\n", simtsz
);
1295 /* Declare the pseudos we have as ptx registers. */
1296 int maxregs
= max_reg_num ();
1297 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< maxregs
; i
++)
1299 if (regno_reg_rtx
[i
] != const0_rtx
)
1301 machine_mode mode
= PSEUDO_REGNO_MODE (i
);
1302 machine_mode split
= maybe_split_mode (mode
);
1304 if (split_mode_p (mode
))
1306 fprintf (file
, "\t.reg%s ", nvptx_ptx_type_from_mode (mode
, true));
1307 output_reg (file
, i
, split
, -2);
1308 fprintf (file
, ";\n");
1312 /* Emit axis predicates. */
1313 if (cfun
->machine
->axis_predicate
[0])
1314 nvptx_init_axis_predicate (file
,
1315 REGNO (cfun
->machine
->axis_predicate
[0]), "y");
1316 if (cfun
->machine
->axis_predicate
[1])
1317 nvptx_init_axis_predicate (file
,
1318 REGNO (cfun
->machine
->axis_predicate
[1]), "x");
1319 if (cfun
->machine
->unisimt_predicate
1320 || (cfun
->machine
->has_simtreg
&& !crtl
->is_leaf
))
1321 nvptx_init_unisimt_predicate (file
);
1324 /* Output code for switching uniform-simt state. ENTERING indicates whether
1325 we are entering or leaving non-uniform execution region. */
1328 nvptx_output_unisimt_switch (FILE *file
, bool entering
)
1330 if (crtl
->is_leaf
&& !cfun
->machine
->unisimt_predicate
)
1332 fprintf (file
, "\t{\n");
1333 fprintf (file
, "\t\t.reg.u32 %%ustmp2;\n");
1334 fprintf (file
, "\t\tmov.u32 %%ustmp2, %d;\n", entering
? -1 : 0);
1337 int loc
= REGNO (cfun
->machine
->unisimt_location
);
1338 fprintf (file
, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc
);
1340 if (cfun
->machine
->unisimt_predicate
)
1342 int master
= REGNO (cfun
->machine
->unisimt_master
);
1343 int pred
= REGNO (cfun
->machine
->unisimt_predicate
);
1344 fprintf (file
, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1345 fprintf (file
, "\t\tmov.u32 %%r%d, %s;\n",
1346 master
, entering
? "%ustmp2" : "0");
1347 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred
, master
);
1349 fprintf (file
, "\t}\n");
1352 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1353 ENTERING indicates whether we are entering or leaving non-uniform execution.
1354 PTR is the register pointing to allocated storage, it is assigned to on
1355 entering and used to restore state on leaving. SIZE and ALIGN are used only
1359 nvptx_output_softstack_switch (FILE *file
, bool entering
,
1360 rtx ptr
, rtx size
, rtx align
)
1362 gcc_assert (REG_P (ptr
) && !HARD_REGISTER_P (ptr
));
1363 if (crtl
->is_leaf
&& !cfun
->machine
->simt_stack_size
)
1365 int bits
= POINTER_SIZE
, regno
= REGNO (ptr
);
1366 fprintf (file
, "\t{\n");
1369 fprintf (file
, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1370 HOST_WIDE_INT_PRINT_DEC
";\n", bits
, regno
,
1371 cfun
->machine
->simt_stack_size
);
1372 fprintf (file
, "\t\tsub.u%d %%r%d, %%r%d, ", bits
, regno
, regno
);
1373 if (CONST_INT_P (size
))
1374 fprintf (file
, HOST_WIDE_INT_PRINT_DEC
,
1375 ROUND_UP (UINTVAL (size
), GET_MODE_SIZE (DImode
)));
1377 output_reg (file
, REGNO (size
), VOIDmode
);
1378 fputs (";\n", file
);
1379 if (!CONST_INT_P (size
) || UINTVAL (align
) > GET_MODE_SIZE (DImode
))
1381 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC
";\n",
1382 bits
, regno
, regno
, UINTVAL (align
));
1384 if (cfun
->machine
->has_softstack
)
1386 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
1389 fprintf (file
, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1390 bits
, regno
, bits
/ 8, reg_stack
);
1391 fprintf (file
, "\t\tsub.u%d %s, %%r%d, %d;\n",
1392 bits
, reg_stack
, regno
, bits
/ 8);
1396 fprintf (file
, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1397 bits
, reg_stack
, regno
, bits
/ 8);
1399 nvptx_output_set_softstack (REGNO (stack_pointer_rtx
));
1401 fprintf (file
, "\t}\n");
1404 /* Output code to enter non-uniform execution region. DEST is a register
1405 to hold a per-lane allocation given by SIZE and ALIGN. */
1408 nvptx_output_simt_enter (rtx dest
, rtx size
, rtx align
)
1410 nvptx_output_unisimt_switch (asm_out_file
, true);
1411 nvptx_output_softstack_switch (asm_out_file
, true, dest
, size
, align
);
1415 /* Output code to leave non-uniform execution region. SRC is the register
1416 holding per-lane storage previously allocated by omp_simt_enter insn. */
1419 nvptx_output_simt_exit (rtx src
)
1421 nvptx_output_unisimt_switch (asm_out_file
, false);
1422 nvptx_output_softstack_switch (asm_out_file
, false, src
, NULL_RTX
, NULL_RTX
);
1426 /* Output instruction that sets soft stack pointer in shared memory to the
1427 value in register given by SRC_REGNO. */
1430 nvptx_output_set_softstack (unsigned src_regno
)
1432 if (cfun
->machine
->has_softstack
&& !crtl
->is_leaf
)
1434 fprintf (asm_out_file
, "\tst.shared.u%d\t[%s], ",
1435 POINTER_SIZE
, reg_names
[SOFTSTACK_SLOT_REGNUM
]);
1436 output_reg (asm_out_file
, src_regno
, VOIDmode
);
1437 fprintf (asm_out_file
, ";\n");
1441 /* Output a return instruction. Also copy the return value to its outgoing
1445 nvptx_output_return (void)
1447 machine_mode mode
= (machine_mode
)cfun
->machine
->return_mode
;
1449 if (mode
!= VOIDmode
)
1450 fprintf (asm_out_file
, "\tst.param%s\t[%s_out], %s;\n",
1451 nvptx_ptx_type_from_mode (mode
, false),
1452 reg_names
[NVPTX_RETURN_REGNUM
],
1453 reg_names
[NVPTX_RETURN_REGNUM
]);
/* Terminate a function by writing a closing brace to FILE.  */

static void
nvptx_function_end (FILE *file)
{
  /* PTX functions are brace-delimited; emit the terminator.  */
  fputs ("}\n", file);
}
1466 /* Decide whether we can make a sibling call to a function. For ptx, we
1470 nvptx_function_ok_for_sibcall (tree
, tree
)
1475 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1478 nvptx_get_drap_rtx (void)
1480 if (TARGET_SOFT_STACK
&& stack_realign_drap
)
1481 return arg_pointer_rtx
;
1485 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1486 argument to the next call. */
1489 nvptx_call_args (rtx arg
, tree fntype
)
1491 if (!cfun
->machine
->doing_call
)
1493 cfun
->machine
->doing_call
= true;
1494 cfun
->machine
->is_varadic
= false;
1495 cfun
->machine
->num_args
= 0;
1497 if (fntype
&& stdarg_p (fntype
))
1499 cfun
->machine
->is_varadic
= true;
1500 cfun
->machine
->has_varadic
= true;
1501 cfun
->machine
->num_args
++;
1505 if (REG_P (arg
) && arg
!= pc_rtx
)
1507 cfun
->machine
->num_args
++;
1508 cfun
->machine
->call_args
= alloc_EXPR_LIST (VOIDmode
, arg
,
1509 cfun
->machine
->call_args
);
1513 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1514 information we recorded. */
1517 nvptx_end_call_args (void)
1519 cfun
->machine
->doing_call
= false;
1520 free_EXPR_LIST_list (&cfun
->machine
->call_args
);
1523 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1524 track of whether calls involving static chains or varargs were seen
1525 in the current function.
1526 For libcalls, maintain a hash table of decls we have seen, and
1527 record a function decl for later when encountering a new one. */
1530 nvptx_expand_call (rtx retval
, rtx address
)
1532 rtx callee
= XEXP (address
, 0);
1533 rtx varargs
= NULL_RTX
;
1534 unsigned parallel
= 0;
1536 if (!call_insn_operand (callee
, Pmode
))
1538 callee
= force_reg (Pmode
, callee
);
1539 address
= change_address (address
, QImode
, callee
);
1542 if (GET_CODE (callee
) == SYMBOL_REF
)
1544 tree decl
= SYMBOL_REF_DECL (callee
);
1545 if (decl
!= NULL_TREE
)
1547 if (DECL_STATIC_CHAIN (decl
))
1548 cfun
->machine
->has_chain
= true;
1550 tree attr
= oacc_get_fn_attrib (decl
);
1553 tree dims
= TREE_VALUE (attr
);
1555 parallel
= GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1;
1556 for (int ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
1558 if (TREE_PURPOSE (dims
)
1559 && !integer_zerop (TREE_PURPOSE (dims
)))
1561 /* Not on this axis. */
1562 parallel
^= GOMP_DIM_MASK (ix
);
1563 dims
= TREE_CHAIN (dims
);
1569 unsigned nargs
= cfun
->machine
->num_args
;
1570 if (cfun
->machine
->is_varadic
)
1572 varargs
= gen_reg_rtx (Pmode
);
1573 emit_move_insn (varargs
, stack_pointer_rtx
);
1576 rtvec vec
= rtvec_alloc (nargs
+ 1);
1577 rtx pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
1580 rtx call
= gen_rtx_CALL (VOIDmode
, address
, const0_rtx
);
1581 rtx tmp_retval
= retval
;
1584 if (!nvptx_register_operand (retval
, GET_MODE (retval
)))
1585 tmp_retval
= gen_reg_rtx (GET_MODE (retval
));
1586 call
= gen_rtx_SET (tmp_retval
, call
);
1588 XVECEXP (pat
, 0, vec_pos
++) = call
;
1590 /* Construct the call insn, including a USE for each argument pseudo
1591 register. These will be used when printing the insn. */
1592 for (rtx arg
= cfun
->machine
->call_args
; arg
; arg
= XEXP (arg
, 1))
1593 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, XEXP (arg
, 0));
1596 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, varargs
);
1598 gcc_assert (vec_pos
= XVECLEN (pat
, 0));
1600 nvptx_emit_forking (parallel
, true);
1601 emit_call_insn (pat
);
1602 nvptx_emit_joining (parallel
, true);
1604 if (tmp_retval
!= retval
)
1605 emit_move_insn (retval
, tmp_retval
);
1608 /* Emit a comparison COMPARE, and return the new test to be used in the
1612 nvptx_expand_compare (rtx compare
)
1614 rtx pred
= gen_reg_rtx (BImode
);
1615 rtx cmp
= gen_rtx_fmt_ee (GET_CODE (compare
), BImode
,
1616 XEXP (compare
, 0), XEXP (compare
, 1));
1617 emit_insn (gen_rtx_SET (pred
, cmp
));
1618 return gen_rtx_NE (BImode
, pred
, const0_rtx
);
1621 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1624 nvptx_expand_oacc_fork (unsigned mode
)
1626 nvptx_emit_forking (GOMP_DIM_MASK (mode
), false);
1630 nvptx_expand_oacc_join (unsigned mode
)
1632 nvptx_emit_joining (GOMP_DIM_MASK (mode
), false);
1635 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1639 nvptx_gen_unpack (rtx dst0
, rtx dst1
, rtx src
)
1643 switch (GET_MODE (src
))
1646 res
= gen_unpackdisi2 (dst0
, dst1
, src
);
1649 res
= gen_unpackdfsi2 (dst0
, dst1
, src
);
1651 default: gcc_unreachable ();
1656 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1660 nvptx_gen_pack (rtx dst
, rtx src0
, rtx src1
)
1664 switch (GET_MODE (dst
))
1667 res
= gen_packsidi2 (dst
, src0
, src1
);
1670 res
= gen_packsidf2 (dst
, src0
, src1
);
1672 default: gcc_unreachable ();
1677 /* Generate an instruction or sequence to broadcast register REG
1678 across the vectors of a single warp. */
1681 nvptx_gen_shuffle (rtx dst
, rtx src
, rtx idx
, nvptx_shuffle_kind kind
)
1685 switch (GET_MODE (dst
))
1688 res
= gen_nvptx_shufflesi (dst
, src
, idx
, GEN_INT (kind
));
1691 res
= gen_nvptx_shufflesf (dst
, src
, idx
, GEN_INT (kind
));
1696 rtx tmp0
= gen_reg_rtx (SImode
);
1697 rtx tmp1
= gen_reg_rtx (SImode
);
1700 emit_insn (nvptx_gen_unpack (tmp0
, tmp1
, src
));
1701 emit_insn (nvptx_gen_shuffle (tmp0
, tmp0
, idx
, kind
));
1702 emit_insn (nvptx_gen_shuffle (tmp1
, tmp1
, idx
, kind
));
1703 emit_insn (nvptx_gen_pack (dst
, tmp0
, tmp1
));
1710 rtx tmp
= gen_reg_rtx (SImode
);
1713 emit_insn (gen_sel_truesi (tmp
, src
, GEN_INT (1), const0_rtx
));
1714 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
1715 emit_insn (gen_rtx_SET (dst
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
1723 rtx tmp
= gen_reg_rtx (SImode
);
1726 emit_insn (gen_rtx_SET (tmp
, gen_rtx_fmt_e (ZERO_EXTEND
, SImode
, src
)));
1727 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
1728 emit_insn (gen_rtx_SET (dst
, gen_rtx_fmt_e (TRUNCATE
, GET_MODE (dst
),
1741 /* Generate an instruction or sequence to broadcast register REG
1742 across the vectors of a single warp. */
1745 nvptx_gen_vcast (rtx reg
)
1747 return nvptx_gen_shuffle (reg
, reg
, const0_rtx
, SHUFFLE_IDX
);
1750 /* Structure used when generating a worker-level spill or fill. */
1754 rtx base
; /* Register holding base addr of buffer. */
1755 rtx ptr
; /* Iteration var, if needed. */
1756 unsigned offset
; /* Offset into worker buffer. */
1759 /* Direction of the spill/fill and looping setup/teardown indicator. */
1765 PM_loop_begin
= 1 << 2,
1766 PM_loop_end
= 1 << 3,
1768 PM_read_write
= PM_read
| PM_write
1771 /* Generate instruction(s) to spill or fill register REG to/from the
1772 worker broadcast array. PM indicates what is to be done, REP
1773 how many loop iterations will be executed (0 for not a loop). */
1776 nvptx_gen_wcast (rtx reg
, propagate_mask pm
, unsigned rep
, wcast_data_t
*data
)
1779 machine_mode mode
= GET_MODE (reg
);
1785 rtx tmp
= gen_reg_rtx (SImode
);
1789 emit_insn (gen_sel_truesi (tmp
, reg
, GEN_INT (1), const0_rtx
));
1790 emit_insn (nvptx_gen_wcast (tmp
, pm
, rep
, data
));
1792 emit_insn (gen_rtx_SET (reg
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
1800 rtx addr
= data
->ptr
;
1804 unsigned align
= GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
;
1806 if (align
> worker_bcast_align
)
1807 worker_bcast_align
= align
;
1808 data
->offset
= (data
->offset
+ align
- 1) & ~(align
- 1);
1811 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (data
->offset
));
1814 addr
= gen_rtx_MEM (mode
, addr
);
1816 res
= gen_rtx_SET (addr
, reg
);
1817 else if (pm
== PM_write
)
1818 res
= gen_rtx_SET (reg
, addr
);
1824 /* We're using a ptr, increment it. */
1828 emit_insn (gen_adddi3 (data
->ptr
, data
->ptr
,
1829 GEN_INT (GET_MODE_SIZE (GET_MODE (reg
)))));
1835 data
->offset
+= rep
* GET_MODE_SIZE (GET_MODE (reg
));
1842 /* Returns true if X is a valid address for use in a memory reference. */
1845 nvptx_legitimate_address_p (machine_mode
, rtx x
, bool)
1847 enum rtx_code code
= GET_CODE (x
);
1855 if (REG_P (XEXP (x
, 0)) && CONST_INT_P (XEXP (x
, 1)))
1869 /* Machinery to output constant initializers. When beginning an
1870 initializer, we decide on a fragment size (which is visible in ptx
1871 in the type used), and then all initializer data is buffered until
1872 a fragment is filled and ready to be written out. */
1876 unsigned HOST_WIDE_INT mask
; /* Mask for storing fragment. */
1877 unsigned HOST_WIDE_INT val
; /* Current fragment value. */
1878 unsigned HOST_WIDE_INT remaining
; /* Remaining bytes to be written
1880 unsigned size
; /* Fragment size to accumulate. */
1881 unsigned offset
; /* Offset within current fragment. */
1882 bool started
; /* Whether we've output any initializer. */
1885 /* The current fragment is full, write it out. SYM may provide a
1886 symbolic reference we should output, in which case the fragment
1887 value is the addend. */
1890 output_init_frag (rtx sym
)
1892 fprintf (asm_out_file
, init_frag
.started
? ", " : " = { ");
1893 unsigned HOST_WIDE_INT val
= init_frag
.val
;
1895 init_frag
.started
= true;
1897 init_frag
.offset
= 0;
1898 init_frag
.remaining
--;
1902 bool function
= (SYMBOL_REF_DECL (sym
)
1903 && (TREE_CODE (SYMBOL_REF_DECL (sym
)) == FUNCTION_DECL
));
1905 fprintf (asm_out_file
, "generic(");
1906 output_address (VOIDmode
, sym
);
1908 fprintf (asm_out_file
, ")");
1910 fprintf (asm_out_file
, " + ");
1914 fprintf (asm_out_file
, HOST_WIDE_INT_PRINT_DEC
, val
);
1917 /* Add value VAL of size SIZE to the data we're emitting, and keep
1918 writing out chunks as they fill up. */
1921 nvptx_assemble_value (unsigned HOST_WIDE_INT val
, unsigned size
)
1923 val
&= ((unsigned HOST_WIDE_INT
)2 << (size
* BITS_PER_UNIT
- 1)) - 1;
1925 for (unsigned part
= 0; size
; size
-= part
)
1927 val
>>= part
* BITS_PER_UNIT
;
1928 part
= init_frag
.size
- init_frag
.offset
;
1932 unsigned HOST_WIDE_INT partial
1933 = val
<< (init_frag
.offset
* BITS_PER_UNIT
);
1934 init_frag
.val
|= partial
& init_frag
.mask
;
1935 init_frag
.offset
+= part
;
1937 if (init_frag
.offset
== init_frag
.size
)
1938 output_init_frag (NULL
);
1942 /* Target hook for assembling integer object X of size SIZE. */
1945 nvptx_assemble_integer (rtx x
, unsigned int size
, int ARG_UNUSED (aligned_p
))
1947 HOST_WIDE_INT val
= 0;
1949 switch (GET_CODE (x
))
1952 /* Let the generic machinery figure it out, usually for a
1957 nvptx_assemble_value (INTVAL (x
), size
);
1962 gcc_assert (GET_CODE (x
) == PLUS
);
1963 val
= INTVAL (XEXP (x
, 1));
1965 gcc_assert (GET_CODE (x
) == SYMBOL_REF
);
1969 gcc_assert (size
== init_frag
.size
);
1970 if (init_frag
.offset
)
1971 sorry ("cannot emit unaligned pointers in ptx assembly");
1973 nvptx_maybe_record_fnsym (x
);
1974 init_frag
.val
= val
;
1975 output_init_frag (x
);
1982 /* Output SIZE zero bytes. We ignore the FILE argument since the
1983 functions we're calling to perform the output just use
1987 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size
)
1989 /* Finish the current fragment, if it's started. */
1990 if (init_frag
.offset
)
1992 unsigned part
= init_frag
.size
- init_frag
.offset
;
1994 part
= (unsigned) size
;
1996 nvptx_assemble_value (0, part
);
1999 /* If this skip doesn't terminate the initializer, write as many
2000 remaining pieces as possible directly. */
2001 if (size
< init_frag
.remaining
* init_frag
.size
)
2003 while (size
>= init_frag
.size
)
2005 size
-= init_frag
.size
;
2006 output_init_frag (NULL_RTX
);
2009 nvptx_assemble_value (0, size
);
2013 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2014 ignore the FILE arg. */
2017 nvptx_output_ascii (FILE *, const char *str
, unsigned HOST_WIDE_INT size
)
2019 for (unsigned HOST_WIDE_INT i
= 0; i
< size
; i
++)
2020 nvptx_assemble_value (str
[i
], 1);
2023 /* Emit a PTX variable decl and prepare for emission of its
2024 initializer. NAME is the symbol name and SETION the PTX data
2025 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2026 The caller has already emitted any indentation and linkage
2027 specifier. It is responsible for any initializer, terminating ;
2028 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2029 this is the opposite way round that PTX wants them! */
2032 nvptx_assemble_decl_begin (FILE *file
, const char *name
, const char *section
,
2033 const_tree type
, HOST_WIDE_INT size
, unsigned align
)
2035 while (TREE_CODE (type
) == ARRAY_TYPE
)
2036 type
= TREE_TYPE (type
);
2038 if (TREE_CODE (type
) == VECTOR_TYPE
2039 || TREE_CODE (type
) == COMPLEX_TYPE
)
2040 /* Neither vector nor complex types can contain the other. */
2041 type
= TREE_TYPE (type
);
2043 unsigned elt_size
= int_size_in_bytes (type
);
2045 /* Largest mode we're prepared to accept. For BLKmode types we
2046 don't know if it'll contain pointer constants, so have to choose
2047 pointer size, otherwise we can choose DImode. */
2048 machine_mode elt_mode
= TYPE_MODE (type
) == BLKmode
? Pmode
: DImode
;
2050 elt_size
|= GET_MODE_SIZE (elt_mode
);
2051 elt_size
&= -elt_size
; /* Extract LSB set. */
2053 init_frag
.size
= elt_size
;
2054 /* Avoid undefined shift behavior by using '2'. */
2055 init_frag
.mask
= ((unsigned HOST_WIDE_INT
)2
2056 << (elt_size
* BITS_PER_UNIT
- 1)) - 1;
2058 init_frag
.offset
= 0;
2059 init_frag
.started
= false;
2060 /* Size might not be a multiple of elt size, if there's an
2061 initialized trailing struct array with smaller type than
2063 init_frag
.remaining
= (size
+ elt_size
- 1) / elt_size
;
2065 fprintf (file
, "%s .align %d .u%d ",
2066 section
, align
/ BITS_PER_UNIT
,
2067 elt_size
* BITS_PER_UNIT
);
2068 assemble_name (file
, name
);
2071 /* We make everything an array, to simplify any initialization
2073 fprintf (file
, "[" HOST_WIDE_INT_PRINT_DEC
"]", init_frag
.remaining
);
2076 /* Called when the initializer for a decl has been completely output through
2077 combinations of the three functions above. */
2080 nvptx_assemble_decl_end (void)
2082 if (init_frag
.offset
)
2083 /* This can happen with a packed struct with trailing array member. */
2084 nvptx_assemble_value (0, init_frag
.size
- init_frag
.offset
);
2085 fprintf (asm_out_file
, init_frag
.started
? " };\n" : ";\n");
2088 /* Output an uninitialized common or file-scope variable. */
2091 nvptx_output_aligned_decl (FILE *file
, const char *name
,
2092 const_tree decl
, HOST_WIDE_INT size
, unsigned align
)
2094 write_var_marker (file
, true, TREE_PUBLIC (decl
), name
);
2096 /* If this is public, it is common. The nearest thing we have to
2098 fprintf (file
, "\t%s", TREE_PUBLIC (decl
) ? ".weak " : "");
2100 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2101 TREE_TYPE (decl
), size
, align
);
2102 nvptx_assemble_decl_end ();
2105 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2106 writing a constant variable EXP with NAME and SIZE and its
2107 initializer to FILE. */
2110 nvptx_asm_declare_constant_name (FILE *file
, const char *name
,
2111 const_tree exp
, HOST_WIDE_INT obj_size
)
2113 write_var_marker (file
, true, false, name
);
2115 fprintf (file
, "\t");
2117 tree type
= TREE_TYPE (exp
);
2118 nvptx_assemble_decl_begin (file
, name
, ".const", type
, obj_size
,
2122 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2123 a variable DECL with NAME to FILE. */
2126 nvptx_declare_object_name (FILE *file
, const char *name
, const_tree decl
)
2128 write_var_marker (file
, true, TREE_PUBLIC (decl
), name
);
2130 fprintf (file
, "\t%s", (!TREE_PUBLIC (decl
) ? ""
2131 : DECL_WEAK (decl
) ? ".weak " : ".visible "));
2133 tree type
= TREE_TYPE (decl
);
2134 HOST_WIDE_INT obj_size
= tree_to_shwi (DECL_SIZE_UNIT (decl
));
2135 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2136 type
, obj_size
, DECL_ALIGN (decl
));
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing: PTX linkage
   is expressed via .visible/.weak on the declaration itself.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}
2146 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2147 declaration only for variable DECL with NAME to FILE. */
2150 nvptx_assemble_undefined_decl (FILE *file
, const char *name
, const_tree decl
)
2152 /* The middle end can place constant pool decls into the varpool as
2153 undefined. Until that is fixed, catch the problem here. */
2154 if (DECL_IN_CONSTANT_POOL (decl
))
2157 /* We support weak defintions, and hence have the right
2158 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2159 if (DECL_WEAK (decl
))
2160 error_at (DECL_SOURCE_LOCATION (decl
),
2161 "PTX does not support weak declarations"
2162 " (only weak definitions)");
2163 write_var_marker (file
, false, TREE_PUBLIC (decl
), name
);
2165 fprintf (file
, "\t.extern ");
2166 tree size
= DECL_SIZE_UNIT (decl
);
2167 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2168 TREE_TYPE (decl
), size
? tree_to_shwi (size
) : 0,
2170 nvptx_assemble_decl_end ();
2173 /* Output a pattern for a move instruction. */
2176 nvptx_output_mov_insn (rtx dst
, rtx src
)
2178 machine_mode dst_mode
= GET_MODE (dst
);
2179 machine_mode dst_inner
= (GET_CODE (dst
) == SUBREG
2180 ? GET_MODE (XEXP (dst
, 0)) : dst_mode
);
2181 machine_mode src_inner
= (GET_CODE (src
) == SUBREG
2182 ? GET_MODE (XEXP (src
, 0)) : dst_mode
);
2185 if (GET_CODE (sym
) == CONST
)
2186 sym
= XEXP (XEXP (sym
, 0), 0);
2187 if (SYMBOL_REF_P (sym
))
2189 if (SYMBOL_DATA_AREA (sym
) != DATA_AREA_GENERIC
)
2190 return "%.\tcvta%D1%t0\t%0, %1;";
2191 nvptx_maybe_record_fnsym (sym
);
2194 if (src_inner
== dst_inner
)
2195 return "%.\tmov%t0\t%0, %1;";
2197 if (CONSTANT_P (src
))
2198 return (GET_MODE_CLASS (dst_inner
) == MODE_INT
2199 && GET_MODE_CLASS (src_inner
) != MODE_FLOAT
2200 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2202 if (GET_MODE_SIZE (dst_inner
) == GET_MODE_SIZE (src_inner
))
2204 if (GET_MODE_BITSIZE (dst_mode
) == 128
2205 && GET_MODE_BITSIZE (GET_MODE (src
)) == 128)
2207 /* mov.b128 is not supported. */
2208 if (dst_inner
== V2DImode
&& src_inner
== TImode
)
2209 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2210 else if (dst_inner
== TImode
&& src_inner
== V2DImode
)
2211 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2215 return "%.\tmov.b%T0\t%0, %1;";
2218 return "%.\tcvt%t0%t1\t%0, %1;";
2221 static void nvptx_print_operand (FILE *, rtx
, int);
2223 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2224 involves writing .param declarations and in/out copies into them. For
2225 indirect calls, also write the .callprototype. */
2228 nvptx_output_call_insn (rtx_insn
*insn
, rtx result
, rtx callee
)
2232 bool needs_tgt
= register_operand (callee
, Pmode
);
2233 rtx pat
= PATTERN (insn
);
2234 if (GET_CODE (pat
) == COND_EXEC
)
2235 pat
= COND_EXEC_CODE (pat
);
2236 int arg_end
= XVECLEN (pat
, 0);
2237 tree decl
= NULL_TREE
;
2239 fprintf (asm_out_file
, "\t{\n");
2241 fprintf (asm_out_file
, "\t\t.param%s %s_in;\n",
2242 nvptx_ptx_type_from_mode (GET_MODE (result
), false),
2243 reg_names
[NVPTX_RETURN_REGNUM
]);
2245 /* Ensure we have a ptx declaration in the output if necessary. */
2246 if (GET_CODE (callee
) == SYMBOL_REF
)
2248 decl
= SYMBOL_REF_DECL (callee
);
2250 || (DECL_EXTERNAL (decl
) && !TYPE_ARG_TYPES (TREE_TYPE (decl
))))
2251 nvptx_record_libfunc (callee
, result
, pat
);
2252 else if (DECL_EXTERNAL (decl
))
2253 nvptx_record_fndecl (decl
);
2258 ASM_GENERATE_INTERNAL_LABEL (buf
, "LCT", labelno
);
2260 ASM_OUTPUT_LABEL (asm_out_file
, buf
);
2261 std::stringstream s
;
2262 write_fn_proto_from_insn (s
, NULL
, result
, pat
);
2263 fputs (s
.str().c_str(), asm_out_file
);
2266 for (int argno
= 1; argno
< arg_end
; argno
++)
2268 rtx t
= XEXP (XVECEXP (pat
, 0, argno
), 0);
2269 machine_mode mode
= GET_MODE (t
);
2270 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
2272 /* Mode splitting has already been done. */
2273 fprintf (asm_out_file
, "\t\t.param%s %%out_arg%d;\n"
2274 "\t\tst.param%s [%%out_arg%d], ",
2275 ptx_type
, argno
, ptx_type
, argno
);
2276 output_reg (asm_out_file
, REGNO (t
), VOIDmode
);
2277 fprintf (asm_out_file
, ";\n");
2280 /* The '.' stands for the call's predicate, if any. */
2281 nvptx_print_operand (asm_out_file
, NULL_RTX
, '.');
2282 fprintf (asm_out_file
, "\t\tcall ");
2283 if (result
!= NULL_RTX
)
2284 fprintf (asm_out_file
, "(%s_in), ", reg_names
[NVPTX_RETURN_REGNUM
]);
2288 const char *name
= get_fnname_from_decl (decl
);
2289 name
= nvptx_name_replacement (name
);
2290 assemble_name (asm_out_file
, name
);
2293 output_address (VOIDmode
, callee
);
2295 const char *open
= "(";
2296 for (int argno
= 1; argno
< arg_end
; argno
++)
2298 fprintf (asm_out_file
, ", %s%%out_arg%d", open
, argno
);
2301 if (decl
&& DECL_STATIC_CHAIN (decl
))
2303 fprintf (asm_out_file
, ", %s%s", open
, reg_names
[STATIC_CHAIN_REGNUM
]);
2307 fprintf (asm_out_file
, ")");
2311 fprintf (asm_out_file
, ", ");
2312 assemble_name (asm_out_file
, buf
);
2314 fprintf (asm_out_file
, ";\n");
2316 if (find_reg_note (insn
, REG_NORETURN
, NULL
))
2318 /* No return functions confuse the PTX JIT, as it doesn't realize
2319 the flow control barrier they imply. It can seg fault if it
2320 encounters what looks like an unexitable loop. Emit a trailing
2321 trap and exit, which it does grok. */
2322 fprintf (asm_out_file
, "\t\ttrap; // (noreturn)\n");
2323 fprintf (asm_out_file
, "\t\texit; // (noreturn)\n");
2328 static char rval
[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2331 /* We must escape the '%' that starts RETURN_REGNUM. */
2332 sprintf (rval
, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2333 reg_names
[NVPTX_RETURN_REGNUM
]);
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  Only '.' (predicate)
   and '#' (rounding mode) are punctuation codes for this target.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;
    default:
      return false;
    }
}
2348 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2351 nvptx_print_address_operand (FILE *file
, rtx x
, machine_mode
)
2354 if (GET_CODE (x
) == CONST
)
2356 switch (GET_CODE (x
))
2360 output_address (VOIDmode
, XEXP (x
, 0));
2361 fprintf (file
, "+");
2362 output_address (VOIDmode
, off
);
2367 output_addr_const (file
, x
);
2371 gcc_assert (GET_CODE (x
) != MEM
);
2372 nvptx_print_operand (file
, x
, 0);
2377 /* Write assembly language output for the address ADDR to FILE. */
2380 nvptx_print_operand_address (FILE *file
, machine_mode mode
, rtx addr
)
2382 nvptx_print_address_operand (file
, addr
, mode
);
2385 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2388 . -- print the predicate for the instruction or an emptry string for an
2390 # -- print a rounding mode for the instruction
2392 A -- print a data area for a MEM
2393 c -- print an opcode suffix for a comparison operator, including a type code
2394 D -- print a data area for a MEM operand
2395 S -- print a shuffle kind specified by CONST_INT
2396 t -- print a type opcode suffix, promoting QImode to 32 bits
2397 T -- print a type size in bits
2398 u -- print a type opcode suffix without promotions. */
2401 nvptx_print_operand (FILE *file
, rtx x
, int code
)
2405 x
= current_insn_predicate
;
2409 if (GET_CODE (x
) == EQ
)
2411 output_reg (file
, REGNO (XEXP (x
, 0)), VOIDmode
);
2415 else if (code
== '#')
2417 fputs (".rn", file
);
2421 enum rtx_code x_code
= GET_CODE (x
);
2422 machine_mode mode
= GET_MODE (x
);
2431 if (GET_CODE (x
) == CONST
)
2433 if (GET_CODE (x
) == PLUS
)
2436 if (GET_CODE (x
) == SYMBOL_REF
)
2437 fputs (section_for_sym (x
), file
);
2442 if (x_code
== SUBREG
)
2444 machine_mode inner_mode
= GET_MODE (SUBREG_REG (x
));
2445 if (VECTOR_MODE_P (inner_mode
)
2446 && (GET_MODE_SIZE (mode
)
2447 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode
))))
2448 mode
= GET_MODE_INNER (inner_mode
);
2449 else if (split_mode_p (inner_mode
))
2450 mode
= maybe_split_mode (inner_mode
);
2454 fprintf (file
, "%s", nvptx_ptx_type_from_mode (mode
, code
== 't'));
2460 rtx inner_x
= SUBREG_REG (x
);
2461 machine_mode inner_mode
= GET_MODE (inner_x
);
2462 machine_mode split
= maybe_split_mode (inner_mode
);
2464 output_reg (file
, REGNO (inner_x
), split
,
2466 ? GET_MODE_SIZE (inner_mode
) / 2
2473 nvptx_shuffle_kind kind
= (nvptx_shuffle_kind
) UINTVAL (x
);
2474 /* Same order as nvptx_shuffle_kind. */
2475 static const char *const kinds
[] =
2476 {".up", ".down", ".bfly", ".idx"};
2477 fputs (kinds
[kind
], file
);
2482 fprintf (file
, "%d", GET_MODE_BITSIZE (mode
));
2486 fprintf (file
, "@");
2490 fprintf (file
, "@!");
2494 mode
= GET_MODE (XEXP (x
, 0));
2498 fputs (".eq", file
);
2501 if (FLOAT_MODE_P (mode
))
2502 fputs (".neu", file
);
2504 fputs (".ne", file
);
2508 fputs (".le", file
);
2512 fputs (".ge", file
);
2516 fputs (".lt", file
);
2520 fputs (".gt", file
);
2523 fputs (".ne", file
);
2526 fputs (".equ", file
);
2529 fputs (".leu", file
);
2532 fputs (".geu", file
);
2535 fputs (".ltu", file
);
2538 fputs (".gtu", file
);
2541 fputs (".nan", file
);
2544 fputs (".num", file
);
2549 if (FLOAT_MODE_P (mode
)
2550 || x_code
== EQ
|| x_code
== NE
2551 || x_code
== GEU
|| x_code
== GTU
2552 || x_code
== LEU
|| x_code
== LTU
)
2553 fputs (nvptx_ptx_type_from_mode (mode
, true), file
);
2555 fprintf (file
, ".s%d", GET_MODE_BITSIZE (mode
));
2563 rtx inner_x
= SUBREG_REG (x
);
2564 machine_mode inner_mode
= GET_MODE (inner_x
);
2565 machine_mode split
= maybe_split_mode (inner_mode
);
2567 if (VECTOR_MODE_P (inner_mode
)
2568 && (GET_MODE_SIZE (mode
)
2569 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode
))))
2571 output_reg (file
, REGNO (inner_x
), VOIDmode
);
2572 fprintf (file
, ".%s", SUBREG_BYTE (x
) == 0 ? "x" : "y");
2574 else if (split_mode_p (inner_mode
)
2575 && (GET_MODE_SIZE (inner_mode
) == GET_MODE_SIZE (mode
)))
2576 output_reg (file
, REGNO (inner_x
), split
);
2578 output_reg (file
, REGNO (inner_x
), split
, SUBREG_BYTE (x
));
2583 output_reg (file
, REGNO (x
), maybe_split_mode (mode
));
2588 nvptx_print_address_operand (file
, XEXP (x
, 0), mode
);
2593 output_addr_const (file
, x
);
2599 /* We could use output_addr_const, but that can print things like
2600 "x-8", which breaks ptxas. Need to ensure it is output as
2602 nvptx_print_address_operand (file
, x
, VOIDmode
);
2607 real_to_target (vals
, CONST_DOUBLE_REAL_VALUE (x
), mode
);
2608 vals
[0] &= 0xffffffff;
2609 vals
[1] &= 0xffffffff;
2611 fprintf (file
, "0f%08lx", vals
[0]);
2613 fprintf (file
, "0d%08lx%08lx", vals
[1], vals
[0]);
2618 unsigned n
= CONST_VECTOR_NUNITS (x
);
2619 fprintf (file
, "{ ");
2620 for (unsigned i
= 0; i
< n
; ++i
)
2623 fprintf (file
, ", ");
2625 rtx elem
= CONST_VECTOR_ELT (x
, i
);
2626 output_addr_const (file
, elem
);
2628 fprintf (file
, " }");
2633 output_addr_const (file
, x
);
2638 /* Record replacement regs used to deal with subreg operands. */
2641 rtx replacement
[MAX_RECOG_OPERANDS
];
2647 /* Allocate or reuse a replacement in R and return the rtx. */
2650 get_replacement (struct reg_replace
*r
)
2652 if (r
->n_allocated
== r
->n_in_use
)
2653 r
->replacement
[r
->n_allocated
++] = gen_reg_rtx (r
->mode
);
2654 return r
->replacement
[r
->n_in_use
++];
2657 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2658 the presence of subregs would break the rules for most instructions.
2659 Replace them with a suitable new register of the right size, plus
2660 conversion copyin/copyout instructions. */
2663 nvptx_reorg_subreg (void)
2665 struct reg_replace qiregs
, hiregs
, siregs
, diregs
;
2666 rtx_insn
*insn
, *next
;
2668 qiregs
.n_allocated
= 0;
2669 hiregs
.n_allocated
= 0;
2670 siregs
.n_allocated
= 0;
2671 diregs
.n_allocated
= 0;
2672 qiregs
.mode
= QImode
;
2673 hiregs
.mode
= HImode
;
2674 siregs
.mode
= SImode
;
2675 diregs
.mode
= DImode
;
2677 for (insn
= get_insns (); insn
; insn
= next
)
2679 next
= NEXT_INSN (insn
);
2680 if (!NONDEBUG_INSN_P (insn
)
2681 || asm_noperands (PATTERN (insn
)) >= 0
2682 || GET_CODE (PATTERN (insn
)) == USE
2683 || GET_CODE (PATTERN (insn
)) == CLOBBER
)
2686 qiregs
.n_in_use
= 0;
2687 hiregs
.n_in_use
= 0;
2688 siregs
.n_in_use
= 0;
2689 diregs
.n_in_use
= 0;
2690 extract_insn (insn
);
2691 enum attr_subregs_ok s_ok
= get_attr_subregs_ok (insn
);
2693 for (int i
= 0; i
< recog_data
.n_operands
; i
++)
2695 rtx op
= recog_data
.operand
[i
];
2696 if (GET_CODE (op
) != SUBREG
)
2699 rtx inner
= SUBREG_REG (op
);
2701 machine_mode outer_mode
= GET_MODE (op
);
2702 machine_mode inner_mode
= GET_MODE (inner
);
2705 && (GET_MODE_PRECISION (inner_mode
)
2706 >= GET_MODE_PRECISION (outer_mode
)))
2708 gcc_assert (SCALAR_INT_MODE_P (outer_mode
));
2709 struct reg_replace
*r
= (outer_mode
== QImode
? &qiregs
2710 : outer_mode
== HImode
? &hiregs
2711 : outer_mode
== SImode
? &siregs
2713 rtx new_reg
= get_replacement (r
);
2715 if (recog_data
.operand_type
[i
] != OP_OUT
)
2718 if (GET_MODE_PRECISION (inner_mode
)
2719 < GET_MODE_PRECISION (outer_mode
))
2724 rtx pat
= gen_rtx_SET (new_reg
,
2725 gen_rtx_fmt_e (code
, outer_mode
, inner
));
2726 emit_insn_before (pat
, insn
);
2729 if (recog_data
.operand_type
[i
] != OP_IN
)
2732 if (GET_MODE_PRECISION (inner_mode
)
2733 < GET_MODE_PRECISION (outer_mode
))
2738 rtx pat
= gen_rtx_SET (inner
,
2739 gen_rtx_fmt_e (code
, inner_mode
, new_reg
));
2740 emit_insn_after (pat
, insn
);
2742 validate_change (insn
, recog_data
.operand_loc
[i
], new_reg
, false);
2747 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2751 nvptx_get_unisimt_master ()
2753 rtx
&master
= cfun
->machine
->unisimt_master
;
2754 return master
? master
: master
= gen_reg_rtx (SImode
);
2757 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2760 nvptx_get_unisimt_predicate ()
2762 rtx
&pred
= cfun
->machine
->unisimt_predicate
;
2763 return pred
? pred
: pred
= gen_reg_rtx (BImode
);
2766 /* Return true if given call insn references one of the functions provided by
2767 the CUDA runtime: malloc, free, vprintf. */
2770 nvptx_call_insn_is_syscall_p (rtx_insn
*insn
)
2772 rtx pat
= PATTERN (insn
);
2773 gcc_checking_assert (GET_CODE (pat
) == PARALLEL
);
2774 pat
= XVECEXP (pat
, 0, 0);
2775 if (GET_CODE (pat
) == SET
)
2776 pat
= SET_SRC (pat
);
2777 gcc_checking_assert (GET_CODE (pat
) == CALL
2778 && GET_CODE (XEXP (pat
, 0)) == MEM
);
2779 rtx addr
= XEXP (XEXP (pat
, 0), 0);
2780 if (GET_CODE (addr
) != SYMBOL_REF
)
2782 const char *name
= XSTR (addr
, 0);
2783 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
2784 references with forced assembler name refer to PTX syscalls. For vprintf,
2785 accept both normal and forced-assembler-name references. */
2786 return (!strcmp (name
, "vprintf") || !strcmp (name
, "*vprintf")
2787 || !strcmp (name
, "*malloc")
2788 || !strcmp (name
, "*free"));
2791 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2792 propagate its value from lane MASTER to current lane. */
2795 nvptx_unisimt_handle_set (rtx set
, rtx_insn
*insn
, rtx master
)
2798 if (GET_CODE (set
) == SET
&& REG_P (reg
= SET_DEST (set
)))
2799 emit_insn_after (nvptx_gen_shuffle (reg
, reg
, master
, SHUFFLE_IDX
), insn
);
2802 /* Adjust code for uniform-simt code generation variant by making atomics and
2803 "syscalls" conditionally executed, and inserting shuffle-based propagation
2804 for registers being set. */
2807 nvptx_reorg_uniform_simt ()
2809 rtx_insn
*insn
, *next
;
2811 for (insn
= get_insns (); insn
; insn
= next
)
2813 next
= NEXT_INSN (insn
);
2814 if (!(CALL_P (insn
) && nvptx_call_insn_is_syscall_p (insn
))
2815 && !(NONJUMP_INSN_P (insn
)
2816 && GET_CODE (PATTERN (insn
)) == PARALLEL
2817 && get_attr_atomic (insn
)))
2819 rtx pat
= PATTERN (insn
);
2820 rtx master
= nvptx_get_unisimt_master ();
2821 for (int i
= 0; i
< XVECLEN (pat
, 0); i
++)
2822 nvptx_unisimt_handle_set (XVECEXP (pat
, 0, i
), insn
, master
);
2823 rtx pred
= nvptx_get_unisimt_predicate ();
2824 pred
= gen_rtx_NE (BImode
, pred
, const0_rtx
);
2825 pat
= gen_rtx_COND_EXEC (VOIDmode
, pred
, pat
);
2826 validate_change (insn
, &PATTERN (insn
), pat
, false);
2830 /* Loop structure of the function. The entire function is described as
2835 /* Parent parallel. */
2838 /* Next sibling parallel. */
2841 /* First child parallel. */
2844 /* Partitioning mask of the parallel. */
2847 /* Partitioning used within inner parallels. */
2848 unsigned inner_mask
;
2850 /* Location of parallel forked and join. The forked is the first
2851 block in the parallel and the join is the first block after of
2853 basic_block forked_block
;
2854 basic_block join_block
;
2856 rtx_insn
*forked_insn
;
2857 rtx_insn
*join_insn
;
2859 rtx_insn
*fork_insn
;
2860 rtx_insn
*joining_insn
;
2862 /* Basic blocks in this parallel, but not in child parallels. The
2863 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2865 auto_vec
<basic_block
> blocks
;
2868 parallel (parallel
*parent
, unsigned mode
);
2872 /* Constructor links the new parallel into it's parent's chain of
2875 parallel::parallel (parallel
*parent_
, unsigned mask_
)
2876 :parent (parent_
), next (0), inner (0), mask (mask_
), inner_mask (0)
2878 forked_block
= join_block
= 0;
2879 forked_insn
= join_insn
= 0;
2880 fork_insn
= joining_insn
= 0;
2884 next
= parent
->inner
;
2885 parent
->inner
= this;
2889 parallel::~parallel ()
2895 /* Map of basic blocks to insns */
2896 typedef hash_map
<basic_block
, rtx_insn
*> bb_insn_map_t
;
2898 /* A tuple of an insn of interest and the BB in which it resides. */
2899 typedef std::pair
<rtx_insn
*, basic_block
> insn_bb_t
;
2900 typedef auto_vec
<insn_bb_t
> insn_bb_vec_t
;
2902 /* Split basic blocks such that each forked and join unspecs are at
2903 the start of their basic blocks. Thus afterwards each block will
2904 have a single partitioning mode. We also do the same for return
2905 insns, as they are executed by every thread. Return the
2906 partitioning mode of the function as a whole. Populate MAP with
2907 head and tail blocks. We also clear the BB visited flag, which is
2908 used when finding partitions. */
2911 nvptx_split_blocks (bb_insn_map_t
*map
)
2913 insn_bb_vec_t worklist
;
2917 /* Locate all the reorg instructions of interest. */
2918 FOR_ALL_BB_FN (block
, cfun
)
2920 bool seen_insn
= false;
2922 /* Clear visited flag, for use by parallel locator */
2923 block
->flags
&= ~BB_VISITED
;
2925 FOR_BB_INSNS (block
, insn
)
2929 switch (recog_memoized (insn
))
2934 case CODE_FOR_nvptx_forked
:
2935 case CODE_FOR_nvptx_join
:
2938 case CODE_FOR_return
:
2939 /* We also need to split just before return insns, as
2940 that insn needs executing by all threads, but the
2941 block it is in probably does not. */
2946 /* We've found an instruction that must be at the start of
2947 a block, but isn't. Add it to the worklist. */
2948 worklist
.safe_push (insn_bb_t (insn
, block
));
2950 /* It was already the first instruction. Just add it to
2952 map
->get_or_insert (block
) = insn
;
2957 /* Split blocks on the worklist. */
2960 basic_block remap
= 0;
2961 for (ix
= 0; worklist
.iterate (ix
, &elt
); ix
++)
2963 if (remap
!= elt
->second
)
2965 block
= elt
->second
;
2969 /* Split block before insn. The insn is in the new block */
2970 edge e
= split_block (block
, PREV_INSN (elt
->first
));
2973 map
->get_or_insert (block
) = elt
->first
;
2977 /* BLOCK is a basic block containing a head or tail instruction.
2978 Locate the associated prehead or pretail instruction, which must be
2979 in the single predecessor block. */
2982 nvptx_discover_pre (basic_block block
, int expected
)
2984 gcc_assert (block
->preds
->length () == 1);
2985 basic_block pre_block
= (*block
->preds
)[0]->src
;
2988 for (pre_insn
= BB_END (pre_block
); !INSN_P (pre_insn
);
2989 pre_insn
= PREV_INSN (pre_insn
))
2990 gcc_assert (pre_insn
!= BB_HEAD (pre_block
));
2992 gcc_assert (recog_memoized (pre_insn
) == expected
);
2996 /* Dump this parallel and all its inner parallels. */
2999 nvptx_dump_pars (parallel
*par
, unsigned depth
)
3001 fprintf (dump_file
, "%u: mask %d head=%d, tail=%d\n",
3003 par
->forked_block
? par
->forked_block
->index
: -1,
3004 par
->join_block
? par
->join_block
->index
: -1);
3006 fprintf (dump_file
, " blocks:");
3009 for (unsigned ix
= 0; par
->blocks
.iterate (ix
, &block
); ix
++)
3010 fprintf (dump_file
, " %d", block
->index
);
3011 fprintf (dump_file
, "\n");
3013 nvptx_dump_pars (par
->inner
, depth
+ 1);
3016 nvptx_dump_pars (par
->next
, depth
);
3019 /* If BLOCK contains a fork/join marker, process it to create or
3020 terminate a loop structure. Add this block to the current loop,
3021 and then walk successor blocks. */
3024 nvptx_find_par (bb_insn_map_t
*map
, parallel
*par
, basic_block block
)
3026 if (block
->flags
& BB_VISITED
)
3028 block
->flags
|= BB_VISITED
;
3030 if (rtx_insn
**endp
= map
->get (block
))
3032 rtx_insn
*end
= *endp
;
3034 /* This is a block head or tail, or return instruction. */
3035 switch (recog_memoized (end
))
3037 case CODE_FOR_return
:
3038 /* Return instructions are in their own block, and we
3039 don't need to do anything more. */
3042 case CODE_FOR_nvptx_forked
:
3043 /* Loop head, create a new inner loop and add it into
3044 our parent's child list. */
3046 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
3049 par
= new parallel (par
, mask
);
3050 par
->forked_block
= block
;
3051 par
->forked_insn
= end
;
3052 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
3053 && (mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
)))
3055 = nvptx_discover_pre (block
, CODE_FOR_nvptx_fork
);
3059 case CODE_FOR_nvptx_join
:
3060 /* A loop tail. Finish the current loop and return to
3063 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
3065 gcc_assert (par
->mask
== mask
);
3066 par
->join_block
= block
;
3067 par
->join_insn
= end
;
3068 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
3069 && (mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
)))
3071 = nvptx_discover_pre (block
, CODE_FOR_nvptx_joining
);
3082 /* Add this block onto the current loop's list of blocks. */
3083 par
->blocks
.safe_push (block
);
3085 /* This must be the entry block. Create a NULL parallel. */
3086 par
= new parallel (0, 0);
3088 /* Walk successor blocks. */
3092 FOR_EACH_EDGE (e
, ei
, block
->succs
)
3093 nvptx_find_par (map
, par
, e
->dest
);
3098 /* DFS walk the CFG looking for fork & join markers. Construct
3099 loop structures as we go. MAP is a mapping of basic blocks
3100 to head & tail markers, discovered when splitting blocks. This
3101 speeds up the discovery. We rely on the BB visited flag having
3102 been cleared when splitting blocks. */
3105 nvptx_discover_pars (bb_insn_map_t
*map
)
3109 /* Mark exit blocks as visited. */
3110 block
= EXIT_BLOCK_PTR_FOR_FN (cfun
);
3111 block
->flags
|= BB_VISITED
;
3113 /* And entry block as not. */
3114 block
= ENTRY_BLOCK_PTR_FOR_FN (cfun
);
3115 block
->flags
&= ~BB_VISITED
;
3117 parallel
*par
= nvptx_find_par (map
, 0, block
);
3121 fprintf (dump_file
, "\nLoops\n");
3122 nvptx_dump_pars (par
, 0);
3123 fprintf (dump_file
, "\n");
3129 /* Analyse a group of BBs within a partitioned region and create N
3130 Single-Entry-Single-Exit regions. Some of those regions will be
3131 trivial ones consisting of a single BB. The blocks of a
3132 partitioned region might form a set of disjoint graphs -- because
3133 the region encloses a differently partitioned sub region.
3135 We use the linear time algorithm described in 'Finding Regions Fast:
3136 Single Entry Single Exit and control Regions in Linear Time'
3137 Johnson, Pearson & Pingali. That algorithm deals with complete
3138 CFGs, where a back edge is inserted from END to START, and thus the
3139 problem becomes one of finding equivalent loops.
3141 In this case we have a partial CFG. We complete it by redirecting
3142 any incoming edge to the graph to be from an arbitrary external BB,
3143 and similarly redirecting any outgoing edge to be to that BB.
3144 Thus we end up with a closed graph.
3146 The algorithm works by building a spanning tree of an undirected
3147 graph and keeping track of back edges from nodes further from the
3148 root in the tree to nodes nearer to the root in the tree. In the
3149 description below, the root is up and the tree grows downwards.
3151 We avoid having to deal with degenerate back-edges to the same
3152 block, by splitting each BB into 3 -- one for input edges, one for
3153 the node itself and one for the output edges. Such back edges are
3154 referred to as 'Brackets'. Cycle equivalent nodes will have the
3155 same set of brackets.
3157 Determining bracket equivalency is done by maintaining a list of
3158 brackets in such a manner that the list length and final bracket
3159 uniquely identify the set.
3161 We use coloring to mark all BBs with cycle equivalency with the
3162 same color. This is the output of the 'Finding Regions Fast'
3163 algorithm. Notice it doesn't actually find the set of nodes within
3164 a particular region, just unordered sets of nodes that are the
3165 entries and exits of SESE regions.
3167 After determining cycle equivalency, we need to find the minimal
3168 set of SESE regions. Do this with a DFS coloring walk of the
3169 complete graph. We're either 'looking' or 'coloring'. When
3170 looking, and we're in the subgraph, we start coloring the color of
3171 the current node, and remember that node as the start of the
3172 current color's SESE region. Every time we go to a new node, we
3173 decrement the count of nodes with that color. If it reaches zero,
3174 we remember that node as the end of the current color's SESE region
3175 and return to 'looking'. Otherwise we color the node the current
3178 This way we end up with coloring the inside of non-trivial SESE
3179 regions with the color of that region. */
3181 /* A pair of BBs. We use this to represent SESE regions. */
3182 typedef std::pair
<basic_block
, basic_block
> bb_pair_t
;
3183 typedef auto_vec
<bb_pair_t
> bb_pair_vec_t
;
3185 /* A node in the undirected CFG. The discriminator SECOND indicates just
3186 above or just below the BB idicated by FIRST. */
3187 typedef std::pair
<basic_block
, int> pseudo_node_t
;
3189 /* A bracket indicates an edge towards the root of the spanning tree of the
3190 undirected graph. Each bracket has a color, determined
3191 from the currrent set of brackets. */
3194 pseudo_node_t back
; /* Back target */
3196 /* Current color and size of set. */
3200 bracket (pseudo_node_t back_
)
3201 : back (back_
), color (~0u), size (~0u)
3205 unsigned get_color (auto_vec
<unsigned> &color_counts
, unsigned length
)
3210 color
= color_counts
.length ();
3211 color_counts
.quick_push (0);
3213 color_counts
[color
]++;
3218 typedef auto_vec
<bracket
> bracket_vec_t
;
3220 /* Basic block info for finding SESE regions. */
3224 int node
; /* Node number in spanning tree. */
3225 int parent
; /* Parent node number. */
3227 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3228 edges arrive at pseudo-node Ai and the outgoing edges leave at
3229 pseudo-node Ao. We have to remember which way we arrived at a
3230 particular node when generating the spanning tree. dir > 0 means
3231 we arrived at Ai, dir < 0 means we arrived at Ao. */
3234 /* Lowest numbered pseudo-node reached via a backedge from thsis
3235 node, or any descendant. */
3238 int color
; /* Cycle-equivalence color */
3240 /* Stack of brackets for this node. */
3241 bracket_vec_t brackets
;
3243 bb_sese (unsigned node_
, unsigned p
, int dir_
)
3244 :node (node_
), parent (p
), dir (dir_
)
3249 /* Push a bracket ending at BACK. */
3250 void push (const pseudo_node_t
&back
)
3253 fprintf (dump_file
, "Pushing backedge %d:%+d\n",
3254 back
.first
? back
.first
->index
: 0, back
.second
);
3255 brackets
.safe_push (bracket (back
));
3258 void append (bb_sese
*child
);
3259 void remove (const pseudo_node_t
&);
3261 /* Set node's color. */
3262 void set_color (auto_vec
<unsigned> &color_counts
)
3264 color
= brackets
.last ().get_color (color_counts
, brackets
.length ());
3268 bb_sese::~bb_sese ()
3272 /* Destructively append CHILD's brackets. */
3275 bb_sese::append (bb_sese
*child
)
3277 if (int len
= child
->brackets
.length ())
3283 for (ix
= 0; ix
< len
; ix
++)
3285 const pseudo_node_t
&pseudo
= child
->brackets
[ix
].back
;
3286 fprintf (dump_file
, "Appending (%d)'s backedge %d:%+d\n",
3287 child
->node
, pseudo
.first
? pseudo
.first
->index
: 0,
3291 if (!brackets
.length ())
3292 std::swap (brackets
, child
->brackets
);
3295 brackets
.reserve (len
);
3296 for (ix
= 0; ix
< len
; ix
++)
3297 brackets
.quick_push (child
->brackets
[ix
]);
3302 /* Remove brackets that terminate at PSEUDO. */
3305 bb_sese::remove (const pseudo_node_t
&pseudo
)
3307 unsigned removed
= 0;
3308 int len
= brackets
.length ();
3310 for (int ix
= 0; ix
< len
; ix
++)
3312 if (brackets
[ix
].back
== pseudo
)
3315 fprintf (dump_file
, "Removing backedge %d:%+d\n",
3316 pseudo
.first
? pseudo
.first
->index
: 0, pseudo
.second
);
3320 brackets
[ix
-removed
] = brackets
[ix
];
3326 /* Accessors for BB's aux pointer. */
3327 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3328 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3330 /* DFS walk creating SESE data structures. Only cover nodes with
3331 BB_VISITED set. Append discovered blocks to LIST. We number in
3332 increments of 3 so that the above and below pseudo nodes can be
3333 implicitly numbered too. */
3336 nvptx_sese_number (int n
, int p
, int dir
, basic_block b
,
3337 auto_vec
<basic_block
> *list
)
3339 if (BB_GET_SESE (b
))
3343 fprintf (dump_file
, "Block %d(%d), parent (%d), orientation %+d\n",
3344 b
->index
, n
, p
, dir
);
3346 BB_SET_SESE (b
, new bb_sese (n
, p
, dir
));
3350 list
->quick_push (b
);
3352 /* First walk the nodes on the 'other side' of this node, then walk
3353 the nodes on the same side. */
3354 for (unsigned ix
= 2; ix
; ix
--)
3356 vec
<edge
, va_gc
> *edges
= dir
> 0 ? b
->succs
: b
->preds
;
3357 size_t offset
= (dir
> 0 ? offsetof (edge_def
, dest
)
3358 : offsetof (edge_def
, src
));
3362 FOR_EACH_EDGE (e
, ei
, edges
)
3364 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3366 if (target
->flags
& BB_VISITED
)
3367 n
= nvptx_sese_number (n
, p
, dir
, target
, list
);
3374 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3375 EDGES are the outgoing edges and OFFSET is the offset to the src
3376 or dst block on the edges. */
3379 nvptx_sese_pseudo (basic_block me
, bb_sese
*sese
, int depth
, int dir
,
3380 vec
<edge
, va_gc
> *edges
, size_t offset
)
3384 int hi_back
= depth
;
3385 pseudo_node_t
node_back (0, depth
);
3386 int hi_child
= depth
;
3387 pseudo_node_t
node_child (0, depth
);
3388 basic_block child
= NULL
;
3389 unsigned num_children
= 0;
3390 int usd
= -dir
* sese
->dir
;
3393 fprintf (dump_file
, "\nProcessing %d(%d) %+d\n",
3394 me
->index
, sese
->node
, dir
);
3398 /* This is the above pseudo-child. It has the BB itself as an
3399 additional child node. */
3400 node_child
= sese
->high
;
3401 hi_child
= node_child
.second
;
3402 if (node_child
.first
)
3403 hi_child
+= BB_GET_SESE (node_child
.first
)->node
;
3407 /* Examine each edge.
3408 - if it is a child (a) append its bracket list and (b) record
3409 whether it is the child with the highest reaching bracket.
3410 - if it is an edge to ancestor, record whether it's the highest
3411 reaching backlink. */
3412 FOR_EACH_EDGE (e
, ei
, edges
)
3414 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3416 if (bb_sese
*t_sese
= BB_GET_SESE (target
))
3418 if (t_sese
->parent
== sese
->node
&& !(t_sese
->dir
+ usd
))
3420 /* Child node. Append its bracket list. */
3422 sese
->append (t_sese
);
3424 /* Compare it's hi value. */
3425 int t_hi
= t_sese
->high
.second
;
3427 if (basic_block child_hi_block
= t_sese
->high
.first
)
3428 t_hi
+= BB_GET_SESE (child_hi_block
)->node
;
3430 if (hi_child
> t_hi
)
3433 node_child
= t_sese
->high
;
3437 else if (t_sese
->node
< sese
->node
+ dir
3438 && !(dir
< 0 && sese
->parent
== t_sese
->node
))
3440 /* Non-parental ancestor node -- a backlink. */
3441 int d
= usd
* t_sese
->dir
;
3442 int back
= t_sese
->node
+ d
;
3447 node_back
= pseudo_node_t (target
, d
);
3452 { /* Fallen off graph, backlink to entry node. */
3454 node_back
= pseudo_node_t (0, 0);
3458 /* Remove any brackets that terminate at this pseudo node. */
3459 sese
->remove (pseudo_node_t (me
, dir
));
3461 /* Now push any backlinks from this pseudo node. */
3462 FOR_EACH_EDGE (e
, ei
, edges
)
3464 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3465 if (bb_sese
*t_sese
= BB_GET_SESE (target
))
3467 if (t_sese
->node
< sese
->node
+ dir
3468 && !(dir
< 0 && sese
->parent
== t_sese
->node
))
3469 /* Non-parental ancestor node - backedge from me. */
3470 sese
->push (pseudo_node_t (target
, usd
* t_sese
->dir
));
3474 /* back edge to entry node */
3475 sese
->push (pseudo_node_t (0, 0));
3479 /* If this node leads directly or indirectly to a no-return region of
3480 the graph, then fake a backedge to entry node. */
3481 if (!sese
->brackets
.length () || !edges
|| !edges
->length ())
3484 node_back
= pseudo_node_t (0, 0);
3485 sese
->push (node_back
);
3488 /* Record the highest reaching backedge from us or a descendant. */
3489 sese
->high
= hi_back
< hi_child
? node_back
: node_child
;
3491 if (num_children
> 1)
3493 /* There is more than one child -- this is a Y shaped piece of
3494 spanning tree. We have to insert a fake backedge from this
3495 node to the highest ancestor reached by not-the-highest
3496 reaching child. Note that there may be multiple children
3497 with backedges to the same highest node. That's ok and we
3498 insert the edge to that highest node. */
3500 if (dir
< 0 && child
)
3502 node_child
= sese
->high
;
3503 hi_child
= node_child
.second
;
3504 if (node_child
.first
)
3505 hi_child
+= BB_GET_SESE (node_child
.first
)->node
;
3508 FOR_EACH_EDGE (e
, ei
, edges
)
3510 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3512 if (target
== child
)
3513 /* Ignore the highest child. */
3516 bb_sese
*t_sese
= BB_GET_SESE (target
);
3519 if (t_sese
->parent
!= sese
->node
)
3523 /* Compare its hi value. */
3524 int t_hi
= t_sese
->high
.second
;
3526 if (basic_block child_hi_block
= t_sese
->high
.first
)
3527 t_hi
+= BB_GET_SESE (child_hi_block
)->node
;
3529 if (hi_child
> t_hi
)
3532 node_child
= t_sese
->high
;
3536 sese
->push (node_child
);
3541 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3542 proceed to successors. Set SESE entry and exit nodes of
3546 nvptx_sese_color (auto_vec
<unsigned> &color_counts
, bb_pair_vec_t
®ions
,
3547 basic_block block
, int coloring
)
3549 bb_sese
*sese
= BB_GET_SESE (block
);
3551 if (block
->flags
& BB_VISITED
)
3553 /* If we've already encountered this block, either we must not
3554 be coloring, or it must have been colored the current color. */
3555 gcc_assert (coloring
< 0 || (sese
&& coloring
== sese
->color
));
3559 block
->flags
|= BB_VISITED
;
3565 /* Start coloring a region. */
3566 regions
[sese
->color
].first
= block
;
3567 coloring
= sese
->color
;
3570 if (!--color_counts
[sese
->color
] && sese
->color
== coloring
)
3572 /* Found final block of SESE region. */
3573 regions
[sese
->color
].second
= block
;
3577 /* Color the node, so we can assert on revisiting the node
3578 that the graph is indeed SESE. */
3579 sese
->color
= coloring
;
3582 /* Fallen off the subgraph, we cannot be coloring. */
3583 gcc_assert (coloring
< 0);
3585 /* Walk each successor block. */
3586 if (block
->succs
&& block
->succs
->length ())
3591 FOR_EACH_EDGE (e
, ei
, block
->succs
)
3592 nvptx_sese_color (color_counts
, regions
, e
->dest
, coloring
);
3595 gcc_assert (coloring
< 0);
3598 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3599 end up with NULL entries in it. */
3602 nvptx_find_sese (auto_vec
<basic_block
> &blocks
, bb_pair_vec_t
®ions
)
3607 /* First clear each BB of the whole function. */
3608 FOR_ALL_BB_FN (block
, cfun
)
3610 block
->flags
&= ~BB_VISITED
;
3611 BB_SET_SESE (block
, 0);
3614 /* Mark blocks in the function that are in this graph. */
3615 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3616 block
->flags
|= BB_VISITED
;
3618 /* Counts of nodes assigned to each color. There cannot be more
3619 colors than blocks (and hopefully there will be fewer). */
3620 auto_vec
<unsigned> color_counts
;
3621 color_counts
.reserve (blocks
.length ());
3623 /* Worklist of nodes in the spanning tree. Again, there cannot be
3624 more nodes in the tree than blocks (there will be fewer if the
3625 CFG of blocks is disjoint). */
3626 auto_vec
<basic_block
> spanlist
;
3627 spanlist
.reserve (blocks
.length ());
3629 /* Make sure every block has its cycle class determined. */
3630 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3632 if (BB_GET_SESE (block
))
3633 /* We already met this block in an earlier graph solve. */
3637 fprintf (dump_file
, "Searching graph starting at %d\n", block
->index
);
3639 /* Number the nodes reachable from block initial DFS order. */
3640 int depth
= nvptx_sese_number (2, 0, +1, block
, &spanlist
);
3642 /* Now walk in reverse DFS order to find cycle equivalents. */
3643 while (spanlist
.length ())
3645 block
= spanlist
.pop ();
3646 bb_sese
*sese
= BB_GET_SESE (block
);
3648 /* Do the pseudo node below. */
3649 nvptx_sese_pseudo (block
, sese
, depth
, +1,
3650 sese
->dir
> 0 ? block
->succs
: block
->preds
,
3651 (sese
->dir
> 0 ? offsetof (edge_def
, dest
)
3652 : offsetof (edge_def
, src
)));
3653 sese
->set_color (color_counts
);
3654 /* Do the pseudo node above. */
3655 nvptx_sese_pseudo (block
, sese
, depth
, -1,
3656 sese
->dir
< 0 ? block
->succs
: block
->preds
,
3657 (sese
->dir
< 0 ? offsetof (edge_def
, dest
)
3658 : offsetof (edge_def
, src
)));
3661 fprintf (dump_file
, "\n");
3667 const char *comma
= "";
3669 fprintf (dump_file
, "Found %d cycle equivalents\n",
3670 color_counts
.length ());
3671 for (ix
= 0; color_counts
.iterate (ix
, &count
); ix
++)
3673 fprintf (dump_file
, "%s%d[%d]={", comma
, ix
, count
);
3676 for (unsigned jx
= 0; blocks
.iterate (jx
, &block
); jx
++)
3677 if (BB_GET_SESE (block
)->color
== ix
)
3679 block
->flags
|= BB_VISITED
;
3680 fprintf (dump_file
, "%s%d", comma
, block
->index
);
3683 fprintf (dump_file
, "}");
3686 fprintf (dump_file
, "\n");
3689 /* Now we've colored every block in the subgraph. We now need to
3690 determine the minimal set of SESE regions that cover that
3691 subgraph. Do this with a DFS walk of the complete function.
3692 During the walk we're either 'looking' or 'coloring'. When we
3693 reach the last node of a particular color, we stop coloring and
3694 return to looking. */
3696 /* There cannot be more SESE regions than colors. */
3697 regions
.reserve (color_counts
.length ());
3698 for (ix
= color_counts
.length (); ix
--;)
3699 regions
.quick_push (bb_pair_t (0, 0));
3701 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3702 block
->flags
&= ~BB_VISITED
;
3704 nvptx_sese_color (color_counts
, regions
, ENTRY_BLOCK_PTR_FOR_FN (cfun
), -1);
3708 const char *comma
= "";
3709 int len
= regions
.length ();
3711 fprintf (dump_file
, "SESE regions:");
3712 for (ix
= 0; ix
!= len
; ix
++)
3714 basic_block from
= regions
[ix
].first
;
3715 basic_block to
= regions
[ix
].second
;
3719 fprintf (dump_file
, "%s %d{%d", comma
, ix
, from
->index
);
3721 fprintf (dump_file
, "->%d", to
->index
);
3723 int color
= BB_GET_SESE (from
)->color
;
3725 /* Print the blocks within the region (excluding ends). */
3726 FOR_EACH_BB_FN (block
, cfun
)
3728 bb_sese
*sese
= BB_GET_SESE (block
);
3730 if (sese
&& sese
->color
== color
3731 && block
!= from
&& block
!= to
)
3732 fprintf (dump_file
, ".%d", block
->index
);
3734 fprintf (dump_file
, "}");
3738 fprintf (dump_file
, "\n\n");
3741 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3742 delete BB_GET_SESE (block
);
3748 /* Propagate live state at the start of a partitioned region. BLOCK
3749 provides the live register information, and might not contain
3750 INSN. Propagation is inserted just after INSN. RW indicates whether
3751 we are reading and/or writing state. This
3752 separation is needed for worker-level proppagation where we
3753 essentially do a spill & fill. FN is the underlying worker
3754 function to generate the propagation instructions for single
3755 register. DATA is user data.
3757 We propagate the live register set and the entire frame. We could
3758 do better by (a) propagating just the live set that is used within
3759 the partitioned regions and (b) only propagating stack entries that
3760 are used. The latter might be quite hard to determine. */
3762 typedef rtx (*propagator_fn
) (rtx
, propagate_mask
, unsigned, void *);
3765 nvptx_propagate (basic_block block
, rtx_insn
*insn
, propagate_mask rw
,
3766 propagator_fn fn
, void *data
)
3768 bitmap live
= DF_LIVE_IN (block
);
3769 bitmap_iterator iterator
;
3772 /* Copy the frame array. */
3773 HOST_WIDE_INT fs
= get_frame_size ();
3776 rtx tmp
= gen_reg_rtx (DImode
);
3778 rtx ptr
= gen_reg_rtx (Pmode
);
3779 rtx pred
= NULL_RTX
;
3780 rtx_code_label
*label
= NULL
;
3782 /* The frame size might not be DImode compatible, but the frame
3783 array's declaration will be. So it's ok to round up here. */
3784 fs
= (fs
+ GET_MODE_SIZE (DImode
) - 1) / GET_MODE_SIZE (DImode
);
3785 /* Detect single iteration loop. */
3790 emit_insn (gen_rtx_SET (ptr
, frame_pointer_rtx
));
3793 idx
= gen_reg_rtx (SImode
);
3794 pred
= gen_reg_rtx (BImode
);
3795 label
= gen_label_rtx ();
3797 emit_insn (gen_rtx_SET (idx
, GEN_INT (fs
)));
3798 /* Allow worker function to initialize anything needed. */
3799 rtx init
= fn (tmp
, PM_loop_begin
, fs
, data
);
3803 LABEL_NUSES (label
)++;
3804 emit_insn (gen_addsi3 (idx
, idx
, GEN_INT (-1)));
3807 emit_insn (gen_rtx_SET (tmp
, gen_rtx_MEM (DImode
, ptr
)));
3808 emit_insn (fn (tmp
, rw
, fs
, data
));
3810 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode
, ptr
), tmp
));
3813 emit_insn (gen_rtx_SET (pred
, gen_rtx_NE (BImode
, idx
, const0_rtx
)));
3814 emit_insn (gen_adddi3 (ptr
, ptr
, GEN_INT (GET_MODE_SIZE (DImode
))));
3815 emit_insn (gen_br_true_uni (pred
, label
));
3816 rtx fini
= fn (tmp
, PM_loop_end
, fs
, data
);
3819 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx
), idx
));
3821 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp
), tmp
));
3822 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr
), ptr
));
3823 rtx cpy
= get_insns ();
3825 insn
= emit_insn_after (cpy
, insn
);
3828 /* Copy live registers. */
3829 EXECUTE_IF_SET_IN_BITMAP (live
, 0, ix
, iterator
)
3831 rtx reg
= regno_reg_rtx
[ix
];
3833 if (REGNO (reg
) >= FIRST_PSEUDO_REGISTER
)
3835 rtx bcast
= fn (reg
, rw
, 0, data
);
3837 insn
= emit_insn_after (bcast
, insn
);
3842 /* Worker for nvptx_vpropagate. */
3845 vprop_gen (rtx reg
, propagate_mask pm
,
3846 unsigned ARG_UNUSED (count
), void *ARG_UNUSED (data
))
3848 if (!(pm
& PM_read_write
))
3851 return nvptx_gen_vcast (reg
);
3854 /* Propagate state that is live at start of BLOCK across the vectors
3855 of a single warp. Propagation is inserted just after INSN. */
3858 nvptx_vpropagate (basic_block block
, rtx_insn
*insn
)
3860 nvptx_propagate (block
, insn
, PM_read_write
, vprop_gen
, 0);
3863 /* Worker for nvptx_wpropagate. */
3866 wprop_gen (rtx reg
, propagate_mask pm
, unsigned rep
, void *data_
)
3868 wcast_data_t
*data
= (wcast_data_t
*)data_
;
3870 if (pm
& PM_loop_begin
)
3872 /* Starting a loop, initialize pointer. */
3873 unsigned align
= GET_MODE_ALIGNMENT (GET_MODE (reg
)) / BITS_PER_UNIT
;
3875 if (align
> worker_bcast_align
)
3876 worker_bcast_align
= align
;
3877 data
->offset
= (data
->offset
+ align
- 1) & ~(align
- 1);
3879 data
->ptr
= gen_reg_rtx (Pmode
);
3881 return gen_adddi3 (data
->ptr
, data
->base
, GEN_INT (data
->offset
));
3883 else if (pm
& PM_loop_end
)
3885 rtx clobber
= gen_rtx_CLOBBER (GET_MODE (data
->ptr
), data
->ptr
);
3886 data
->ptr
= NULL_RTX
;
3890 return nvptx_gen_wcast (reg
, pm
, rep
, data
);
3893 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3894 indicates if this is just before partitioned mode (do spill), or
3895 just after it starts (do fill). Sequence is inserted just after
3899 nvptx_wpropagate (bool pre_p
, basic_block block
, rtx_insn
*insn
)
3903 data
.base
= gen_reg_rtx (Pmode
);
3905 data
.ptr
= NULL_RTX
;
3907 nvptx_propagate (block
, insn
, pre_p
? PM_read
: PM_write
, wprop_gen
, &data
);
3910 /* Stuff was emitted, initialize the base pointer now. */
3911 rtx init
= gen_rtx_SET (data
.base
, worker_bcast_sym
);
3912 emit_insn_after (init
, insn
);
3914 if (worker_bcast_size
< data
.offset
)
3915 worker_bcast_size
= data
.offset
;
3919 /* Emit a worker-level synchronization barrier. We use different
3920 markers for before and after synchronizations. */
3923 nvptx_wsync (bool after
)
3925 return gen_nvptx_barsync (GEN_INT (after
));
3928 #if WORKAROUND_PTXJIT_BUG
3929 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3933 bb_first_real_insn (basic_block bb
)
3937 /* Find first insn of from block. */
3938 FOR_BB_INSNS (bb
, insn
)
3946 /* Single neutering according to MASK. FROM is the incoming block and
3947 TO is the outgoing block. These may be the same block. Insert at
3950 if (tid.<axis>) goto end.
3952 and insert before ending branch of TO (if there is such an insn):
3955 <possibly-broadcast-cond>
3958 We currently only use differnt FROM and TO when skipping an entire
3959 loop. We could do more if we detected superblocks. */
3962 nvptx_single (unsigned mask
, basic_block from
, basic_block to
)
3964 rtx_insn
*head
= BB_HEAD (from
);
3965 rtx_insn
*tail
= BB_END (to
);
3966 unsigned skip_mask
= mask
;
3970 /* Find first insn of from block. */
3971 while (head
!= BB_END (from
) && !INSN_P (head
))
3972 head
= NEXT_INSN (head
);
3977 if (!(JUMP_P (head
) && single_succ_p (from
)))
3980 basic_block jump_target
= single_succ (from
);
3981 if (!single_pred_p (jump_target
))
3985 head
= BB_HEAD (from
);
3988 /* Find last insn of to block */
3989 rtx_insn
*limit
= from
== to
? head
: BB_HEAD (to
);
3990 while (tail
!= limit
&& !INSN_P (tail
) && !LABEL_P (tail
))
3991 tail
= PREV_INSN (tail
);
3993 /* Detect if tail is a branch. */
3994 rtx tail_branch
= NULL_RTX
;
3995 rtx cond_branch
= NULL_RTX
;
3996 if (tail
&& INSN_P (tail
))
3998 tail_branch
= PATTERN (tail
);
3999 if (GET_CODE (tail_branch
) != SET
|| SET_DEST (tail_branch
) != pc_rtx
)
4000 tail_branch
= NULL_RTX
;
4003 cond_branch
= SET_SRC (tail_branch
);
4004 if (GET_CODE (cond_branch
) != IF_THEN_ELSE
)
4005 cond_branch
= NULL_RTX
;
4011 /* If this is empty, do nothing. */
4012 if (!head
|| !INSN_P (head
))
4015 /* If this is a dummy insn, do nothing. */
4016 switch (recog_memoized (head
))
4020 case CODE_FOR_nvptx_fork
:
4021 case CODE_FOR_nvptx_forked
:
4022 case CODE_FOR_nvptx_joining
:
4023 case CODE_FOR_nvptx_join
:
4029 /* If we're only doing vector single, there's no need to
4030 emit skip code because we'll not insert anything. */
4031 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)))
4034 else if (tail_branch
)
4035 /* Block with only unconditional branch. Nothing to do. */
4039 /* Insert the vector test inside the worker test. */
4041 rtx_insn
*before
= tail
;
4042 for (mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4043 if (GOMP_DIM_MASK (mode
) & skip_mask
)
4045 rtx_code_label
*label
= gen_label_rtx ();
4046 rtx pred
= cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
];
4050 pred
= gen_reg_rtx (BImode
);
4051 cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
] = pred
;
4055 if (mode
== GOMP_DIM_VECTOR
)
4056 br
= gen_br_true (pred
, label
);
4058 br
= gen_br_true_uni (pred
, label
);
4059 emit_insn_before (br
, head
);
4061 LABEL_NUSES (label
)++;
4063 before
= emit_label_before (label
, before
);
4065 emit_label_after (label
, tail
);
4068 /* Now deal with propagating the branch condition. */
4071 rtx pvar
= XEXP (XEXP (cond_branch
, 0), 0);
4073 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
)
4075 /* Vector mode only, do a shuffle. */
4076 #if WORKAROUND_PTXJIT_BUG
4077 /* The branch condition %rcond is propagated like this:
4082 setp.ne.u32 %rnotvzero,%x,0;
4085 @%rnotvzero bra Lskip;
4086 setp.<op>.<type> %rcond,op1,op2;
4088 selp.u32 %rcondu32,1,0,%rcond;
4089 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4090 setp.ne.u32 %rcond,%rcondu32,0;
4092 There seems to be a bug in the ptx JIT compiler (observed at driver
4093 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4094 unless %rcond is initialized to something before 'bra Lskip'. The
4095 bug is not observed with ptxas from cuda 8.0.61.
4097 It is true that the code is non-trivial: at Lskip, %rcond is
4098 uninitialized in threads 1-31, and after the selp the same holds
4099 for %rcondu32. But shfl propagates the defined value in thread 0
4100 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4101 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4103 There is nothing in the PTX spec to suggest that this is wrong, or
4104 to explain why the extra initialization is needed. So, we classify
4105 it as a JIT bug, and the extra initialization as workaround. */
4106 emit_insn_before (gen_movbi (pvar
, const0_rtx
),
4107 bb_first_real_insn (from
));
4109 emit_insn_before (nvptx_gen_vcast (pvar
), tail
);
4113 /* Includes worker mode, do spill & fill. By construction
4114 we should never have worker mode only. */
4117 data
.base
= worker_bcast_sym
;
4120 if (worker_bcast_size
< GET_MODE_SIZE (SImode
))
4121 worker_bcast_size
= GET_MODE_SIZE (SImode
);
4124 emit_insn_before (nvptx_gen_wcast (pvar
, PM_read
, 0, &data
),
4126 /* Barrier so other workers can see the write. */
4127 emit_insn_before (nvptx_wsync (false), tail
);
4129 emit_insn_before (nvptx_gen_wcast (pvar
, PM_write
, 0, &data
), tail
);
4130 /* This barrier is needed to avoid worker zero clobbering
4131 the broadcast buffer before all the other workers have
4132 had a chance to read this instance of it. */
4133 emit_insn_before (nvptx_wsync (true), tail
);
4136 extract_insn (tail
);
4137 rtx unsp
= gen_rtx_UNSPEC (BImode
, gen_rtvec (1, pvar
),
4139 validate_change (tail
, recog_data
.operand_loc
[0], unsp
, false);
4143 /* PAR is a parallel that is being skipped in its entirety according to
4144 MASK. Treat this as skipping a superblock starting at forked
4145 and ending at joining. */
4148 nvptx_skip_par (unsigned mask
, parallel
*par
)
4150 basic_block tail
= par
->join_block
;
4151 gcc_assert (tail
->preds
->length () == 1);
4153 basic_block pre_tail
= (*tail
->preds
)[0]->src
;
4154 gcc_assert (pre_tail
->succs
->length () == 1);
4156 nvptx_single (mask
, par
->forked_block
, pre_tail
);
4159 /* If PAR has a single inner parallel and PAR itself only contains
4160 empty entry and exit blocks, swallow the inner PAR. */
4163 nvptx_optimize_inner (parallel
*par
)
4165 parallel
*inner
= par
->inner
;
4167 /* We mustn't be the outer dummy par. */
4171 /* We must have a single inner par. */
4172 if (!inner
|| inner
->next
)
4175 /* We must only contain 2 blocks ourselves -- the head and tail of
4177 if (par
->blocks
.length () != 2)
4180 /* We must be disjoint partitioning. As we only have vector and
4181 worker partitioning, this is sufficient to guarantee the pars
4182 have adjacent partitioning. */
4183 if ((par
->mask
& inner
->mask
) & (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1))
4184 /* This indicates malformed code generation. */
4187 /* The outer forked insn should be immediately followed by the inner
4189 rtx_insn
*forked
= par
->forked_insn
;
4190 rtx_insn
*fork
= BB_END (par
->forked_block
);
4192 if (NEXT_INSN (forked
) != fork
)
4194 gcc_checking_assert (recog_memoized (fork
) == CODE_FOR_nvptx_fork
);
4196 /* The outer joining insn must immediately follow the inner join
4198 rtx_insn
*joining
= par
->joining_insn
;
4199 rtx_insn
*join
= inner
->join_insn
;
4200 if (NEXT_INSN (join
) != joining
)
4203 /* Preconditions met. Swallow the inner par. */
4205 fprintf (dump_file
, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4206 inner
->mask
, inner
->forked_block
->index
,
4207 inner
->join_block
->index
,
4208 par
->mask
, par
->forked_block
->index
, par
->join_block
->index
);
4210 par
->mask
|= inner
->mask
& (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1);
4212 par
->blocks
.reserve (inner
->blocks
.length ());
4213 while (inner
->blocks
.length ())
4214 par
->blocks
.quick_push (inner
->blocks
.pop ());
4216 par
->inner
= inner
->inner
;
4217 inner
->inner
= NULL
;
4222 /* Process the parallel PAR and all its contained
4223 parallels. We do everything but the neutering. Return mask of
4224 partitioned modes used within this parallel. */
4227 nvptx_process_pars (parallel
*par
)
4230 nvptx_optimize_inner (par
);
4232 unsigned inner_mask
= par
->mask
;
4234 /* Do the inner parallels first. */
4237 par
->inner_mask
= nvptx_process_pars (par
->inner
);
4238 inner_mask
|= par
->inner_mask
;
4241 if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
4242 /* No propagation needed for a call. */;
4243 else if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
4245 nvptx_wpropagate (false, par
->forked_block
, par
->forked_insn
);
4246 nvptx_wpropagate (true, par
->forked_block
, par
->fork_insn
);
4247 /* Insert begin and end synchronizations. */
4248 emit_insn_after (nvptx_wsync (false), par
->forked_insn
);
4249 emit_insn_before (nvptx_wsync (true), par
->joining_insn
);
4251 else if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
4252 nvptx_vpropagate (par
->forked_block
, par
->forked_insn
);
4254 /* Now do siblings. */
4256 inner_mask
|= nvptx_process_pars (par
->next
);
4260 /* Neuter the parallel described by PAR. We recurse in depth-first
4261 order. MODES are the partitioning of the execution and OUTER is
4262 the partitioning of the parallels we are contained in. */
4265 nvptx_neuter_pars (parallel
*par
, unsigned modes
, unsigned outer
)
4267 unsigned me
= (par
->mask
4268 & (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
4269 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
4270 unsigned skip_mask
= 0, neuter_mask
= 0;
4273 nvptx_neuter_pars (par
->inner
, modes
, outer
| me
);
4275 for (unsigned mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4277 if ((outer
| me
) & GOMP_DIM_MASK (mode
))
4278 {} /* Mode is partitioned: no neutering. */
4279 else if (!(modes
& GOMP_DIM_MASK (mode
)))
4280 {} /* Mode is not used: nothing to do. */
4281 else if (par
->inner_mask
& GOMP_DIM_MASK (mode
)
4282 || !par
->forked_insn
)
4283 /* Partitioned in inner parallels, or we're not a partitioned
4284 at all: neuter individual blocks. */
4285 neuter_mask
|= GOMP_DIM_MASK (mode
);
4286 else if (!par
->parent
|| !par
->parent
->forked_insn
4287 || par
->parent
->inner_mask
& GOMP_DIM_MASK (mode
))
4288 /* Parent isn't a parallel or contains this paralleling: skip
4289 parallel at this level. */
4290 skip_mask
|= GOMP_DIM_MASK (mode
);
4292 {} /* Parent will skip this parallel itself. */
4301 /* Neuter whole SESE regions. */
4302 bb_pair_vec_t regions
;
4304 nvptx_find_sese (par
->blocks
, regions
);
4305 len
= regions
.length ();
4306 for (ix
= 0; ix
!= len
; ix
++)
4308 basic_block from
= regions
[ix
].first
;
4309 basic_block to
= regions
[ix
].second
;
4312 nvptx_single (neuter_mask
, from
, to
);
4319 /* Neuter each BB individually. */
4320 len
= par
->blocks
.length ();
4321 for (ix
= 0; ix
!= len
; ix
++)
4323 basic_block block
= par
->blocks
[ix
];
4325 nvptx_single (neuter_mask
, block
, block
);
4331 nvptx_skip_par (skip_mask
, par
);
4334 nvptx_neuter_pars (par
->next
, modes
, outer
);
4337 /* PTX-specific reorganization
4338 - Split blocks at fork and join instructions
4339 - Compute live registers
4340 - Mark now-unused registers, so function begin doesn't declare
4342 - Insert state propagation when entering partitioned mode
4343 - Insert neutering instructions when in single mode
4344 - Replace subregs with suitable sequences.
4350 /* We are freeing block_for_insn in the toplev to keep compatibility
4351 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4352 compute_bb_for_insn ();
4354 thread_prologue_and_epilogue_insns ();
4356 /* Split blocks and record interesting unspecs. */
4357 bb_insn_map_t bb_insn_map
;
4359 nvptx_split_blocks (&bb_insn_map
);
4361 /* Compute live regs */
4362 df_clear_flags (DF_LR_RUN_DCE
);
4363 df_set_flags (DF_NO_INSN_RESCAN
| DF_NO_HARD_REGS
);
4364 df_live_add_problem ();
4365 df_live_set_all_dirty ();
4367 regstat_init_n_sets_and_refs ();
4370 df_dump (dump_file
);
4372 /* Mark unused regs as unused. */
4373 int max_regs
= max_reg_num ();
4374 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< max_regs
; i
++)
4375 if (REG_N_SETS (i
) == 0 && REG_N_REFS (i
) == 0)
4376 regno_reg_rtx
[i
] = const0_rtx
;
4378 /* Determine launch dimensions of the function. If it is not an
4379 offloaded function (i.e. this is a regular compiler), the
4380 function has no neutering. */
4381 tree attr
= oacc_get_fn_attrib (current_function_decl
);
4384 /* If we determined this mask before RTL expansion, we could
4385 elide emission of some levels of forks and joins. */
4387 tree dims
= TREE_VALUE (attr
);
4390 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++, dims
= TREE_CHAIN (dims
))
4392 int size
= TREE_INT_CST_LOW (TREE_VALUE (dims
));
4393 tree allowed
= TREE_PURPOSE (dims
);
4395 if (size
!= 1 && !(allowed
&& integer_zerop (allowed
)))
4396 mask
|= GOMP_DIM_MASK (ix
);
4398 /* If there is worker neutering, there must be vector
4399 neutering. Otherwise the hardware will fail. */
4400 gcc_assert (!(mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
4401 || (mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
4403 /* Discover & process partitioned regions. */
4404 parallel
*pars
= nvptx_discover_pars (&bb_insn_map
);
4405 nvptx_process_pars (pars
);
4406 nvptx_neuter_pars (pars
, mask
, 0);
4410 /* Replace subregs. */
4411 nvptx_reorg_subreg ();
4413 if (TARGET_UNIFORM_SIMT
)
4414 nvptx_reorg_uniform_simt ();
4416 regstat_free_n_sets_and_refs ();
4418 df_finish_pass (true);
4421 /* Handle a "kernel" attribute; arguments as in
4422 struct attribute_spec.handler. */
4425 nvptx_handle_kernel_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
4426 int ARG_UNUSED (flags
), bool *no_add_attrs
)
4430 if (TREE_CODE (decl
) != FUNCTION_DECL
)
4432 error ("%qE attribute only applies to functions", name
);
4433 *no_add_attrs
= true;
4435 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl
))))
4437 error ("%qE attribute requires a void return type", name
);
4438 *no_add_attrs
= true;
4444 /* Handle a "shared" attribute; arguments as in
4445 struct attribute_spec.handler. */
4448 nvptx_handle_shared_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
4449 int ARG_UNUSED (flags
), bool *no_add_attrs
)
4453 if (TREE_CODE (decl
) != VAR_DECL
)
4455 error ("%qE attribute only applies to variables", name
);
4456 *no_add_attrs
= true;
4458 else if (!(TREE_PUBLIC (decl
) || TREE_STATIC (decl
)))
4460 error ("%qE attribute not allowed with auto storage class", name
);
4461 *no_add_attrs
= true;
4467 /* Table of valid machine attributes. */
4468 static const struct attribute_spec nvptx_attribute_table
[] =
4470 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4471 affects_type_identity, handler, exclude } */
4472 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute
,
4474 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute
,
4476 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
4479 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4481 static HOST_WIDE_INT
4482 nvptx_vector_alignment (const_tree type
)
4484 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
4486 return MIN (align
, BIGGEST_ALIGNMENT
);
4489 /* Indicate that INSN cannot be duplicated. */
4492 nvptx_cannot_copy_insn_p (rtx_insn
*insn
)
4494 switch (recog_memoized (insn
))
4496 case CODE_FOR_nvptx_shufflesi
:
4497 case CODE_FOR_nvptx_shufflesf
:
4498 case CODE_FOR_nvptx_barsync
:
4499 case CODE_FOR_nvptx_fork
:
4500 case CODE_FOR_nvptx_forked
:
4501 case CODE_FOR_nvptx_joining
:
4502 case CODE_FOR_nvptx_join
:
4509 /* Section anchors do not work. Initialization for flag_section_anchor
4510 probes the existence of the anchoring target hooks and prevents
4511 anchoring if they don't exist. However, we may be being used with
4512 a host-side compiler that does support anchoring, and hence see
4513 the anchor flag set (as it's not recalculated). So provide an
4514 implementation denying anchoring. */
4517 nvptx_use_anchors_for_symbol_p (const_rtx
ARG_UNUSED (a
))
4522 /* Record a symbol for mkoffload to enter into the mapping table. */
4525 nvptx_record_offload_symbol (tree decl
)
4527 switch (TREE_CODE (decl
))
4530 fprintf (asm_out_file
, "//:VAR_MAP \"%s\"\n",
4531 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
4536 tree attr
= oacc_get_fn_attrib (decl
);
4537 /* OpenMP offloading does not set this attribute. */
4538 tree dims
= attr
? TREE_VALUE (attr
) : NULL_TREE
;
4540 fprintf (asm_out_file
, "//:FUNC_MAP \"%s\"",
4541 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
4543 for (; dims
; dims
= TREE_CHAIN (dims
))
4545 int size
= TREE_INT_CST_LOW (TREE_VALUE (dims
));
4547 gcc_assert (!TREE_PURPOSE (dims
));
4548 fprintf (asm_out_file
, ", %#x", size
);
4551 fprintf (asm_out_file
, "\n");
4560 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4561 at the start of a file. */
4564 nvptx_file_start (void)
4566 fputs ("// BEGIN PREAMBLE\n", asm_out_file
);
4567 fputs ("\t.version\t3.1\n", asm_out_file
);
4568 fputs ("\t.target\tsm_30\n", asm_out_file
);
4569 fprintf (asm_out_file
, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode
));
4570 fputs ("// END PREAMBLE\n", asm_out_file
);
4573 /* Emit a declaration for a worker-level buffer in .shared memory. */
4576 write_worker_buffer (FILE *file
, rtx sym
, unsigned align
, unsigned size
)
4578 const char *name
= XSTR (sym
, 0);
4580 write_var_marker (file
, true, false, name
);
4581 fprintf (file
, ".shared .align %d .u8 %s[%d];\n",
4585 /* Write out the function declarations we've collected and declare storage
4586 for the broadcast buffer. */
4589 nvptx_file_end (void)
4591 hash_table
<tree_hasher
>::iterator iter
;
4593 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab
, decl
, tree
, iter
)
4594 nvptx_record_fndecl (decl
);
4595 fputs (func_decls
.str().c_str(), asm_out_file
);
4597 if (worker_bcast_size
)
4598 write_worker_buffer (asm_out_file
, worker_bcast_sym
,
4599 worker_bcast_align
, worker_bcast_size
);
4601 if (worker_red_size
)
4602 write_worker_buffer (asm_out_file
, worker_red_sym
,
4603 worker_red_align
, worker_red_size
);
4605 if (need_softstack_decl
)
4607 write_var_marker (asm_out_file
, false, true, "__nvptx_stacks");
4608 /* 32 is the maximum number of warps in a block. Even though it's an
4609 external declaration, emit the array size explicitly; otherwise, it
4610 may fail at PTX JIT time if the definition is later in link order. */
4611 fprintf (asm_out_file
, ".extern .shared .u%d __nvptx_stacks[32];\n",
4614 if (need_unisimt_decl
)
4616 write_var_marker (asm_out_file
, false, true, "__nvptx_uni");
4617 fprintf (asm_out_file
, ".extern .shared .u32 __nvptx_uni[32];\n");
4621 /* Expander for the shuffle builtins. */
4624 nvptx_expand_shuffle (tree exp
, rtx target
, machine_mode mode
, int ignore
)
4629 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 0),
4630 NULL_RTX
, mode
, EXPAND_NORMAL
);
4632 src
= copy_to_mode_reg (mode
, src
);
4634 rtx idx
= expand_expr (CALL_EXPR_ARG (exp
, 1),
4635 NULL_RTX
, SImode
, EXPAND_NORMAL
);
4636 rtx op
= expand_expr (CALL_EXPR_ARG (exp
, 2),
4637 NULL_RTX
, SImode
, EXPAND_NORMAL
);
4639 if (!REG_P (idx
) && GET_CODE (idx
) != CONST_INT
)
4640 idx
= copy_to_mode_reg (SImode
, idx
);
4642 rtx pat
= nvptx_gen_shuffle (target
, src
, idx
,
4643 (nvptx_shuffle_kind
) INTVAL (op
));
4650 /* Worker reduction address expander. */
4653 nvptx_expand_worker_addr (tree exp
, rtx target
,
4654 machine_mode
ARG_UNUSED (mode
), int ignore
)
4659 unsigned align
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 2));
4660 if (align
> worker_red_align
)
4661 worker_red_align
= align
;
4663 unsigned offset
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 0));
4664 unsigned size
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 1));
4665 if (size
+ offset
> worker_red_size
)
4666 worker_red_size
= size
+ offset
;
4668 rtx addr
= worker_red_sym
;
4671 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (offset
));
4672 addr
= gen_rtx_CONST (Pmode
, addr
);
4675 emit_move_insn (target
, addr
);
4680 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4681 not require taking the address of any object, other than the memory
4682 cell being operated on. */
4685 nvptx_expand_cmp_swap (tree exp
, rtx target
,
4686 machine_mode
ARG_UNUSED (m
), int ARG_UNUSED (ignore
))
4688 machine_mode mode
= TYPE_MODE (TREE_TYPE (exp
));
4691 target
= gen_reg_rtx (mode
);
4693 rtx mem
= expand_expr (CALL_EXPR_ARG (exp
, 0),
4694 NULL_RTX
, Pmode
, EXPAND_NORMAL
);
4695 rtx cmp
= expand_expr (CALL_EXPR_ARG (exp
, 1),
4696 NULL_RTX
, mode
, EXPAND_NORMAL
);
4697 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 2),
4698 NULL_RTX
, mode
, EXPAND_NORMAL
);
4701 mem
= gen_rtx_MEM (mode
, mem
);
4703 cmp
= copy_to_mode_reg (mode
, cmp
);
4705 src
= copy_to_mode_reg (mode
, src
);
4708 pat
= gen_atomic_compare_and_swapsi_1 (target
, mem
, cmp
, src
, const0_rtx
);
4710 pat
= gen_atomic_compare_and_swapdi_1 (target
, mem
, cmp
, src
, const0_rtx
);
4718 /* Codes for all the NVPTX builtins. */
4721 NVPTX_BUILTIN_SHUFFLE
,
4722 NVPTX_BUILTIN_SHUFFLELL
,
4723 NVPTX_BUILTIN_WORKER_ADDR
,
4724 NVPTX_BUILTIN_CMP_SWAP
,
4725 NVPTX_BUILTIN_CMP_SWAPLL
,
4729 static GTY(()) tree nvptx_builtin_decls
[NVPTX_BUILTIN_MAX
];
4731 /* Return the NVPTX builtin for CODE. */
4734 nvptx_builtin_decl (unsigned code
, bool ARG_UNUSED (initialize_p
))
4736 if (code
>= NVPTX_BUILTIN_MAX
)
4737 return error_mark_node
;
4739 return nvptx_builtin_decls
[code
];
4742 /* Set up all builtin functions for this target. */
4745 nvptx_init_builtins (void)
4747 #define DEF(ID, NAME, T) \
4748 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4749 = add_builtin_function ("__builtin_nvptx_" NAME, \
4750 build_function_type_list T, \
4751 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4753 #define UINT unsigned_type_node
4754 #define LLUINT long_long_unsigned_type_node
4755 #define PTRVOID ptr_type_node
4757 DEF (SHUFFLE
, "shuffle", (UINT
, UINT
, UINT
, UINT
, NULL_TREE
));
4758 DEF (SHUFFLELL
, "shufflell", (LLUINT
, LLUINT
, UINT
, UINT
, NULL_TREE
));
4759 DEF (WORKER_ADDR
, "worker_addr",
4760 (PTRVOID
, ST
, UINT
, UINT
, NULL_TREE
));
4761 DEF (CMP_SWAP
, "cmp_swap", (UINT
, PTRVOID
, UINT
, UINT
, NULL_TREE
));
4762 DEF (CMP_SWAPLL
, "cmp_swapll", (LLUINT
, PTRVOID
, LLUINT
, LLUINT
, NULL_TREE
));
4771 /* Expand an expression EXP that calls a built-in function,
4772 with result going to TARGET if that's convenient
4773 (and in mode MODE if that's convenient).
4774 SUBTARGET may be used as the target for computing one of EXP's operands.
4775 IGNORE is nonzero if the value is to be ignored. */
4778 nvptx_expand_builtin (tree exp
, rtx target
, rtx
ARG_UNUSED (subtarget
),
4779 machine_mode mode
, int ignore
)
4781 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
4782 switch (DECL_FUNCTION_CODE (fndecl
))
4784 case NVPTX_BUILTIN_SHUFFLE
:
4785 case NVPTX_BUILTIN_SHUFFLELL
:
4786 return nvptx_expand_shuffle (exp
, target
, mode
, ignore
);
4788 case NVPTX_BUILTIN_WORKER_ADDR
:
4789 return nvptx_expand_worker_addr (exp
, target
, mode
, ignore
);
4791 case NVPTX_BUILTIN_CMP_SWAP
:
4792 case NVPTX_BUILTIN_CMP_SWAPLL
:
4793 return nvptx_expand_cmp_swap (exp
, target
, mode
, ignore
);
4795 default: gcc_unreachable ();
4799 /* Define dimension sizes for known hardware. */
4800 #define PTX_VECTOR_LENGTH 32
4801 #define PTX_WORKER_LENGTH 32
4802 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
4804 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4809 return PTX_VECTOR_LENGTH
;
4812 /* Validate compute dimensions of an OpenACC offload or routine, fill
4813 in non-unity defaults. FN_LEVEL indicates the level at which a
4814 routine might spawn a loop. It is negative for non-routines. If
4815 DECL is null, we are validating the default dimensions. */
4818 nvptx_goacc_validate_dims (tree decl
, int dims
[], int fn_level
)
4820 bool changed
= false;
4822 /* The vector size must be 32, unless this is a SEQ routine. */
4823 if (fn_level
<= GOMP_DIM_VECTOR
&& fn_level
>= -1
4824 && dims
[GOMP_DIM_VECTOR
] >= 0
4825 && dims
[GOMP_DIM_VECTOR
] != PTX_VECTOR_LENGTH
)
4827 if (fn_level
< 0 && dims
[GOMP_DIM_VECTOR
] >= 0)
4828 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
4829 dims
[GOMP_DIM_VECTOR
]
4830 ? G_("using vector_length (%d), ignoring %d")
4831 : G_("using vector_length (%d), ignoring runtime setting"),
4832 PTX_VECTOR_LENGTH
, dims
[GOMP_DIM_VECTOR
]);
4833 dims
[GOMP_DIM_VECTOR
] = PTX_VECTOR_LENGTH
;
4837 /* Check the num workers is not too large. */
4838 if (dims
[GOMP_DIM_WORKER
] > PTX_WORKER_LENGTH
)
4840 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
4841 "using num_workers (%d), ignoring %d",
4842 PTX_WORKER_LENGTH
, dims
[GOMP_DIM_WORKER
]);
4843 dims
[GOMP_DIM_WORKER
] = PTX_WORKER_LENGTH
;
4849 dims
[GOMP_DIM_VECTOR
] = PTX_VECTOR_LENGTH
;
4850 if (dims
[GOMP_DIM_WORKER
] < 0)
4851 dims
[GOMP_DIM_WORKER
] = PTX_WORKER_LENGTH
;
4852 if (dims
[GOMP_DIM_GANG
] < 0)
4853 dims
[GOMP_DIM_GANG
] = PTX_GANG_DEFAULT
;
4860 /* Return maximum dimension size, or zero for unbounded. */
4863 nvptx_dim_limit (int axis
)
4867 case GOMP_DIM_WORKER
:
4868 return PTX_WORKER_LENGTH
;
4870 case GOMP_DIM_VECTOR
:
4871 return PTX_VECTOR_LENGTH
;
4879 /* Determine whether fork & joins are needed. */
4882 nvptx_goacc_fork_join (gcall
*call
, const int dims
[],
4883 bool ARG_UNUSED (is_fork
))
4885 tree arg
= gimple_call_arg (call
, 2);
4886 unsigned axis
= TREE_INT_CST_LOW (arg
);
4888 /* We only care about worker and vector partitioning. */
4889 if (axis
< GOMP_DIM_WORKER
)
4892 /* If the size is 1, there's no partitioning. */
4893 if (dims
[axis
] == 1)
4899 /* Generate a PTX builtin function call that returns the address in
4900 the worker reduction buffer at OFFSET. TYPE is the type of the
4901 data at that location. */
4904 nvptx_get_worker_red_addr (tree type
, tree offset
)
4906 machine_mode mode
= TYPE_MODE (type
);
4907 tree fndecl
= nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR
, true);
4908 tree size
= build_int_cst (unsigned_type_node
, GET_MODE_SIZE (mode
));
4909 tree align
= build_int_cst (unsigned_type_node
,
4910 GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
);
4911 tree call
= build_call_expr (fndecl
, 3, offset
, size
, align
);
4913 return fold_convert (build_pointer_type (type
), call
);
4916 /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
4917 will cast the variable if necessary. */
4920 nvptx_generate_vector_shuffle (location_t loc
,
4921 tree dest_var
, tree var
, unsigned shift
,
4924 unsigned fn
= NVPTX_BUILTIN_SHUFFLE
;
4925 tree_code code
= NOP_EXPR
;
4926 tree arg_type
= unsigned_type_node
;
4927 tree var_type
= TREE_TYPE (var
);
4928 tree dest_type
= var_type
;
4930 if (TREE_CODE (var_type
) == COMPLEX_TYPE
)
4931 var_type
= TREE_TYPE (var_type
);
4933 if (TREE_CODE (var_type
) == REAL_TYPE
)
4934 code
= VIEW_CONVERT_EXPR
;
4936 if (TYPE_SIZE (var_type
)
4937 == TYPE_SIZE (long_long_unsigned_type_node
))
4939 fn
= NVPTX_BUILTIN_SHUFFLELL
;
4940 arg_type
= long_long_unsigned_type_node
;
4943 tree call
= nvptx_builtin_decl (fn
, true);
4944 tree bits
= build_int_cst (unsigned_type_node
, shift
);
4945 tree kind
= build_int_cst (unsigned_type_node
, SHUFFLE_DOWN
);
4948 if (var_type
!= dest_type
)
4950 /* Do real and imaginary parts separately. */
4951 tree real
= fold_build1 (REALPART_EXPR
, var_type
, var
);
4952 real
= fold_build1 (code
, arg_type
, real
);
4953 real
= build_call_expr_loc (loc
, call
, 3, real
, bits
, kind
);
4954 real
= fold_build1 (code
, var_type
, real
);
4956 tree imag
= fold_build1 (IMAGPART_EXPR
, var_type
, var
);
4957 imag
= fold_build1 (code
, arg_type
, imag
);
4958 imag
= build_call_expr_loc (loc
, call
, 3, imag
, bits
, kind
);
4959 imag
= fold_build1 (code
, var_type
, imag
);
4961 expr
= fold_build2 (COMPLEX_EXPR
, dest_type
, real
, imag
);
4965 expr
= fold_build1 (code
, arg_type
, var
);
4966 expr
= build_call_expr_loc (loc
, call
, 3, expr
, bits
, kind
);
4967 expr
= fold_build1 (code
, dest_type
, expr
);
4970 gimplify_assign (dest_var
, expr
, seq
);
4973 /* Lazily generate the global lock var decl and return its address. */
4976 nvptx_global_lock_addr ()
4978 tree v
= global_lock_var
;
4982 tree name
= get_identifier ("__reduction_lock");
4983 tree type
= build_qualified_type (unsigned_type_node
,
4984 TYPE_QUAL_VOLATILE
);
4985 v
= build_decl (BUILTINS_LOCATION
, VAR_DECL
, name
, type
);
4986 global_lock_var
= v
;
4987 DECL_ARTIFICIAL (v
) = 1;
4988 DECL_EXTERNAL (v
) = 1;
4989 TREE_STATIC (v
) = 1;
4990 TREE_PUBLIC (v
) = 1;
4992 mark_addressable (v
);
4993 mark_decl_referenced (v
);
4996 return build_fold_addr_expr (v
);
4999 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5000 GSI. We use a lockless scheme for nearly all case, which looks
5002 actual = initval(OP);
5005 write = guess OP myval;
5006 actual = cmp&swap (ptr, guess, write)
5007 } while (actual bit-different-to guess);
5010 This relies on a cmp&swap instruction, which is available for 32-
5011 and 64-bit types. Larger types must use a locking scheme. */
5014 nvptx_lockless_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5015 tree ptr
, tree var
, tree_code op
)
5017 unsigned fn
= NVPTX_BUILTIN_CMP_SWAP
;
5018 tree_code code
= NOP_EXPR
;
5019 tree arg_type
= unsigned_type_node
;
5020 tree var_type
= TREE_TYPE (var
);
5022 if (TREE_CODE (var_type
) == COMPLEX_TYPE
5023 || TREE_CODE (var_type
) == REAL_TYPE
)
5024 code
= VIEW_CONVERT_EXPR
;
5026 if (TYPE_SIZE (var_type
) == TYPE_SIZE (long_long_unsigned_type_node
))
5028 arg_type
= long_long_unsigned_type_node
;
5029 fn
= NVPTX_BUILTIN_CMP_SWAPLL
;
5032 tree swap_fn
= nvptx_builtin_decl (fn
, true);
5034 gimple_seq init_seq
= NULL
;
5035 tree init_var
= make_ssa_name (arg_type
);
5036 tree init_expr
= omp_reduction_init_op (loc
, op
, var_type
);
5037 init_expr
= fold_build1 (code
, arg_type
, init_expr
);
5038 gimplify_assign (init_var
, init_expr
, &init_seq
);
5039 gimple
*init_end
= gimple_seq_last (init_seq
);
5041 gsi_insert_seq_before (gsi
, init_seq
, GSI_SAME_STMT
);
5043 /* Split the block just after the init stmts. */
5044 basic_block pre_bb
= gsi_bb (*gsi
);
5045 edge pre_edge
= split_block (pre_bb
, init_end
);
5046 basic_block loop_bb
= pre_edge
->dest
;
5047 pre_bb
= pre_edge
->src
;
5048 /* Reset the iterator. */
5049 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5051 tree expect_var
= make_ssa_name (arg_type
);
5052 tree actual_var
= make_ssa_name (arg_type
);
5053 tree write_var
= make_ssa_name (arg_type
);
5055 /* Build and insert the reduction calculation. */
5056 gimple_seq red_seq
= NULL
;
5057 tree write_expr
= fold_build1 (code
, var_type
, expect_var
);
5058 write_expr
= fold_build2 (op
, var_type
, write_expr
, var
);
5059 write_expr
= fold_build1 (code
, arg_type
, write_expr
);
5060 gimplify_assign (write_var
, write_expr
, &red_seq
);
5062 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
5064 /* Build & insert the cmp&swap sequence. */
5065 gimple_seq latch_seq
= NULL
;
5066 tree swap_expr
= build_call_expr_loc (loc
, swap_fn
, 3,
5067 ptr
, expect_var
, write_var
);
5068 gimplify_assign (actual_var
, swap_expr
, &latch_seq
);
5070 gcond
*cond
= gimple_build_cond (EQ_EXPR
, actual_var
, expect_var
,
5071 NULL_TREE
, NULL_TREE
);
5072 gimple_seq_add_stmt (&latch_seq
, cond
);
5074 gimple
*latch_end
= gimple_seq_last (latch_seq
);
5075 gsi_insert_seq_before (gsi
, latch_seq
, GSI_SAME_STMT
);
5077 /* Split the block just after the latch stmts. */
5078 edge post_edge
= split_block (loop_bb
, latch_end
);
5079 basic_block post_bb
= post_edge
->dest
;
5080 loop_bb
= post_edge
->src
;
5081 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5083 post_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
5084 post_edge
->probability
= profile_probability::even ();
5085 edge loop_edge
= make_edge (loop_bb
, loop_bb
, EDGE_FALSE_VALUE
);
5086 loop_edge
->probability
= profile_probability::even ();
5087 set_immediate_dominator (CDI_DOMINATORS
, loop_bb
, pre_bb
);
5088 set_immediate_dominator (CDI_DOMINATORS
, post_bb
, loop_bb
);
5090 gphi
*phi
= create_phi_node (expect_var
, loop_bb
);
5091 add_phi_arg (phi
, init_var
, pre_edge
, loc
);
5092 add_phi_arg (phi
, actual_var
, loop_edge
, loc
);
5094 loop
*loop
= alloc_loop ();
5095 loop
->header
= loop_bb
;
5096 loop
->latch
= loop_bb
;
5097 add_loop (loop
, loop_bb
->loop_father
);
5099 return fold_build1 (code
, var_type
, write_var
);
5102 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5103 GSI. This is necessary for types larger than 64 bits, where there
5104 is no cmp&swap instruction to implement a lockless scheme. We use
5105 a lock variable in global memory.
5107 while (cmp&swap (&lock_var, 0, 1))
5110 accum = accum OP var;
5112 cmp&swap (&lock_var, 1, 0);
5115 A lock in global memory is necessary to force execution engine
5116 descheduling and avoid resource starvation that can occur if the
5117 lock is in .shared memory. */
5120 nvptx_lockfull_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5121 tree ptr
, tree var
, tree_code op
)
5123 tree var_type
= TREE_TYPE (var
);
5124 tree swap_fn
= nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP
, true);
5125 tree uns_unlocked
= build_int_cst (unsigned_type_node
, 0);
5126 tree uns_locked
= build_int_cst (unsigned_type_node
, 1);
5128 /* Split the block just before the gsi. Insert a gimple nop to make
5130 gimple
*nop
= gimple_build_nop ();
5131 gsi_insert_before (gsi
, nop
, GSI_SAME_STMT
);
5132 basic_block entry_bb
= gsi_bb (*gsi
);
5133 edge entry_edge
= split_block (entry_bb
, nop
);
5134 basic_block lock_bb
= entry_edge
->dest
;
5135 /* Reset the iterator. */
5136 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5138 /* Build and insert the locking sequence. */
5139 gimple_seq lock_seq
= NULL
;
5140 tree lock_var
= make_ssa_name (unsigned_type_node
);
5141 tree lock_expr
= nvptx_global_lock_addr ();
5142 lock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, lock_expr
,
5143 uns_unlocked
, uns_locked
);
5144 gimplify_assign (lock_var
, lock_expr
, &lock_seq
);
5145 gcond
*cond
= gimple_build_cond (EQ_EXPR
, lock_var
, uns_unlocked
,
5146 NULL_TREE
, NULL_TREE
);
5147 gimple_seq_add_stmt (&lock_seq
, cond
);
5148 gimple
*lock_end
= gimple_seq_last (lock_seq
);
5149 gsi_insert_seq_before (gsi
, lock_seq
, GSI_SAME_STMT
);
5151 /* Split the block just after the lock sequence. */
5152 edge locked_edge
= split_block (lock_bb
, lock_end
);
5153 basic_block update_bb
= locked_edge
->dest
;
5154 lock_bb
= locked_edge
->src
;
5155 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5157 /* Create the lock loop ... */
5158 locked_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
5159 locked_edge
->probability
= profile_probability::even ();
5160 edge loop_edge
= make_edge (lock_bb
, lock_bb
, EDGE_FALSE_VALUE
);
5161 loop_edge
->probability
= profile_probability::even ();
5162 set_immediate_dominator (CDI_DOMINATORS
, lock_bb
, entry_bb
);
5163 set_immediate_dominator (CDI_DOMINATORS
, update_bb
, lock_bb
);
5165 /* ... and the loop structure. */
5166 loop
*lock_loop
= alloc_loop ();
5167 lock_loop
->header
= lock_bb
;
5168 lock_loop
->latch
= lock_bb
;
5169 lock_loop
->nb_iterations_estimate
= 1;
5170 lock_loop
->any_estimate
= true;
5171 add_loop (lock_loop
, entry_bb
->loop_father
);
5173 /* Build and insert the reduction calculation. */
5174 gimple_seq red_seq
= NULL
;
5175 tree acc_in
= make_ssa_name (var_type
);
5176 tree ref_in
= build_simple_mem_ref (ptr
);
5177 TREE_THIS_VOLATILE (ref_in
) = 1;
5178 gimplify_assign (acc_in
, ref_in
, &red_seq
);
5180 tree acc_out
= make_ssa_name (var_type
);
5181 tree update_expr
= fold_build2 (op
, var_type
, ref_in
, var
);
5182 gimplify_assign (acc_out
, update_expr
, &red_seq
);
5184 tree ref_out
= build_simple_mem_ref (ptr
);
5185 TREE_THIS_VOLATILE (ref_out
) = 1;
5186 gimplify_assign (ref_out
, acc_out
, &red_seq
);
5188 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
5190 /* Build & insert the unlock sequence. */
5191 gimple_seq unlock_seq
= NULL
;
5192 tree unlock_expr
= nvptx_global_lock_addr ();
5193 unlock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, unlock_expr
,
5194 uns_locked
, uns_unlocked
);
5195 gimplify_and_add (unlock_expr
, &unlock_seq
);
5196 gsi_insert_seq_before (gsi
, unlock_seq
, GSI_SAME_STMT
);
5201 /* Emit a sequence to update a reduction accumlator at *PTR with the
5202 value held in VAR using operator OP. Return the updated value.
5204 TODO: optimize for atomic ops and indepedent complex ops. */
5207 nvptx_reduction_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5208 tree ptr
, tree var
, tree_code op
)
5210 tree type
= TREE_TYPE (var
);
5211 tree size
= TYPE_SIZE (type
);
5213 if (size
== TYPE_SIZE (unsigned_type_node
)
5214 || size
== TYPE_SIZE (long_long_unsigned_type_node
))
5215 return nvptx_lockless_update (loc
, gsi
, ptr
, var
, op
);
5217 return nvptx_lockfull_update (loc
, gsi
, ptr
, var
, op
);
5220 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5223 nvptx_goacc_reduction_setup (gcall
*call
)
5225 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
5226 tree lhs
= gimple_call_lhs (call
);
5227 tree var
= gimple_call_arg (call
, 2);
5228 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
5229 gimple_seq seq
= NULL
;
5231 push_gimplify_context (true);
5233 if (level
!= GOMP_DIM_GANG
)
5235 /* Copy the receiver object. */
5236 tree ref_to_res
= gimple_call_arg (call
, 1);
5238 if (!integer_zerop (ref_to_res
))
5239 var
= build_simple_mem_ref (ref_to_res
);
5242 if (level
== GOMP_DIM_WORKER
)
5244 /* Store incoming value to worker reduction buffer. */
5245 tree offset
= gimple_call_arg (call
, 5);
5246 tree call
= nvptx_get_worker_red_addr (TREE_TYPE (var
), offset
);
5247 tree ptr
= make_ssa_name (TREE_TYPE (call
));
5249 gimplify_assign (ptr
, call
, &seq
);
5250 tree ref
= build_simple_mem_ref (ptr
);
5251 TREE_THIS_VOLATILE (ref
) = 1;
5252 gimplify_assign (ref
, var
, &seq
);
5256 gimplify_assign (lhs
, var
, &seq
);
5258 pop_gimplify_context (NULL
);
5259 gsi_replace_with_seq (&gsi
, seq
, true);
5262 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5265 nvptx_goacc_reduction_init (gcall
*call
)
5267 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
5268 tree lhs
= gimple_call_lhs (call
);
5269 tree var
= gimple_call_arg (call
, 2);
5270 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
5271 enum tree_code rcode
5272 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
5273 tree init
= omp_reduction_init_op (gimple_location (call
), rcode
,
5275 gimple_seq seq
= NULL
;
5277 push_gimplify_context (true);
5279 if (level
== GOMP_DIM_VECTOR
)
5281 /* Initialize vector-non-zeroes to INIT_VAL (OP). */
5282 tree tid
= make_ssa_name (integer_type_node
);
5283 tree dim_vector
= gimple_call_arg (call
, 3);
5284 gimple
*tid_call
= gimple_build_call_internal (IFN_GOACC_DIM_POS
, 1,
5286 gimple
*cond_stmt
= gimple_build_cond (NE_EXPR
, tid
, integer_zero_node
,
5287 NULL_TREE
, NULL_TREE
);
5289 gimple_call_set_lhs (tid_call
, tid
);
5290 gimple_seq_add_stmt (&seq
, tid_call
);
5291 gimple_seq_add_stmt (&seq
, cond_stmt
);
5293 /* Split the block just after the call. */
5294 edge init_edge
= split_block (gsi_bb (gsi
), call
);
5295 basic_block init_bb
= init_edge
->dest
;
5296 basic_block call_bb
= init_edge
->src
;
5298 /* Fixup flags from call_bb to init_bb. */
5299 init_edge
->flags
^= EDGE_FALLTHRU
| EDGE_TRUE_VALUE
;
5300 init_edge
->probability
= profile_probability::even ();
5302 /* Set the initialization stmts. */
5303 gimple_seq init_seq
= NULL
;
5304 tree init_var
= make_ssa_name (TREE_TYPE (var
));
5305 gimplify_assign (init_var
, init
, &init_seq
);
5306 gsi
= gsi_start_bb (init_bb
);
5307 gsi_insert_seq_before (&gsi
, init_seq
, GSI_SAME_STMT
);
5309 /* Split block just after the init stmt. */
5311 edge inited_edge
= split_block (gsi_bb (gsi
), gsi_stmt (gsi
));
5312 basic_block dst_bb
= inited_edge
->dest
;
5314 /* Create false edge from call_bb to dst_bb. */
5315 edge nop_edge
= make_edge (call_bb
, dst_bb
, EDGE_FALSE_VALUE
);
5316 nop_edge
->probability
= profile_probability::even ();
5318 /* Create phi node in dst block. */
5319 gphi
*phi
= create_phi_node (lhs
, dst_bb
);
5320 add_phi_arg (phi
, init_var
, inited_edge
, gimple_location (call
));
5321 add_phi_arg (phi
, var
, nop_edge
, gimple_location (call
));
5323 /* Reset dominator of dst bb. */
5324 set_immediate_dominator (CDI_DOMINATORS
, dst_bb
, call_bb
);
5326 /* Reset the gsi. */
5327 gsi
= gsi_for_stmt (call
);
5331 if (level
== GOMP_DIM_GANG
)
5333 /* If there's no receiver object, propagate the incoming VAR. */
5334 tree ref_to_res
= gimple_call_arg (call
, 1);
5335 if (integer_zerop (ref_to_res
))
5339 gimplify_assign (lhs
, init
, &seq
);
5342 pop_gimplify_context (NULL
);
5343 gsi_replace_with_seq (&gsi
, seq
, true);
5346 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5349 nvptx_goacc_reduction_fini (gcall
*call
)
5351 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
5352 tree lhs
= gimple_call_lhs (call
);
5353 tree ref_to_res
= gimple_call_arg (call
, 1);
5354 tree var
= gimple_call_arg (call
, 2);
5355 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
5357 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
5358 gimple_seq seq
= NULL
;
5359 tree r
= NULL_TREE
;;
5361 push_gimplify_context (true);
5363 if (level
== GOMP_DIM_VECTOR
)
5365 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
5366 but that requires a method of emitting a unified jump at the
5368 for (int shfl
= PTX_VECTOR_LENGTH
/ 2; shfl
> 0; shfl
= shfl
>> 1)
5370 tree other_var
= make_ssa_name (TREE_TYPE (var
));
5371 nvptx_generate_vector_shuffle (gimple_location (call
),
5372 other_var
, var
, shfl
, &seq
);
5374 r
= make_ssa_name (TREE_TYPE (var
));
5375 gimplify_assign (r
, fold_build2 (op
, TREE_TYPE (var
),
5376 var
, other_var
), &seq
);
5382 tree accum
= NULL_TREE
;
5384 if (level
== GOMP_DIM_WORKER
)
5386 /* Get reduction buffer address. */
5387 tree offset
= gimple_call_arg (call
, 5);
5388 tree call
= nvptx_get_worker_red_addr (TREE_TYPE (var
), offset
);
5389 tree ptr
= make_ssa_name (TREE_TYPE (call
));
5391 gimplify_assign (ptr
, call
, &seq
);
5394 else if (integer_zerop (ref_to_res
))
5401 /* UPDATE the accumulator. */
5402 gsi_insert_seq_before (&gsi
, seq
, GSI_SAME_STMT
);
5404 r
= nvptx_reduction_update (gimple_location (call
), &gsi
,
5410 gimplify_assign (lhs
, r
, &seq
);
5411 pop_gimplify_context (NULL
);
5413 gsi_replace_with_seq (&gsi
, seq
, true);
5416 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5419 nvptx_goacc_reduction_teardown (gcall
*call
)
5421 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
5422 tree lhs
= gimple_call_lhs (call
);
5423 tree var
= gimple_call_arg (call
, 2);
5424 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
5425 gimple_seq seq
= NULL
;
5427 push_gimplify_context (true);
5428 if (level
== GOMP_DIM_WORKER
)
5430 /* Read the worker reduction buffer. */
5431 tree offset
= gimple_call_arg (call
, 5);
5432 tree call
= nvptx_get_worker_red_addr(TREE_TYPE (var
), offset
);
5433 tree ptr
= make_ssa_name (TREE_TYPE (call
));
5435 gimplify_assign (ptr
, call
, &seq
);
5436 var
= build_simple_mem_ref (ptr
);
5437 TREE_THIS_VOLATILE (var
) = 1;
5440 if (level
!= GOMP_DIM_GANG
)
5442 /* Write to the receiver object. */
5443 tree ref_to_res
= gimple_call_arg (call
, 1);
5445 if (!integer_zerop (ref_to_res
))
5446 gimplify_assign (build_simple_mem_ref (ref_to_res
), var
, &seq
);
5450 gimplify_assign (lhs
, var
, &seq
);
5452 pop_gimplify_context (NULL
);
5454 gsi_replace_with_seq (&gsi
, seq
, true);
5457 /* NVPTX reduction expander. */
5460 nvptx_goacc_reduction (gcall
*call
)
5462 unsigned code
= (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call
, 0));
5466 case IFN_GOACC_REDUCTION_SETUP
:
5467 nvptx_goacc_reduction_setup (call
);
5470 case IFN_GOACC_REDUCTION_INIT
:
5471 nvptx_goacc_reduction_init (call
);
5474 case IFN_GOACC_REDUCTION_FINI
:
5475 nvptx_goacc_reduction_fini (call
);
5478 case IFN_GOACC_REDUCTION_TEARDOWN
:
5479 nvptx_goacc_reduction_teardown (call
);
5488 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
,
5489 rtx x ATTRIBUTE_UNUSED
)
5495 nvptx_vector_mode_supported (machine_mode mode
)
5497 return (mode
== V2SImode
5498 || mode
== V2DImode
);
5501 /* Return the preferred mode for vectorizing scalar MODE. */
5504 nvptx_preferred_simd_mode (scalar_mode mode
)
5514 return default_preferred_simd_mode (mode
);
5519 nvptx_data_alignment (const_tree type
, unsigned int basic_align
)
5521 if (TREE_CODE (type
) == INTEGER_TYPE
)
5523 unsigned HOST_WIDE_INT size
= tree_to_uhwi (TYPE_SIZE_UNIT (type
));
5524 if (size
== GET_MODE_SIZE (TImode
))
5525 return GET_MODE_BITSIZE (maybe_split_mode (TImode
));
5531 /* Implement TARGET_MODES_TIEABLE_P. */
5534 nvptx_modes_tieable_p (machine_mode
, machine_mode
)
5539 /* Implement TARGET_HARD_REGNO_NREGS. */
5542 nvptx_hard_regno_nregs (unsigned int, machine_mode
)
5547 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
5550 nvptx_can_change_mode_class (machine_mode
, machine_mode
, reg_class_t
)
5555 #undef TARGET_OPTION_OVERRIDE
5556 #define TARGET_OPTION_OVERRIDE nvptx_option_override
5558 #undef TARGET_ATTRIBUTE_TABLE
5559 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5562 #define TARGET_LRA_P hook_bool_void_false
5564 #undef TARGET_LEGITIMATE_ADDRESS_P
5565 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
5567 #undef TARGET_PROMOTE_FUNCTION_MODE
5568 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
5570 #undef TARGET_FUNCTION_ARG
5571 #define TARGET_FUNCTION_ARG nvptx_function_arg
5572 #undef TARGET_FUNCTION_INCOMING_ARG
5573 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
5574 #undef TARGET_FUNCTION_ARG_ADVANCE
5575 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
5576 #undef TARGET_FUNCTION_ARG_BOUNDARY
5577 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
5578 #undef TARGET_PASS_BY_REFERENCE
5579 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
5580 #undef TARGET_FUNCTION_VALUE_REGNO_P
5581 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
5582 #undef TARGET_FUNCTION_VALUE
5583 #define TARGET_FUNCTION_VALUE nvptx_function_value
5584 #undef TARGET_LIBCALL_VALUE
5585 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
5586 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
5587 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
5588 #undef TARGET_GET_DRAP_RTX
5589 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
5590 #undef TARGET_SPLIT_COMPLEX_ARG
5591 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
5592 #undef TARGET_RETURN_IN_MEMORY
5593 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
5594 #undef TARGET_OMIT_STRUCT_RETURN_REG
5595 #define TARGET_OMIT_STRUCT_RETURN_REG true
5596 #undef TARGET_STRICT_ARGUMENT_NAMING
5597 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
5598 #undef TARGET_CALL_ARGS
5599 #define TARGET_CALL_ARGS nvptx_call_args
5600 #undef TARGET_END_CALL_ARGS
5601 #define TARGET_END_CALL_ARGS nvptx_end_call_args
5603 #undef TARGET_ASM_FILE_START
5604 #define TARGET_ASM_FILE_START nvptx_file_start
5605 #undef TARGET_ASM_FILE_END
5606 #define TARGET_ASM_FILE_END nvptx_file_end
5607 #undef TARGET_ASM_GLOBALIZE_LABEL
5608 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
5609 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
5610 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
5611 #undef TARGET_PRINT_OPERAND
5612 #define TARGET_PRINT_OPERAND nvptx_print_operand
5613 #undef TARGET_PRINT_OPERAND_ADDRESS
5614 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
5615 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
5616 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
5617 #undef TARGET_ASM_INTEGER
5618 #define TARGET_ASM_INTEGER nvptx_assemble_integer
5619 #undef TARGET_ASM_DECL_END
5620 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
5621 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
5622 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
5623 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
5624 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
5625 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
5626 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
5628 #undef TARGET_MACHINE_DEPENDENT_REORG
5629 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
5630 #undef TARGET_NO_REGISTER_ALLOCATION
5631 #define TARGET_NO_REGISTER_ALLOCATION true
5633 #undef TARGET_ENCODE_SECTION_INFO
5634 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
5635 #undef TARGET_RECORD_OFFLOAD_SYMBOL
5636 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
5638 #undef TARGET_VECTOR_ALIGNMENT
5639 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
5641 #undef TARGET_CANNOT_COPY_INSN_P
5642 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
5644 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
5645 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
5647 #undef TARGET_INIT_BUILTINS
5648 #define TARGET_INIT_BUILTINS nvptx_init_builtins
5649 #undef TARGET_EXPAND_BUILTIN
5650 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
5651 #undef TARGET_BUILTIN_DECL
5652 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
5654 #undef TARGET_SIMT_VF
5655 #define TARGET_SIMT_VF nvptx_simt_vf
5657 #undef TARGET_GOACC_VALIDATE_DIMS
5658 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
5660 #undef TARGET_GOACC_DIM_LIMIT
5661 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
5663 #undef TARGET_GOACC_FORK_JOIN
5664 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
5666 #undef TARGET_GOACC_REDUCTION
5667 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
5669 #undef TARGET_CANNOT_FORCE_CONST_MEM
5670 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
5672 #undef TARGET_VECTOR_MODE_SUPPORTED_P
5673 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
5675 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
5676 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
5677 nvptx_preferred_simd_mode
5679 #undef TARGET_MODES_TIEABLE_P
5680 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
5682 #undef TARGET_HARD_REGNO_NREGS
5683 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
5685 #undef TARGET_CAN_CHANGE_MODE_CLASS
5686 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
5688 struct gcc_target targetm
= TARGET_INITIALIZER
;
5690 #include "gt-nvptx.h"