[nvptx] Factor out populate_offload_attrs
gcc/config/nvptx/nvptx.c
/* Target code for NVPTX.
   Copyright (C) 2014-2019 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"

/* This file should be included last.  */
#include "target-def.h"
#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

#define PTX_WARP_SIZE 32
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The PTX concept CTA (Concurrent Thread Array) maps onto the CUDA concept
   thread block, which has had a maximum number of threads of 1024 since
   CUDA version 2.x.  */
#define PTX_CTA_SIZE 1024

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
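/* Editor's note (illustrative sketch, not in the original source): the
   round trip through these macros looks like

     rtx sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_checking_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);

   The '& 7' mask assumes the areas fit in three bits, which holds for
   the six areas defined above.  */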
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;
/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with -fopenacc", optname);
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  if (function_entry_patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}
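/* Editor's note (not in the original source): a few sample mappings
   from the cases above -- SImode maps to ".u32", DFmode to ".f64",
   and QImode to ".u8" normally but ".u32" when PROMOTE is set, since
   PTX has no 8-bit arithmetic registers:

     nvptx_ptx_type_from_mode (QImode, false);  // ".u8"
     nvptx_ptx_type_from_mode (QImode, true);   // ".u32"
*/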
/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}
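/* Editor's note (illustrative usage, not in the original source): for
   the broadcast buffer created in nvptx_option_override,

     section_for_sym (oacc_bcast_sym)

   returns ".shared", since that symbol was tagged DATA_AREA_SHARED;
   a DATA_AREA_GENERIC symbol yields the empty string.  */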
/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}
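/* Editor's note (not in the original source): concretely, TImode splits
   into two DImode halves and a complex mode splits into its component
   mode:

     maybe_split_mode (TImode);  // DImode
     maybe_split_mode (DCmode);  // DFmode
     maybe_split_mode (SImode);  // VOIDmode, i.e. no split
*/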
/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}
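/* Editor's note (not in the original source): for a TImode pseudo 35,
   whose halves are DImode, the pair branch above prints
   "{%r35$8,%r35$0}" -- the high half (subreg offset 8) first, then the
   low half.  A plain SImode pseudo just prints "%r35".  */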
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}
/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}
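/* Editor's note (not in the original source): so a struct or vector
   argument always goes to memory, a double stays in a register
   (DFmode is 8 bytes, assuming UNITS_PER_WORD of 8), and a _Complex
   double argument also stays in registers, because each DFmode half
   is considered separately via GET_MODE_INNER.  */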
/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}
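/* Editor's note (not in the original source): the promotions mirror
   C's default argument promotions:

     promote_arg (QImode, true);   // SImode: sub-word ints widen
     promote_arg (SFmode, false);  // DFmode: K&R float promotion
     promote_arg (SFmode, true);   // SFmode: prototyped float stays
*/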
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
		    const_tree, bool named)
{
  if (mode == VOIDmode || !named)
    return NULL_RTX;

  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (mode == VOIDmode || !named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
			    machine_mode ARG_UNUSED (mode),
			    const_tree ARG_UNUSED (type),
			    bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
			 machine_mode mode, const_tree type,
			 bool ARG_UNUSED (named))
{
  return pass_in_memory (mode, type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}
/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}
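/* Editor's note (not in the original source): for an SImode argument 0,
   the prototype branch emits " (.param.u32 %in_ar0", while the prologue
   branch emits

     .reg.u32 %ar0;
     ld.param.u32 %ar0, [%in_ar0];

   copying the incoming .param into a regular register.  */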
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}
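/* Editor's note (not in the original source): assuming the conventional
   nvptx register name "%value" for NVPTX_RETURN_REGNUM, an SImode
   return yields "(.param.u32 %value_out) " in a prototype and
   "\t.reg.u32 %value;\n" in a function prologue.  */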
/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}
/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}
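/* Editor's note (not in the original source): for a public definition
   of "foo" this produces the marker line

     // BEGIN GLOBAL FUNCTION DEF: foo

   which the nvptx linker uses to slice functions out of the PTX
   text.  */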
/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
	name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (type == boolean_type_node);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}
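/* Editor's note (not in the original source): for a public function
   "int foo (int)" the declaration pass would produce roughly

     // BEGIN GLOBAL FUNCTION DECL: foo
     .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);

   and the definition pass then repeats the line without the trailing
   semicolon.  The exact register names are assumptions.  */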
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}
/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  : "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
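/* Editor's note (not in the original source): assuming the conventional
   "%frame" name for FRAME_POINTER_REGNUM and a 64-bit ABI, a 16-byte
   8-aligned frame comes out as

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   i.e. a .local array whose generic address is taken into the frame
   pointer register.  */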
/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
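/* Editor's note (not in the original source): for the "x" (vector)
   axis and predicate register 20 this emits

     {
	     .reg.u32 %x;
	     mov.u32 %x, %tid.x;
	     setp.ne.u32 %r20, %x, 0;
     }

   so %r20 is true exactly for the non-zero lanes.  */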
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}

/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
	<< " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
}
/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}

/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}

/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  */

const char *
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
  return "";
}

/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.  */

const char *
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
  return "";
}

/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}
/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
	     nvptx_ptx_type_from_mode (mode, false),
	     reg_names[NVPTX_RETURN_REGNUM],
	     reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  if (TARGET_SOFT_STACK && stack_realign_drap)
    return arg_pointer_rtx;
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
	{
	  cfun->machine->is_varadic = true;
	  cfun->machine->has_varadic = true;
	  cfun->machine->num_args++;
	}
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}
/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  rtx callee = XEXP (address, 0);
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
	{
	  if (DECL_STATIC_CHAIN (decl))
	    cfun->machine->has_chain = true;

	  tree attr = oacc_get_fn_attrib (decl);
	  if (attr)
	    {
	      tree dims = TREE_VALUE (attr);

	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
		{
		  if (TREE_PURPOSE (dims)
		      && !integer_zerop (TREE_PURPOSE (dims)))
		    break;
		  /* Not on this axis.  */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  /* Verify that the vector was filled completely; the original used an
     assignment inside the assert, which always passed.  */
  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}
/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}
/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case E_DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case E_DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case E_DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}
/* Generate an instruction or sequence to shuffle register SRC into DST
   across the lanes of a single warp, according to lane selector IDX and
   shuffle kind KIND.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case E_SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case E_DImode:
    case E_DFmode:
      {
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_QImode:
    case E_HImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
						    tmp)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_warp_bcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
/* Structure used when generating a worker-level spill or fill.  */

struct broadcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset;  /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };

/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
			broadcast_data_t *data, bool vector)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case E_BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    oacc_bcast_align = MAX (oacc_bcast_align, align);
	    data->offset = ROUND_UP (data->offset, align);
	    addr = data->base;
	    gcc_assert (data->base != NULL);
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
	return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
				       out.  */
  unsigned size; /* Fragment size to accumulate.  */
  unsigned offset; /* Offset within current fragment.  */
  bool started; /* Whether we've output any initializer.  */
} init_frag;

/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  */

static void
output_init_frag (rtx sym)
{
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      bool function = (SYMBOL_REF_DECL (sym)
		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
      if (!function)
	fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      if (!function)
	fprintf (asm_out_file, ")");
      if (val)
	fprintf (asm_out_file, " + ");
    }

  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}

/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  /* Mask VAL down to SIZE bytes; the '2 << (bits - 1)' form avoids an
     undefined shift by the full width of the type.  */
  val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;

  for (unsigned part = 0; size; size -= part)
    {
      val >>= part * BITS_PER_UNIT;
      part = init_frag.size - init_frag.offset;
      part = MIN (part, size);

      unsigned HOST_WIDE_INT partial
	= val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
	output_init_frag (NULL);
    }
}
1953 /* Target hook for assembling integer object X of size SIZE. */
1955 static bool
1956 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1958 HOST_WIDE_INT val = 0;
1960 switch (GET_CODE (x))
1962 default:
1963 /* Let the generic machinery figure it out, usually for a
1964 CONST_WIDE_INT. */
1965 return false;
1967 case CONST_INT:
1968 nvptx_assemble_value (INTVAL (x), size);
1969 break;
1971 case CONST:
1972 x = XEXP (x, 0);
1973 gcc_assert (GET_CODE (x) == PLUS);
1974 val = INTVAL (XEXP (x, 1));
1975 x = XEXP (x, 0);
1976 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1977 /* FALLTHROUGH */
1979 case SYMBOL_REF:
1980 gcc_assert (size == init_frag.size);
1981 if (init_frag.offset)
1982 sorry ("cannot emit unaligned pointers in ptx assembly");
1984 nvptx_maybe_record_fnsym (x);
1985 init_frag.val = val;
1986 output_init_frag (x);
1987 break;
1990 return true;
1993 /* Output SIZE zero bytes. We ignore the FILE argument since the
1994 functions we're calling to perform the output just use
1995 asm_out_file. */
1997 void
1998 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2000 /* Finish the current fragment, if it's started. */
2001 if (init_frag.offset)
2003 unsigned part = init_frag.size - init_frag.offset;
2004 part = MIN (part, (unsigned)size);
2005 size -= part;
2006 nvptx_assemble_value (0, part);
2009 /* If this skip doesn't terminate the initializer, write as many
2010 remaining pieces as possible directly. */
2011 if (size < init_frag.remaining * init_frag.size)
2013 while (size >= init_frag.size)
2015 size -= init_frag.size;
2016 output_init_frag (NULL_RTX);
2018 if (size)
2019 nvptx_assemble_value (0, size);
2023 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2024 ignore the FILE arg. */
2026 void
2027 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2029 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2030 nvptx_assemble_value (str[i], 1);
2033 /* Return true if TYPE is a record type where the last field is an array without
2034    a given dimension. */
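/* For example, "struct s { int len; char data[]; }" qualifies: the
   last field's ARRAY_TYPE domain has no TYPE_MAX_VALUE.  */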
2036 static bool
2037 flexible_array_member_type_p (const_tree type)
2039 if (TREE_CODE (type) != RECORD_TYPE)
2040 return false;
2042 const_tree last_field = NULL_TREE;
2043 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2044 last_field = f;
2046 if (!last_field)
2047 return false;
2049 const_tree last_field_type = TREE_TYPE (last_field);
2050 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2051 return false;
2053 return (! TYPE_DOMAIN (last_field_type)
2054 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2057 /* Emit a PTX variable decl and prepare for emission of its
2058    initializer.  NAME is the symbol name and SECTION the PTX data
2059 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2060 The caller has already emitted any indentation and linkage
2061 specifier. It is responsible for any initializer, terminating ;
2062 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2063    this is the opposite way round to what PTX wants them! */
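/* As an illustration (assumed sizes): for a 12-byte initialized int
   array in the global data area this emits something like
   ".global .align 4 .u32 name[3]"; the caller supplies any ".visible"
   prefix beforehand and the "= { ... };" initializer afterwards.  */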
2065 static void
2066 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2067 const_tree type, HOST_WIDE_INT size, unsigned align,
2068 bool undefined = false)
2070 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2071 && (TYPE_DOMAIN (type) == NULL_TREE);
2073 if (undefined && flexible_array_member_type_p (type))
2075 size = 0;
2076 atype = true;
2079 while (TREE_CODE (type) == ARRAY_TYPE)
2080 type = TREE_TYPE (type);
2082 if (TREE_CODE (type) == VECTOR_TYPE
2083 || TREE_CODE (type) == COMPLEX_TYPE)
2084 /* Neither vector nor complex types can contain the other. */
2085 type = TREE_TYPE (type);
2087 unsigned elt_size = int_size_in_bytes (type);
2089 /* Largest mode we're prepared to accept. For BLKmode types we
2090 don't know if it'll contain pointer constants, so have to choose
2091 pointer size, otherwise we can choose DImode. */
2092 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2094 elt_size |= GET_MODE_SIZE (elt_mode);
2095 elt_size &= -elt_size; /* Extract LSB set. */
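  /* Worked example (assuming 64-bit Pmode): a 12-byte BLKmode struct
     gives 12 | 8 == 12, and 12 & -12 == 4, so 4-byte (.u32) fragments
     are used.  */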
2097 init_frag.size = elt_size;
2098 /* Avoid undefined shift behavior by using '2'. */
2099 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2100 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2101 init_frag.val = 0;
2102 init_frag.offset = 0;
2103 init_frag.started = false;
2104 /* Size might not be a multiple of elt size, if there's an
2105     initialized trailing struct array with a type smaller than
2106 elt_size. */
2107 init_frag.remaining = (size + elt_size - 1) / elt_size;
2109 fprintf (file, "%s .align %d .u%d ",
2110 section, align / BITS_PER_UNIT,
2111 elt_size * BITS_PER_UNIT);
2112 assemble_name (file, name);
2114 if (size)
2115 /* We make everything an array, to simplify any initialization
2116 emission. */
2117 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2118 else if (atype)
2119 fprintf (file, "[]");
2122 /* Called when the initializer for a decl has been completely output through
2123 combinations of the three functions above. */
2125 static void
2126 nvptx_assemble_decl_end (void)
2128 if (init_frag.offset)
2129 /* This can happen with a packed struct with trailing array member. */
2130 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2131 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2134 /* Output an uninitialized common or file-scope variable. */
2136 void
2137 nvptx_output_aligned_decl (FILE *file, const char *name,
2138 const_tree decl, HOST_WIDE_INT size, unsigned align)
2140 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2142 /* If this is public, it is common. The nearest thing we have to
2143 common is weak. */
2144 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2146 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2147 TREE_TYPE (decl), size, align);
2148 nvptx_assemble_decl_end ();
2151 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2152 writing a constant variable EXP with NAME and SIZE and its
2153 initializer to FILE. */
2155 static void
2156 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2157 const_tree exp, HOST_WIDE_INT obj_size)
2159 write_var_marker (file, true, false, name);
2161 fprintf (file, "\t");
2163 tree type = TREE_TYPE (exp);
2164 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2165 TYPE_ALIGN (type));
2168 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2169 a variable DECL with NAME to FILE. */
2171 void
2172 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2174 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2176 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2177 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2179 tree type = TREE_TYPE (decl);
2180 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2181 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2182 type, obj_size, DECL_ALIGN (decl));
2185 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2187 static void
2188 nvptx_globalize_label (FILE *, const char *)
2192 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2193 declaration only for variable DECL with NAME to FILE. */
2195 static void
2196 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2198 /* The middle end can place constant pool decls into the varpool as
2199 undefined. Until that is fixed, catch the problem here. */
2200 if (DECL_IN_CONSTANT_POOL (decl))
2201 return;
2203   /* We support weak definitions, and hence have the right
2204 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2205 if (DECL_WEAK (decl))
2206 error_at (DECL_SOURCE_LOCATION (decl),
2207 "PTX does not support weak declarations"
2208 " (only weak definitions)");
2209 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2211 fprintf (file, "\t.extern ");
2212 tree size = DECL_SIZE_UNIT (decl);
2213 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2214 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2215 DECL_ALIGN (decl), true);
2216 nvptx_assemble_decl_end ();
2219 /* Output a pattern for a move instruction. */
2221 const char *
2222 nvptx_output_mov_insn (rtx dst, rtx src)
2224 machine_mode dst_mode = GET_MODE (dst);
2225 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2226 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2227 machine_mode src_inner = (GET_CODE (src) == SUBREG
2228 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2230 rtx sym = src;
2231 if (GET_CODE (sym) == CONST)
2232 sym = XEXP (XEXP (sym, 0), 0);
2233 if (SYMBOL_REF_P (sym))
2235 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2236 return "%.\tcvta%D1%t0\t%0, %1;";
2237 nvptx_maybe_record_fnsym (sym);
2240 if (src_inner == dst_inner)
2241 return "%.\tmov%t0\t%0, %1;";
2243 if (CONSTANT_P (src))
2244 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2245 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2246 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2248 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2250 if (GET_MODE_BITSIZE (dst_mode) == 128
2251 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2253 /* mov.b128 is not supported. */
2254 if (dst_inner == V2DImode && src_inner == TImode)
2255 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2256 else if (dst_inner == TImode && src_inner == V2DImode)
2257 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2259 gcc_unreachable ();
2261 return "%.\tmov.b%T0\t%0, %1;";
2264 return "%.\tcvt%t0%t1\t%0, %1;";
2267 static void nvptx_print_operand (FILE *, rtx, int);
2269 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2270 involves writing .param declarations and in/out copies into them. For
2271 indirect calls, also write the .callprototype. */
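/* Sketch of the emitted sequence for a direct call to "int f (int)"
   (register names illustrative, not taken from real output):
   {
     .param.u32 %value_in;
     .param.u32 %out_arg1;
     st.param.u32 [%out_arg1], %r23;
     call (%value_in), f, (%out_arg1);
     ld.param.u32 %r24, [%value_in];
   }  */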
2273 const char *
2274 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2276 char buf[16];
2277 static int labelno;
2278 bool needs_tgt = register_operand (callee, Pmode);
2279 rtx pat = PATTERN (insn);
2280 if (GET_CODE (pat) == COND_EXEC)
2281 pat = COND_EXEC_CODE (pat);
2282 int arg_end = XVECLEN (pat, 0);
2283 tree decl = NULL_TREE;
2285 fprintf (asm_out_file, "\t{\n");
2286 if (result != NULL)
2287 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2288 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2289 reg_names[NVPTX_RETURN_REGNUM]);
2291 /* Ensure we have a ptx declaration in the output if necessary. */
2292 if (GET_CODE (callee) == SYMBOL_REF)
2294 decl = SYMBOL_REF_DECL (callee);
2295 if (!decl
2296 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2297 nvptx_record_libfunc (callee, result, pat);
2298 else if (DECL_EXTERNAL (decl))
2299 nvptx_record_fndecl (decl);
2302 if (needs_tgt)
2304 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2305 labelno++;
2306 ASM_OUTPUT_LABEL (asm_out_file, buf);
2307 std::stringstream s;
2308 write_fn_proto_from_insn (s, NULL, result, pat);
2309 fputs (s.str().c_str(), asm_out_file);
2312 for (int argno = 1; argno < arg_end; argno++)
2314 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2315 machine_mode mode = GET_MODE (t);
2316 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2318 /* Mode splitting has already been done. */
2319 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2320 "\t\tst.param%s [%%out_arg%d], ",
2321 ptx_type, argno, ptx_type, argno);
2322 output_reg (asm_out_file, REGNO (t), VOIDmode);
2323 fprintf (asm_out_file, ";\n");
2326 /* The '.' stands for the call's predicate, if any. */
2327 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2328 fprintf (asm_out_file, "\t\tcall ");
2329 if (result != NULL_RTX)
2330 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2332 if (decl)
2334 const char *name = get_fnname_from_decl (decl);
2335 name = nvptx_name_replacement (name);
2336 assemble_name (asm_out_file, name);
2338 else
2339 output_address (VOIDmode, callee);
2341 const char *open = "(";
2342 for (int argno = 1; argno < arg_end; argno++)
2344 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2345 open = "";
2347 if (decl && DECL_STATIC_CHAIN (decl))
2349 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2350 open = "";
2352 if (!open[0])
2353 fprintf (asm_out_file, ")");
2355 if (needs_tgt)
2357 fprintf (asm_out_file, ", ");
2358 assemble_name (asm_out_file, buf);
2360 fprintf (asm_out_file, ";\n");
2362 if (find_reg_note (insn, REG_NORETURN, NULL))
2364       /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2365 the flow control barrier they imply. It can seg fault if it
2366 encounters what looks like an unexitable loop. Emit a trailing
2367 trap and exit, which it does grok. */
2368 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2369 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2372 if (result)
2374 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2376 if (!rval[0])
2377 /* We must escape the '%' that starts RETURN_REGNUM. */
2378 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2379 reg_names[NVPTX_RETURN_REGNUM]);
2380 return rval;
2383 return "}";
2386 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2388 static bool
2389 nvptx_print_operand_punct_valid_p (unsigned char c)
2391   return c == '.' || c == '#';
2394 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2396 static void
2397 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2399 rtx off;
2400 if (GET_CODE (x) == CONST)
2401 x = XEXP (x, 0);
2402 switch (GET_CODE (x))
2404 case PLUS:
2405 off = XEXP (x, 1);
2406 output_address (VOIDmode, XEXP (x, 0));
2407 fprintf (file, "+");
2408 output_address (VOIDmode, off);
2409 break;
2411 case SYMBOL_REF:
2412 case LABEL_REF:
2413 output_addr_const (file, x);
2414 break;
2416 default:
2417 gcc_assert (GET_CODE (x) != MEM);
2418 nvptx_print_operand (file, x, 0);
2419 break;
2423 /* Write assembly language output for the address ADDR to FILE. */
2425 static void
2426 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2428 nvptx_print_address_operand (file, addr, mode);
2431 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2433 Meaning of CODE:
2434    . -- print the predicate for the instruction or an empty string for an
2435 unconditional one.
2436 # -- print a rounding mode for the instruction
2438 A -- print a data area for a MEM
2439 c -- print an opcode suffix for a comparison operator, including a type code
2440 D -- print a data area for a MEM operand
2441 S -- print a shuffle kind specified by CONST_INT
2442 t -- print a type opcode suffix, promoting QImode to 32 bits
2443 T -- print a type size in bits
2444 u -- print a type opcode suffix without promotions. */
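/* For example (illustrative), the template "%.\tmov%t0\t%0, %1;" with
   SImode operands expands to "mov.u32 %r1, %r2;" when unpredicated, or
   to "@%r99 mov.u32 %r1, %r2;" under a predicate register.  */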
2446 static void
2447 nvptx_print_operand (FILE *file, rtx x, int code)
2449 if (code == '.')
2451 x = current_insn_predicate;
2452 if (x)
2454 fputs ("@", file);
2455 if (GET_CODE (x) == EQ)
2456 fputs ("!", file);
2457 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2459 return;
2461 else if (code == '#')
2463 fputs (".rn", file);
2464 return;
2467 enum rtx_code x_code = GET_CODE (x);
2468 machine_mode mode = GET_MODE (x);
2470 switch (code)
2472 case 'A':
2473 x = XEXP (x, 0);
2474 /* FALLTHROUGH. */
2476 case 'D':
2477 if (GET_CODE (x) == CONST)
2478 x = XEXP (x, 0);
2479 if (GET_CODE (x) == PLUS)
2480 x = XEXP (x, 0);
2482 if (GET_CODE (x) == SYMBOL_REF)
2483 fputs (section_for_sym (x), file);
2484 break;
2486 case 't':
2487 case 'u':
2488 if (x_code == SUBREG)
2490 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2491 if (VECTOR_MODE_P (inner_mode)
2492 && (GET_MODE_SIZE (mode)
2493 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2494 mode = GET_MODE_INNER (inner_mode);
2495 else if (split_mode_p (inner_mode))
2496 mode = maybe_split_mode (inner_mode);
2497 else
2498 mode = inner_mode;
2500 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2501 break;
2503 case 'H':
2504 case 'L':
2506 rtx inner_x = SUBREG_REG (x);
2507 machine_mode inner_mode = GET_MODE (inner_x);
2508 machine_mode split = maybe_split_mode (inner_mode);
2510 output_reg (file, REGNO (inner_x), split,
2511 (code == 'H'
2512 ? GET_MODE_SIZE (inner_mode) / 2
2513 : 0));
2515 break;
2517 case 'S':
2519 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2520 /* Same order as nvptx_shuffle_kind. */
2521 static const char *const kinds[] =
2522 {".up", ".down", ".bfly", ".idx"};
2523 fputs (kinds[kind], file);
2525 break;
2527 case 'T':
2528 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2529 break;
2531 case 'j':
2532 fprintf (file, "@");
2533 goto common;
2535 case 'J':
2536 fprintf (file, "@!");
2537 goto common;
2539 case 'c':
2540 mode = GET_MODE (XEXP (x, 0));
2541 switch (x_code)
2543 case EQ:
2544 fputs (".eq", file);
2545 break;
2546 case NE:
2547 if (FLOAT_MODE_P (mode))
2548 fputs (".neu", file);
2549 else
2550 fputs (".ne", file);
2551 break;
2552 case LE:
2553 case LEU:
2554 fputs (".le", file);
2555 break;
2556 case GE:
2557 case GEU:
2558 fputs (".ge", file);
2559 break;
2560 case LT:
2561 case LTU:
2562 fputs (".lt", file);
2563 break;
2564 case GT:
2565 case GTU:
2566 fputs (".gt", file);
2567 break;
2568 case LTGT:
2569 fputs (".ne", file);
2570 break;
2571 case UNEQ:
2572 fputs (".equ", file);
2573 break;
2574 case UNLE:
2575 fputs (".leu", file);
2576 break;
2577 case UNGE:
2578 fputs (".geu", file);
2579 break;
2580 case UNLT:
2581 fputs (".ltu", file);
2582 break;
2583 case UNGT:
2584 fputs (".gtu", file);
2585 break;
2586 case UNORDERED:
2587 fputs (".nan", file);
2588 break;
2589 case ORDERED:
2590 fputs (".num", file);
2591 break;
2592 default:
2593 gcc_unreachable ();
2595 if (FLOAT_MODE_P (mode)
2596 || x_code == EQ || x_code == NE
2597 || x_code == GEU || x_code == GTU
2598 || x_code == LEU || x_code == LTU)
2599 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2600 else
2601 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2602 break;
2603 default:
2604 common:
2605 switch (x_code)
2607 case SUBREG:
2609 rtx inner_x = SUBREG_REG (x);
2610 machine_mode inner_mode = GET_MODE (inner_x);
2611 machine_mode split = maybe_split_mode (inner_mode);
2613 if (VECTOR_MODE_P (inner_mode)
2614 && (GET_MODE_SIZE (mode)
2615 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2617 output_reg (file, REGNO (inner_x), VOIDmode);
2618 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2620 else if (split_mode_p (inner_mode)
2621 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2622 output_reg (file, REGNO (inner_x), split);
2623 else
2624 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2626 break;
2628 case REG:
2629 output_reg (file, REGNO (x), maybe_split_mode (mode));
2630 break;
2632 case MEM:
2633 fputc ('[', file);
2634 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2635 fputc (']', file);
2636 break;
2638 case CONST_INT:
2639 output_addr_const (file, x);
2640 break;
2642 case CONST:
2643 case SYMBOL_REF:
2644 case LABEL_REF:
2645 /* We could use output_addr_const, but that can print things like
2646 "x-8", which breaks ptxas. Need to ensure it is output as
2647 "x+-8". */
2648 nvptx_print_address_operand (file, x, VOIDmode);
2649 break;
2651 case CONST_DOUBLE:
2652 long vals[2];
2653 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2654 vals[0] &= 0xffffffff;
2655 vals[1] &= 0xffffffff;
2656 if (mode == SFmode)
2657 fprintf (file, "0f%08lx", vals[0]);
2658 else
2659 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2660 break;
2662 case CONST_VECTOR:
2664 unsigned n = CONST_VECTOR_NUNITS (x);
2665 fprintf (file, "{ ");
2666 for (unsigned i = 0; i < n; ++i)
2668 if (i != 0)
2669 fprintf (file, ", ");
2671 rtx elem = CONST_VECTOR_ELT (x, i);
2672 output_addr_const (file, elem);
2674 fprintf (file, " }");
2676 break;
2678 default:
2679 output_addr_const (file, x);
2684 /* Record replacement regs used to deal with subreg operands. */
2685 struct reg_replace
2687 rtx replacement[MAX_RECOG_OPERANDS];
2688 machine_mode mode;
2689 int n_allocated;
2690 int n_in_use;
2693 /* Allocate or reuse a replacement in R and return the rtx. */
2695 static rtx
2696 get_replacement (struct reg_replace *r)
2698 if (r->n_allocated == r->n_in_use)
2699 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2700 return r->replacement[r->n_in_use++];
2703 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2704 the presence of subregs would break the rules for most instructions.
2705 Replace them with a suitable new register of the right size, plus
2706 conversion copyin/copyout instructions. */
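/* E.g. (illustrative) an input operand (subreg:DI (reg:SI 40) 0) is
   replaced by a fresh DImode register loaded with a zero_extend
   copy-in just before the insn; for an output operand the value is
   copied back into the inner register with a truncate afterwards.  */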
2708 static void
2709 nvptx_reorg_subreg (void)
2711 struct reg_replace qiregs, hiregs, siregs, diregs;
2712 rtx_insn *insn, *next;
2714 qiregs.n_allocated = 0;
2715 hiregs.n_allocated = 0;
2716 siregs.n_allocated = 0;
2717 diregs.n_allocated = 0;
2718 qiregs.mode = QImode;
2719 hiregs.mode = HImode;
2720 siregs.mode = SImode;
2721 diregs.mode = DImode;
2723 for (insn = get_insns (); insn; insn = next)
2725 next = NEXT_INSN (insn);
2726 if (!NONDEBUG_INSN_P (insn)
2727 || asm_noperands (PATTERN (insn)) >= 0
2728 || GET_CODE (PATTERN (insn)) == USE
2729 || GET_CODE (PATTERN (insn)) == CLOBBER)
2730 continue;
2732 qiregs.n_in_use = 0;
2733 hiregs.n_in_use = 0;
2734 siregs.n_in_use = 0;
2735 diregs.n_in_use = 0;
2736 extract_insn (insn);
2737 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2739 for (int i = 0; i < recog_data.n_operands; i++)
2741 rtx op = recog_data.operand[i];
2742 if (GET_CODE (op) != SUBREG)
2743 continue;
2745 rtx inner = SUBREG_REG (op);
2747 machine_mode outer_mode = GET_MODE (op);
2748 machine_mode inner_mode = GET_MODE (inner);
2749 gcc_assert (s_ok);
2750 if (s_ok
2751 && (GET_MODE_PRECISION (inner_mode)
2752 >= GET_MODE_PRECISION (outer_mode)))
2753 continue;
2754 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2755 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2756 : outer_mode == HImode ? &hiregs
2757 : outer_mode == SImode ? &siregs
2758 : &diregs);
2759 rtx new_reg = get_replacement (r);
2761 if (recog_data.operand_type[i] != OP_OUT)
2763 enum rtx_code code;
2764 if (GET_MODE_PRECISION (inner_mode)
2765 < GET_MODE_PRECISION (outer_mode))
2766 code = ZERO_EXTEND;
2767 else
2768 code = TRUNCATE;
2770 rtx pat = gen_rtx_SET (new_reg,
2771 gen_rtx_fmt_e (code, outer_mode, inner));
2772 emit_insn_before (pat, insn);
2775 if (recog_data.operand_type[i] != OP_IN)
2777 enum rtx_code code;
2778 if (GET_MODE_PRECISION (inner_mode)
2779 < GET_MODE_PRECISION (outer_mode))
2780 code = TRUNCATE;
2781 else
2782 code = ZERO_EXTEND;
2784 rtx pat = gen_rtx_SET (inner,
2785 gen_rtx_fmt_e (code, inner_mode, new_reg));
2786 emit_insn_after (pat, insn);
2788 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2793 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2794 first use. */
2796 static rtx
2797 nvptx_get_unisimt_master ()
2799 rtx &master = cfun->machine->unisimt_master;
2800 return master ? master : master = gen_reg_rtx (SImode);
2803 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2805 static rtx
2806 nvptx_get_unisimt_predicate ()
2808 rtx &pred = cfun->machine->unisimt_predicate;
2809 return pred ? pred : pred = gen_reg_rtx (BImode);
2812 /* Return true if given call insn references one of the functions provided by
2813 the CUDA runtime: malloc, free, vprintf. */
2815 static bool
2816 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2818 rtx pat = PATTERN (insn);
2819 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2820 pat = XVECEXP (pat, 0, 0);
2821 if (GET_CODE (pat) == SET)
2822 pat = SET_SRC (pat);
2823 gcc_checking_assert (GET_CODE (pat) == CALL
2824 && GET_CODE (XEXP (pat, 0)) == MEM);
2825 rtx addr = XEXP (XEXP (pat, 0), 0);
2826 if (GET_CODE (addr) != SYMBOL_REF)
2827 return false;
2828 const char *name = XSTR (addr, 0);
2829   /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2830 references with forced assembler name refer to PTX syscalls. For vprintf,
2831 accept both normal and forced-assembler-name references. */
2832 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2833 || !strcmp (name, "*malloc")
2834 || !strcmp (name, "*free"));
2837 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2838 propagate its value from lane MASTER to current lane. */
2840 static void
2841 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2843 rtx reg;
2844 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2845 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2848 /* Adjust code for uniform-simt code generation variant by making atomics and
2849 "syscalls" conditionally executed, and inserting shuffle-based propagation
2850 for registers being set. */
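/* Illustratively (pseudo code, register names assumed), a "syscall"
   such as

     %r_ret = call malloc, (%r_arg);

   becomes

     @%r_pred %r_ret = call malloc, (%r_arg);
     %r_ret = shuffle.idx %r_ret, %r_master;

   so that only one lane makes the call, and its result is then
   broadcast to the whole warp.  */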
2852 static void
2853 nvptx_reorg_uniform_simt ()
2855 rtx_insn *insn, *next;
2857 for (insn = get_insns (); insn; insn = next)
2859 next = NEXT_INSN (insn);
2860 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2861 && !(NONJUMP_INSN_P (insn)
2862 && GET_CODE (PATTERN (insn)) == PARALLEL
2863 && get_attr_atomic (insn)))
2864 continue;
2865 rtx pat = PATTERN (insn);
2866 rtx master = nvptx_get_unisimt_master ();
2867 for (int i = 0; i < XVECLEN (pat, 0); i++)
2868 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2869 rtx pred = nvptx_get_unisimt_predicate ();
2870 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2871 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2872 validate_change (insn, &PATTERN (insn), pat, false);
2876 /* Offloading function attributes. */
2878 struct offload_attrs
2880 unsigned mask;
2881 int num_gangs;
2882 int num_workers;
2883 int vector_length;
2886 /* Loop structure of the function. The entire function is described as
2887 a NULL loop. */
2889 struct parallel
2891 /* Parent parallel. */
2892 parallel *parent;
2894 /* Next sibling parallel. */
2895 parallel *next;
2897 /* First child parallel. */
2898 parallel *inner;
2900 /* Partitioning mask of the parallel. */
2901 unsigned mask;
2903 /* Partitioning used within inner parallels. */
2904 unsigned inner_mask;
2906 /* Location of parallel forked and join. The forked is the first
2907      block in the parallel and the join is the first block after
2908 the partition. */
2909 basic_block forked_block;
2910 basic_block join_block;
2912 rtx_insn *forked_insn;
2913 rtx_insn *join_insn;
2915 rtx_insn *fork_insn;
2916 rtx_insn *joining_insn;
2918 /* Basic blocks in this parallel, but not in child parallels. The
2919 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2920 blocks are not. */
2921 auto_vec<basic_block> blocks;
2923 public:
2924 parallel (parallel *parent, unsigned mode);
2925 ~parallel ();
2928 /* Constructor links the new parallel into its parent's chain of
2929 children. */
2931 parallel::parallel (parallel *parent_, unsigned mask_)
2932 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2934 forked_block = join_block = 0;
2935 forked_insn = join_insn = 0;
2936 fork_insn = joining_insn = 0;
2938 if (parent)
2940 next = parent->inner;
2941 parent->inner = this;
2945 parallel::~parallel ()
2947 delete inner;
2948 delete next;
2951 /* Map of basic blocks to insns.  */
2952 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2954 /* A tuple of an insn of interest and the BB in which it resides. */
2955 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2956 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2958 /* Split basic blocks such that the forked and join unspecs are at
2959 the start of their basic blocks. Thus afterwards each block will
2960 have a single partitioning mode. We also do the same for return
2961 insns, as they are executed by every thread. Return the
2962 partitioning mode of the function as a whole. Populate MAP with
2963 head and tail blocks. We also clear the BB visited flag, which is
2964 used when finding partitions. */
2966 static void
2967 nvptx_split_blocks (bb_insn_map_t *map)
2969 insn_bb_vec_t worklist;
2970 basic_block block;
2971 rtx_insn *insn;
2973 /* Locate all the reorg instructions of interest. */
2974 FOR_ALL_BB_FN (block, cfun)
2976 bool seen_insn = false;
2978       /* Clear visited flag, for use by parallel locator.  */
2979 block->flags &= ~BB_VISITED;
2981 FOR_BB_INSNS (block, insn)
2983 if (!INSN_P (insn))
2984 continue;
2985 switch (recog_memoized (insn))
2987 default:
2988 seen_insn = true;
2989 continue;
2990 case CODE_FOR_nvptx_forked:
2991 case CODE_FOR_nvptx_join:
2992 break;
2994 case CODE_FOR_return:
2995 /* We also need to split just before return insns, as
2996 that insn needs executing by all threads, but the
2997 block it is in probably does not. */
2998 break;
3001 if (seen_insn)
3002 /* We've found an instruction that must be at the start of
3003 a block, but isn't. Add it to the worklist. */
3004 worklist.safe_push (insn_bb_t (insn, block));
3005 else
3006 /* It was already the first instruction. Just add it to
3007 the map. */
3008 map->get_or_insert (block) = insn;
3009 seen_insn = true;
3013 /* Split blocks on the worklist. */
3014 unsigned ix;
3015 insn_bb_t *elt;
3016 basic_block remap = 0;
3017 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3019 if (remap != elt->second)
3021 block = elt->second;
3022 remap = block;
3025       /* Split block before insn.  The insn is in the new block.  */
3026 edge e = split_block (block, PREV_INSN (elt->first));
3028 block = e->dest;
3029 map->get_or_insert (block) = elt->first;
3033 /* BLOCK is a basic block containing a head or tail instruction.
3034 Locate the associated prehead or pretail instruction, which must be
3035 in the single predecessor block. */
3037 static rtx_insn *
3038 nvptx_discover_pre (basic_block block, int expected)
3040 gcc_assert (block->preds->length () == 1);
3041 basic_block pre_block = (*block->preds)[0]->src;
3042 rtx_insn *pre_insn;
3044 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3045 pre_insn = PREV_INSN (pre_insn))
3046 gcc_assert (pre_insn != BB_HEAD (pre_block));
3048 gcc_assert (recog_memoized (pre_insn) == expected);
3049 return pre_insn;
3052 /* Dump this parallel and all its inner parallels. */
3054 static void
3055 nvptx_dump_pars (parallel *par, unsigned depth)
3057 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3058 depth, par->mask,
3059 par->forked_block ? par->forked_block->index : -1,
3060 par->join_block ? par->join_block->index : -1);
3062 fprintf (dump_file, " blocks:");
3064 basic_block block;
3065 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3066 fprintf (dump_file, " %d", block->index);
3067 fprintf (dump_file, "\n");
3068 if (par->inner)
3069 nvptx_dump_pars (par->inner, depth + 1);
3071 if (par->next)
3072 nvptx_dump_pars (par->next, depth);
3075 /* If BLOCK contains a fork/join marker, process it to create or
3076 terminate a loop structure. Add this block to the current loop,
3077 and then walk successor blocks. */
3079 static parallel *
3080 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3082 if (block->flags & BB_VISITED)
3083 return par;
3084 block->flags |= BB_VISITED;
3086 if (rtx_insn **endp = map->get (block))
3088 rtx_insn *end = *endp;
3090 /* This is a block head or tail, or return instruction. */
3091 switch (recog_memoized (end))
3093 case CODE_FOR_return:
3094 /* Return instructions are in their own block, and we
3095 don't need to do anything more. */
3096 return par;
3098 case CODE_FOR_nvptx_forked:
3099 /* Loop head, create a new inner loop and add it into
3100 our parent's child list. */
3102 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3104 gcc_assert (mask);
3105 par = new parallel (par, mask);
3106 par->forked_block = block;
3107 par->forked_insn = end;
3108 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3109 par->fork_insn
3110 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3112 break;
3114 case CODE_FOR_nvptx_join:
3115 /* A loop tail. Finish the current loop and return to
3116 parent. */
3118 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3120 gcc_assert (par->mask == mask);
3121 par->join_block = block;
3122 par->join_insn = end;
3123 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3124 par->joining_insn
3125 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3126 par = par->parent;
3128 break;
3130 default:
3131 gcc_unreachable ();
3135 if (par)
3136 /* Add this block onto the current loop's list of blocks. */
3137 par->blocks.safe_push (block);
3138 else
3139 /* This must be the entry block. Create a NULL parallel. */
3140 par = new parallel (0, 0);
3142 /* Walk successor blocks. */
3143 edge e;
3144 edge_iterator ei;
3146 FOR_EACH_EDGE (e, ei, block->succs)
3147 nvptx_find_par (map, par, e->dest);
3149 return par;
3152 /* DFS walk the CFG looking for fork & join markers. Construct
3153 loop structures as we go. MAP is a mapping of basic blocks
3154 to head & tail markers, discovered when splitting blocks. This
3155 speeds up the discovery. We rely on the BB visited flag having
3156 been cleared when splitting blocks. */
3158 static parallel *
3159 nvptx_discover_pars (bb_insn_map_t *map)
3161 basic_block block;
3163 /* Mark exit blocks as visited. */
3164 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3165 block->flags |= BB_VISITED;
3167 /* And entry block as not. */
3168 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3169 block->flags &= ~BB_VISITED;
3171 parallel *par = nvptx_find_par (map, 0, block);
3173 if (dump_file)
3175 fprintf (dump_file, "\nLoops\n");
3176 nvptx_dump_pars (par, 0);
3177 fprintf (dump_file, "\n");
3180 return par;
3183 /* Analyse a group of BBs within a partitioned region and create N
3184 Single-Entry-Single-Exit regions. Some of those regions will be
3185 trivial ones consisting of a single BB. The blocks of a
3186 partitioned region might form a set of disjoint graphs -- because
3187    the region encloses a differently partitioned sub-region.
3189 We use the linear time algorithm described in 'Finding Regions Fast:
3190 Single Entry Single Exit and control Regions in Linear Time'
3191 Johnson, Pearson & Pingali. That algorithm deals with complete
3192 CFGs, where a back edge is inserted from END to START, and thus the
3193 problem becomes one of finding equivalent loops.
3195 In this case we have a partial CFG. We complete it by redirecting
3196 any incoming edge to the graph to be from an arbitrary external BB,
3197 and similarly redirecting any outgoing edge to be to that BB.
3198 Thus we end up with a closed graph.
3200 The algorithm works by building a spanning tree of an undirected
3201 graph and keeping track of back edges from nodes further from the
3202 root in the tree to nodes nearer to the root in the tree. In the
3203 description below, the root is up and the tree grows downwards.
3205 We avoid having to deal with degenerate back-edges to the same
3206 block, by splitting each BB into 3 -- one for input edges, one for
3207 the node itself and one for the output edges. Such back edges are
3208 referred to as 'Brackets'. Cycle equivalent nodes will have the
3209 same set of brackets.
3211 Determining bracket equivalency is done by maintaining a list of
3212 brackets in such a manner that the list length and final bracket
3213 uniquely identify the set.
3215 We use coloring to mark all BBs with cycle equivalency with the
3216 same color. This is the output of the 'Finding Regions Fast'
3217 algorithm. Notice it doesn't actually find the set of nodes within
3218    a particular region, just unordered sets of nodes that are the
3219 entries and exits of SESE regions.
3221 After determining cycle equivalency, we need to find the minimal
3222 set of SESE regions. Do this with a DFS coloring walk of the
3223 complete graph. We're either 'looking' or 'coloring'. When
3224 looking, and we're in the subgraph, we start coloring the color of
3225 the current node, and remember that node as the start of the
3226 current color's SESE region. Every time we go to a new node, we
3227    decrement the count of nodes with that color.  If it reaches zero,
3228 we remember that node as the end of the current color's SESE region
3229 and return to 'looking'. Otherwise we color the node the current
3230 color.
3232 This way we end up with coloring the inside of non-trivial SESE
3233 regions with the color of that region. */
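/* As a small worked example (illustrative): in a diamond CFG
   A->B, A->C, B->D, C->D, blocks A and D end up cycle equivalent --
   they carry identical bracket sets -- and so delimit one non-trivial
   SESE region, while B and C each form trivial single-block
   regions.  */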
3235 /* A pair of BBs. We use this to represent SESE regions. */
3236 typedef std::pair<basic_block, basic_block> bb_pair_t;
3237 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3239 /* A node in the undirected CFG. The discriminator SECOND indicates just
3240    above or just below the BB indicated by FIRST.  */
3241 typedef std::pair<basic_block, int> pseudo_node_t;
3243 /* A bracket indicates an edge towards the root of the spanning tree of the
3244 undirected graph. Each bracket has a color, determined
3245    from the current set of brackets.  */
3246 struct bracket
3248 pseudo_node_t back; /* Back target */
3250 /* Current color and size of set. */
3251 unsigned color;
3252 unsigned size;
3254 bracket (pseudo_node_t back_)
3255 : back (back_), color (~0u), size (~0u)
3259 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3261 if (length != size)
3263 size = length;
3264 color = color_counts.length ();
3265 color_counts.quick_push (0);
3267 color_counts[color]++;
3268 return color;
3272 typedef auto_vec<bracket> bracket_vec_t;
3274 /* Basic block info for finding SESE regions. */
3276 struct bb_sese
3278 int node; /* Node number in spanning tree. */
3279 int parent; /* Parent node number. */
3281 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3282 edges arrive at pseudo-node Ai and the outgoing edges leave at
3283 pseudo-node Ao. We have to remember which way we arrived at a
3284 particular node when generating the spanning tree. dir > 0 means
3285 we arrived at Ai, dir < 0 means we arrived at Ao. */
3286 int dir;
3288   /* Lowest numbered pseudo-node reached via a backedge from this
3289 node, or any descendant. */
3290 pseudo_node_t high;
3292 int color; /* Cycle-equivalence color */
3294 /* Stack of brackets for this node. */
3295 bracket_vec_t brackets;
3297 bb_sese (unsigned node_, unsigned p, int dir_)
3298 :node (node_), parent (p), dir (dir_)
3301 ~bb_sese ();
3303 /* Push a bracket ending at BACK. */
3304 void push (const pseudo_node_t &back)
3306 if (dump_file)
3307 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3308 back.first ? back.first->index : 0, back.second);
3309 brackets.safe_push (bracket (back));
3312 void append (bb_sese *child);
3313 void remove (const pseudo_node_t &);
3315 /* Set node's color. */
3316 void set_color (auto_vec<unsigned> &color_counts)
3318 color = brackets.last ().get_color (color_counts, brackets.length ());
3322 bb_sese::~bb_sese ()
3326 /* Destructively append CHILD's brackets. */
3328 void
3329 bb_sese::append (bb_sese *child)
3331 if (int len = child->brackets.length ())
3333 int ix;
3335 if (dump_file)
3337 for (ix = 0; ix < len; ix++)
3339 const pseudo_node_t &pseudo = child->brackets[ix].back;
3340 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3341 child->node, pseudo.first ? pseudo.first->index : 0,
3342 pseudo.second);
3345 if (!brackets.length ())
3346 std::swap (brackets, child->brackets);
3347 else
3349 brackets.reserve (len);
3350 for (ix = 0; ix < len; ix++)
3351 brackets.quick_push (child->brackets[ix]);
3356 /* Remove brackets that terminate at PSEUDO. */
3358 void
3359 bb_sese::remove (const pseudo_node_t &pseudo)
3361 unsigned removed = 0;
3362 int len = brackets.length ();
3364 for (int ix = 0; ix < len; ix++)
3366 if (brackets[ix].back == pseudo)
3368 if (dump_file)
3369 fprintf (dump_file, "Removing backedge %d:%+d\n",
3370 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3371 removed++;
3373 else if (removed)
3374 brackets[ix-removed] = brackets[ix];
3376 while (removed--)
3377 brackets.pop ();
3380 /* Accessors for BB's aux pointer. */
3381 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3382 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3384 /* DFS walk creating SESE data structures. Only cover nodes with
3385 BB_VISITED set. Append discovered blocks to LIST. We number in
3386 increments of 3 so that the above and below pseudo nodes can be
3387 implicitly numbered too. */
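/* With this numbering, a block numbered N can use N-1 and N+1 for its
   two pseudo nodes, leaving a gap before the next block's number
   (assumed from the increments-of-3 scheme above).  */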
3389 static int
3390 nvptx_sese_number (int n, int p, int dir, basic_block b,
3391 auto_vec<basic_block> *list)
3393 if (BB_GET_SESE (b))
3394 return n;
3396 if (dump_file)
3397 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3398 b->index, n, p, dir);
3400 BB_SET_SESE (b, new bb_sese (n, p, dir));
3401 p = n;
3403 n += 3;
3404 list->quick_push (b);
3406 /* First walk the nodes on the 'other side' of this node, then walk
3407 the nodes on the same side. */
3408 for (unsigned ix = 2; ix; ix--)
3410 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3411 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3412 : offsetof (edge_def, src));
3413 edge e;
3414       edge_iterator ei;
3416 FOR_EACH_EDGE (e, ei, edges)
3418 basic_block target = *(basic_block *)((char *)e + offset);
3420 if (target->flags & BB_VISITED)
3421 n = nvptx_sese_number (n, p, dir, target, list);
3423 dir = -dir;
3425 return n;
3428 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3429 EDGES are the outgoing edges and OFFSET is the offset to the src
3430 or dst block on the edges. */
3432 static void
3433 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3434 vec<edge, va_gc> *edges, size_t offset)
3436 edge e;
3437   edge_iterator ei;
3438 int hi_back = depth;
3439 pseudo_node_t node_back (0, depth);
3440 int hi_child = depth;
3441 pseudo_node_t node_child (0, depth);
3442 basic_block child = NULL;
3443 unsigned num_children = 0;
3444 int usd = -dir * sese->dir;
3446 if (dump_file)
3447 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3448 me->index, sese->node, dir);
3450 if (dir < 0)
3452 /* This is the above pseudo-child. It has the BB itself as an
3453 additional child node. */
3454 node_child = sese->high;
3455 hi_child = node_child.second;
3456 if (node_child.first)
3457 hi_child += BB_GET_SESE (node_child.first)->node;
3458 num_children++;
3461 /* Examine each edge.
3462 - if it is a child (a) append its bracket list and (b) record
3463 whether it is the child with the highest reaching bracket.
3464      - if it is an edge to an ancestor, record whether it's the highest
3465 reaching backlink. */
3466 FOR_EACH_EDGE (e, ei, edges)
3468 basic_block target = *(basic_block *)((char *)e + offset);
3470 if (bb_sese *t_sese = BB_GET_SESE (target))
3472 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3474 /* Child node. Append its bracket list. */
3475 num_children++;
3476 sese->append (t_sese);
3478          /* Compare its hi value.  */
3479 int t_hi = t_sese->high.second;
3481 if (basic_block child_hi_block = t_sese->high.first)
3482 t_hi += BB_GET_SESE (child_hi_block)->node;
3484 if (hi_child > t_hi)
3486 hi_child = t_hi;
3487 node_child = t_sese->high;
3488 child = target;
3491 else if (t_sese->node < sese->node + dir
3492 && !(dir < 0 && sese->parent == t_sese->node))
3494 /* Non-parental ancestor node -- a backlink. */
3495 int d = usd * t_sese->dir;
3496 int back = t_sese->node + d;
3498 if (hi_back > back)
3500 hi_back = back;
3501 node_back = pseudo_node_t (target, d);
3505 else
3506 { /* Fallen off graph, backlink to entry node. */
3507 hi_back = 0;
3508 node_back = pseudo_node_t (0, 0);
3512 /* Remove any brackets that terminate at this pseudo node. */
3513 sese->remove (pseudo_node_t (me, dir));
3515 /* Now push any backlinks from this pseudo node. */
3516 FOR_EACH_EDGE (e, ei, edges)
3518 basic_block target = *(basic_block *)((char *)e + offset);
3519 if (bb_sese *t_sese = BB_GET_SESE (target))
3521 if (t_sese->node < sese->node + dir
3522 && !(dir < 0 && sese->parent == t_sese->node))
3523 /* Non-parental ancestor node - backedge from me. */
3524 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3526 else
3528          /* Back edge to entry node.  */
3529 sese->push (pseudo_node_t (0, 0));
3533 /* If this node leads directly or indirectly to a no-return region of
3534 the graph, then fake a backedge to entry node. */
3535 if (!sese->brackets.length () || !edges || !edges->length ())
3537 hi_back = 0;
3538 node_back = pseudo_node_t (0, 0);
3539 sese->push (node_back);
3542 /* Record the highest reaching backedge from us or a descendant. */
3543 sese->high = hi_back < hi_child ? node_back : node_child;
3545 if (num_children > 1)
3547 /* There is more than one child -- this is a Y shaped piece of
3548 spanning tree. We have to insert a fake backedge from this
3549 node to the highest ancestor reached by not-the-highest
3550 reaching child. Note that there may be multiple children
3551 with backedges to the same highest node. That's ok and we
3552 insert the edge to that highest node. */
3553 hi_child = depth;
3554 if (dir < 0 && child)
3556 node_child = sese->high;
3557 hi_child = node_child.second;
3558 if (node_child.first)
3559 hi_child += BB_GET_SESE (node_child.first)->node;
3562 FOR_EACH_EDGE (e, ei, edges)
3564 basic_block target = *(basic_block *)((char *)e + offset);
3566 if (target == child)
3567 /* Ignore the highest child. */
3568 continue;
3570 bb_sese *t_sese = BB_GET_SESE (target);
3571 if (!t_sese)
3572 continue;
3573 if (t_sese->parent != sese->node)
3574 /* Not a child. */
3575 continue;
3577 /* Compare its hi value. */
3578 int t_hi = t_sese->high.second;
3580 if (basic_block child_hi_block = t_sese->high.first)
3581 t_hi += BB_GET_SESE (child_hi_block)->node;
3583 if (hi_child > t_hi)
3585 hi_child = t_hi;
3586 node_child = t_sese->high;
3590 sese->push (node_child);
3595 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3596 proceed to successors. Set SESE entry and exit nodes of
3597 REGIONS. */
3599 static void
3600 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3601 basic_block block, int coloring)
3603 bb_sese *sese = BB_GET_SESE (block);
3605 if (block->flags & BB_VISITED)
3607 /* If we've already encountered this block, either we must not
3608 be coloring, or it must have been colored the current color. */
3609 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3610 return;
3613 block->flags |= BB_VISITED;
3615 if (sese)
3617 if (coloring < 0)
3619 /* Start coloring a region. */
3620 regions[sese->color].first = block;
3621 coloring = sese->color;
3624 if (!--color_counts[sese->color] && sese->color == coloring)
3626 /* Found final block of SESE region. */
3627 regions[sese->color].second = block;
3628 coloring = -1;
3630 else
3631 /* Color the node, so we can assert on revisiting the node
3632 that the graph is indeed SESE. */
3633 sese->color = coloring;
3635 else
3636 /* Fallen off the subgraph, we cannot be coloring. */
3637 gcc_assert (coloring < 0);
3639 /* Walk each successor block. */
3640 if (block->succs && block->succs->length ())
3642 edge e;
3643 edge_iterator ei;
3645 FOR_EACH_EDGE (e, ei, block->succs)
3646 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3648 else
3649 gcc_assert (coloring < 0);
3652 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3653 end up with NULL entries in it. */
3655 static void
3656 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3658 basic_block block;
3659 int ix;
3661 /* First clear each BB of the whole function. */
3662 FOR_ALL_BB_FN (block, cfun)
3664 block->flags &= ~BB_VISITED;
3665 BB_SET_SESE (block, 0);
3668 /* Mark blocks in the function that are in this graph. */
3669 for (ix = 0; blocks.iterate (ix, &block); ix++)
3670 block->flags |= BB_VISITED;
3672 /* Counts of nodes assigned to each color. There cannot be more
3673 colors than blocks (and hopefully there will be fewer). */
3674 auto_vec<unsigned> color_counts;
3675 color_counts.reserve (blocks.length ());
3677 /* Worklist of nodes in the spanning tree. Again, there cannot be
3678 more nodes in the tree than blocks (there will be fewer if the
3679 CFG of blocks is disjoint). */
3680 auto_vec<basic_block> spanlist;
3681 spanlist.reserve (blocks.length ());
3683 /* Make sure every block has its cycle class determined. */
3684 for (ix = 0; blocks.iterate (ix, &block); ix++)
3686 if (BB_GET_SESE (block))
3687 /* We already met this block in an earlier graph solve. */
3688 continue;
3690 if (dump_file)
3691 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3693       /* Number the nodes reachable from block in initial DFS order.  */
3694 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3696 /* Now walk in reverse DFS order to find cycle equivalents. */
3697 while (spanlist.length ())
3699 block = spanlist.pop ();
3700 bb_sese *sese = BB_GET_SESE (block);
3702 /* Do the pseudo node below. */
3703 nvptx_sese_pseudo (block, sese, depth, +1,
3704 sese->dir > 0 ? block->succs : block->preds,
3705 (sese->dir > 0 ? offsetof (edge_def, dest)
3706 : offsetof (edge_def, src)));
3707 sese->set_color (color_counts);
3708 /* Do the pseudo node above. */
3709 nvptx_sese_pseudo (block, sese, depth, -1,
3710 sese->dir < 0 ? block->succs : block->preds,
3711 (sese->dir < 0 ? offsetof (edge_def, dest)
3712 : offsetof (edge_def, src)));
3714 if (dump_file)
3715 fprintf (dump_file, "\n");
3718 if (dump_file)
3720 unsigned count;
3721 const char *comma = "";
3723 fprintf (dump_file, "Found %d cycle equivalents\n",
3724 color_counts.length ());
3725 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3727 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3729 comma = "";
3730 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3731 if (BB_GET_SESE (block)->color == ix)
3733 block->flags |= BB_VISITED;
3734 fprintf (dump_file, "%s%d", comma, block->index);
3735              comma = ",";
3737 fprintf (dump_file, "}");
3738 comma = ", ";
3740 fprintf (dump_file, "\n");
3743 /* Now we've colored every block in the subgraph. We now need to
3744 determine the minimal set of SESE regions that cover that
3745 subgraph. Do this with a DFS walk of the complete function.
3746 During the walk we're either 'looking' or 'coloring'. When we
3747 reach the last node of a particular color, we stop coloring and
3748 return to looking. */
3750 /* There cannot be more SESE regions than colors. */
3751 regions.reserve (color_counts.length ());
3752 for (ix = color_counts.length (); ix--;)
3753 regions.quick_push (bb_pair_t (0, 0));
3755 for (ix = 0; blocks.iterate (ix, &block); ix++)
3756 block->flags &= ~BB_VISITED;
3758 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3760 if (dump_file)
3762 const char *comma = "";
3763 int len = regions.length ();
3765 fprintf (dump_file, "SESE regions:");
3766 for (ix = 0; ix != len; ix++)
3768 basic_block from = regions[ix].first;
3769 basic_block to = regions[ix].second;
3771 if (from)
3773 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3774 if (to != from)
3775 fprintf (dump_file, "->%d", to->index);
3777 int color = BB_GET_SESE (from)->color;
3779 /* Print the blocks within the region (excluding ends). */
3780 FOR_EACH_BB_FN (block, cfun)
3782 bb_sese *sese = BB_GET_SESE (block);
3784 if (sese && sese->color == color
3785 && block != from && block != to)
3786 fprintf (dump_file, ".%d", block->index);
3788 fprintf (dump_file, "}");
3790 comma = ",";
3792 fprintf (dump_file, "\n\n");
3795 for (ix = 0; blocks.iterate (ix, &block); ix++)
3796 delete BB_GET_SESE (block);
3799 #undef BB_SET_SESE
3800 #undef BB_GET_SESE
3802 /* Propagate live state at the start of a partitioned region. IS_CALL
3803 indicates whether the propagation is for a (partitioned) call
3804 instruction. BLOCK provides the live register information, and
3805 might not contain INSN. Propagation is inserted just after INSN. RW
3806 indicates whether we are reading and/or writing state. This
3807    separation is needed for worker-level propagation where we
3808 essentially do a spill & fill. FN is the underlying worker
3809    function to generate the propagation instructions for a single
3810 register. DATA is user data.
3812 Returns true if we didn't emit any instructions.
3814 We propagate the live register set for non-calls and the entire
3815 frame for calls and non-calls. We could do better by (a)
3816 propagating just the live set that is used within the partitioned
3817 regions and (b) only propagating stack entries that are used. The
3818 latter might be quite hard to determine. */
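/* Rough shape of the frame-copy sequence emitted below for a
   multi-word frame (pseudo code, illustrative):

     ptr = frame_pointer;  idx = <frame size in words>;
   loop:
     idx -= 1;
     tmp = [ptr];    // if RW includes PM_read
     <propagate tmp via FN>
     [ptr] = tmp;    // if RW includes PM_write
     ptr += 8;  if (idx != 0) goto loop;  */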
3820 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
3822 static bool
3823 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3824 propagate_mask rw, propagator_fn fn, void *data, bool vector)
3826 bitmap live = DF_LIVE_IN (block);
3827 bitmap_iterator iterator;
3828 unsigned ix;
3829 bool empty = true;
3831 /* Copy the frame array. */
3832 HOST_WIDE_INT fs = get_frame_size ();
3833 if (fs)
3835 rtx tmp = gen_reg_rtx (DImode);
3836 rtx idx = NULL_RTX;
3837 rtx ptr = gen_reg_rtx (Pmode);
3838 rtx pred = NULL_RTX;
3839 rtx_code_label *label = NULL;
3841 empty = false;
3842 /* The frame size might not be DImode compatible, but the frame
3843 array's declaration will be. So it's ok to round up here. */
3844 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3845 /* Detect single iteration loop. */
3846 if (fs == 1)
3847 fs = 0;
3849 start_sequence ();
3850 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3851 if (fs)
3853 idx = gen_reg_rtx (SImode);
3854 pred = gen_reg_rtx (BImode);
3855 label = gen_label_rtx ();
3857 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3858 /* Allow worker function to initialize anything needed. */
3859 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
3860 if (init)
3861 emit_insn (init);
3862 emit_label (label);
3863 LABEL_NUSES (label)++;
3864 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3866 if (rw & PM_read)
3867 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3868 emit_insn (fn (tmp, rw, fs, data, vector));
3869 if (rw & PM_write)
3870 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3871 if (fs)
3873 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3874 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3875 emit_insn (gen_br_true_uni (pred, label));
3876 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
3877 if (fini)
3878 emit_insn (fini);
3879 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3881 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3882 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3883 rtx cpy = get_insns ();
3884 end_sequence ();
3885 insn = emit_insn_after (cpy, insn);
3888 if (!is_call)
3889 /* Copy live registers. */
3890 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3892 rtx reg = regno_reg_rtx[ix];
3894 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3896 rtx bcast = fn (reg, rw, 0, data, vector);
3898 insn = emit_insn_after (bcast, insn);
3899 empty = false;
3902 return empty;
3905 /* Worker for nvptx_warp_propagate. */
3907 static rtx
3908 warp_prop_gen (rtx reg, propagate_mask pm,
3909 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
3910 bool ARG_UNUSED (vector))
3912 if (!(pm & PM_read_write))
3913 return 0;
3915 return nvptx_gen_warp_bcast (reg);
3918 /* Propagate state that is live at start of BLOCK across the vectors
3919 of a single warp. Propagation is inserted just after INSN.
3920 IS_CALL and return as for nvptx_propagate. */
3922 static bool
3923 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
3925 return nvptx_propagate (is_call, block, insn, PM_read_write,
3926 warp_prop_gen, 0, false);
3929 /* Worker for nvptx_shared_propagate. */
3931 static rtx
3932 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
3933 bool vector)
3935 broadcast_data_t *data = (broadcast_data_t *)data_;
3937 if (pm & PM_loop_begin)
3939 /* Starting a loop, initialize pointer. */
3940 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3942 oacc_bcast_align = MAX (oacc_bcast_align, align);
3943 data->offset = ROUND_UP (data->offset, align);
3945 data->ptr = gen_reg_rtx (Pmode);
3947 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3949 else if (pm & PM_loop_end)
3951 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3952 data->ptr = NULL_RTX;
3953 return clobber;
3955 else
3956 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
3959 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3960 indicates if this is just before partitioned mode (do spill), or
3961 just after it starts (do fill). Sequence is inserted just after
3962 INSN. IS_CALL and return as for nvptx_propagate. */
3964 static bool
3965 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
3966 rtx_insn *insn, bool vector)
3968 broadcast_data_t data;
3970 data.base = gen_reg_rtx (Pmode);
3971 data.offset = 0;
3972 data.ptr = NULL_RTX;
3974 bool empty = nvptx_propagate (is_call, block, insn,
3975 pre_p ? PM_read : PM_write, shared_prop_gen,
3976 &data, vector);
3977 gcc_assert (empty == !data.offset);
3978 if (data.offset)
3980 /* Stuff was emitted, initialize the base pointer now. */
3981 rtx init = gen_rtx_SET (data.base, oacc_bcast_sym);
3982 emit_insn_after (init, insn);
3984 oacc_bcast_size = MAX (oacc_bcast_size, data.offset);
3986 return empty;
3989 /* Emit a worker-level synchronization barrier. We use different
3990 markers for before and after synchronizations. */
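/* A sketch of the PTX this expands to, assuming the nvptx_barsync
   pattern prints its first operand as the barrier index (an
   illustration, not literal output):

     bar.sync 0;    // before-synchronization marker
     bar.sync 1;    // after-synchronization marker

   i.e. the two markers are distinguished by using distinct barrier
   indexes.  */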
3992 static rtx
3993 nvptx_cta_sync (bool after)
3995 return gen_nvptx_barsync (GEN_INT (after), GEN_INT (0));
3998 #if WORKAROUND_PTXJIT_BUG
3999 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4000 real insns. */
4002 static rtx_insn *
4003 bb_first_real_insn (basic_block bb)
4005 rtx_insn *insn;
4007 /* Find first real insn in BB. */
4008 FOR_BB_INSNS (bb, insn)
4009 if (INSN_P (insn))
4010 return insn;
4012 return 0;
4014 #endif
4016 /* Return true if INSN needs neutering. */
4018 static bool
4019 needs_neutering_p (rtx_insn *insn)
4021 if (!INSN_P (insn))
4022 return false;
4024 switch (recog_memoized (insn))
4026 case CODE_FOR_nvptx_fork:
4027 case CODE_FOR_nvptx_forked:
4028 case CODE_FOR_nvptx_joining:
4029 case CODE_FOR_nvptx_join:
4030 case CODE_FOR_nvptx_barsync:
4031 return false;
4032 default:
4033 return true;
4037 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4039 static bool
4040 verify_neutering_jumps (basic_block from,
4041 rtx_insn *vector_jump, rtx_insn *worker_jump,
4042 rtx_insn *vector_label, rtx_insn *worker_label)
4044 basic_block bb = from;
4045 rtx_insn *insn = BB_HEAD (bb);
4046 bool seen_worker_jump = false;
4047 bool seen_vector_jump = false;
4048 bool seen_worker_label = false;
4049 bool seen_vector_label = false;
4050 bool worker_neutered = false;
4051 bool vector_neutered = false;
4052 while (true)
4054 if (insn == worker_jump)
4056 seen_worker_jump = true;
4057 worker_neutered = true;
4058 gcc_assert (!vector_neutered);
4060 else if (insn == vector_jump)
4062 seen_vector_jump = true;
4063 vector_neutered = true;
4065 else if (insn == worker_label)
4067 seen_worker_label = true;
4068 gcc_assert (worker_neutered);
4069 worker_neutered = false;
4071 else if (insn == vector_label)
4073 seen_vector_label = true;
4074 gcc_assert (vector_neutered);
4075 vector_neutered = false;
4077 else if (INSN_P (insn))
4078 switch (recog_memoized (insn))
4080 case CODE_FOR_nvptx_barsync:
4081 gcc_assert (!vector_neutered && !worker_neutered);
4082 break;
4083 default:
4084 break;
4087 if (insn != BB_END (bb))
4088 insn = NEXT_INSN (insn);
4089 else if (JUMP_P (insn) && single_succ_p (bb)
4090 && !seen_vector_jump && !seen_worker_jump)
4092 bb = single_succ (bb);
4093 insn = BB_HEAD (bb);
4095 else
4096 break;
4099 gcc_assert (!(vector_jump && !seen_vector_jump));
4100 gcc_assert (!(worker_jump && !seen_worker_jump));
4102 if (seen_vector_label || seen_worker_label)
4104 gcc_assert (!(vector_label && !seen_vector_label));
4105 gcc_assert (!(worker_label && !seen_worker_label));
4107 return true;
4110 return false;
4113 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4115 static void
4116 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4117 rtx_insn *worker_label)
4119 basic_block bb = to;
4120 rtx_insn *insn = BB_END (bb);
4121 bool seen_worker_label = false;
4122 bool seen_vector_label = false;
4123 while (true)
4125 if (insn == worker_label)
4127 seen_worker_label = true;
4128 gcc_assert (!seen_vector_label);
4130 else if (insn == vector_label)
4131 seen_vector_label = true;
4132 else if (INSN_P (insn))
4133 switch (recog_memoized (insn))
4135 case CODE_FOR_nvptx_barsync:
4136 gcc_assert (!seen_vector_label && !seen_worker_label);
4137 break;
4140 if (insn != BB_HEAD (bb))
4141 insn = PREV_INSN (insn);
4142 else
4143 break;
4146 gcc_assert (!(vector_label && !seen_vector_label));
4147 gcc_assert (!(worker_label && !seen_worker_label));
4150 /* Single neutering according to MASK. FROM is the incoming block and
4151 TO is the outgoing block. These may be the same block. Insert at
4152 start of FROM:
4154 if (tid.<axis>) goto end.
4156 and insert before ending branch of TO (if there is such an insn):
4158 end:
4159 <possibly-broadcast-cond>
4160 <branch>
4162 We currently only use different FROM and TO when skipping an entire
4163 loop. We could do more if we detected superblocks. */
4165 static void
4166 nvptx_single (unsigned mask, basic_block from, basic_block to)
4168 rtx_insn *head = BB_HEAD (from);
4169 rtx_insn *tail = BB_END (to);
4170 unsigned skip_mask = mask;
4172 while (true)
4174 /* Find first insn of from block. */
4175 while (head != BB_END (from) && !needs_neutering_p (head))
4176 head = NEXT_INSN (head);
4178 if (from == to)
4179 break;
4181 if (!(JUMP_P (head) && single_succ_p (from)))
4182 break;
4184 basic_block jump_target = single_succ (from);
4185 if (!single_pred_p (jump_target))
4186 break;
4188 from = jump_target;
4189 head = BB_HEAD (from);
4192 /* Find last insn of the to block. */
4193 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4194 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4195 tail = PREV_INSN (tail);
4197 /* Detect if tail is a branch. */
4198 rtx tail_branch = NULL_RTX;
4199 rtx cond_branch = NULL_RTX;
4200 if (tail && INSN_P (tail))
4202 tail_branch = PATTERN (tail);
4203 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4204 tail_branch = NULL_RTX;
4205 else
4207 cond_branch = SET_SRC (tail_branch);
4208 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4209 cond_branch = NULL_RTX;
4213 if (tail == head)
4215 /* If this is empty, do nothing. */
4216 if (!head || !needs_neutering_p (head))
4217 return;
4219 if (cond_branch)
4221 /* If we're only doing vector single, there's no need to
4222 emit skip code because we'll not insert anything. */
4223 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4224 skip_mask = 0;
4226 else if (tail_branch)
4227 /* Block with only unconditional branch. Nothing to do. */
4228 return;
4231 /* Insert the vector test inside the worker test. */
4232 unsigned mode;
4233 rtx_insn *before = tail;
4234 rtx_insn *neuter_start = NULL;
4235 rtx_insn *worker_label = NULL, *vector_label = NULL;
4236 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4237 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4238 if (GOMP_DIM_MASK (mode) & skip_mask)
4240 rtx_code_label *label = gen_label_rtx ();
4241 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4242 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4243 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4245 if (!pred)
4247 pred = gen_reg_rtx (BImode);
4248 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4251 rtx br;
4252 if (mode == GOMP_DIM_VECTOR)
4253 br = gen_br_true (pred, label);
4254 else
4255 br = gen_br_true_uni (pred, label);
4256 if (neuter_start)
4257 neuter_start = emit_insn_after (br, neuter_start);
4258 else
4259 neuter_start = emit_insn_before (br, head);
4260 *mode_jump = neuter_start;
4262 LABEL_NUSES (label)++;
4263 rtx_insn *label_insn;
4264 if (tail_branch)
4266 label_insn = emit_label_before (label, before);
4267 before = label_insn;
4269 else
4271 label_insn = emit_label_after (label, tail);
4272 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4273 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4274 emit_insn_after (gen_exit (), label_insn);
4277 if (mode == GOMP_DIM_VECTOR)
4278 vector_label = label_insn;
4279 else
4280 worker_label = label_insn;
4283 /* Now deal with propagating the branch condition. */
4284 if (cond_branch)
4286 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4288 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4290 /* Vector mode only, do a shuffle. */
4291 #if WORKAROUND_PTXJIT_BUG
4292 /* The branch condition %rcond is propagated like this:
4295 .reg .u32 %x;
4296 mov.u32 %x,%tid.x;
4297 setp.ne.u32 %rnotvzero,%x,0;
4300 @%rnotvzero bra Lskip;
4301 setp.<op>.<type> %rcond,op1,op2;
4302 Lskip:
4303 selp.u32 %rcondu32,1,0,%rcond;
4304 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4305 setp.ne.u32 %rcond,%rcondu32,0;
4307 There seems to be a bug in the ptx JIT compiler (observed at driver
4308 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4309 unless %rcond is initialized to something before 'bra Lskip'. The
4310 bug is not observed with ptxas from cuda 8.0.61.
4312 It is true that the code is non-trivial: at Lskip, %rcond is
4313 uninitialized in threads 1-31, and after the selp the same holds
4314 for %rcondu32. But shfl propagates the defined value in thread 0
4315 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4316 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4318 There is nothing in the PTX spec to suggest that this is wrong, or
4319 to explain why the extra initialization is needed. So, we classify
4320 it as a JIT bug, and the extra initialization as a workaround:
4323 .reg .u32 %x;
4324 mov.u32 %x,%tid.x;
4325 setp.ne.u32 %rnotvzero,%x,0;
4328 +.reg .pred %rcond2;
4329 +setp.eq.u32 %rcond2, 1, 0;
4331 @%rnotvzero bra Lskip;
4332 setp.<op>.<type> %rcond,op1,op2;
4333 +mov.pred %rcond2, %rcond;
4334 Lskip:
4335 +mov.pred %rcond, %rcond2;
4336 selp.u32 %rcondu32,1,0,%rcond;
4337 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4338 setp.ne.u32 %rcond,%rcondu32,0;
4340 rtx_insn *label = PREV_INSN (tail);
4341 gcc_assert (label && LABEL_P (label));
4342 rtx tmp = gen_reg_rtx (BImode);
4343 emit_insn_before (gen_movbi (tmp, const0_rtx),
4344 bb_first_real_insn (from));
4345 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4346 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4347 #endif
4348 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
4350 else
4352 /* Includes worker mode, do spill & fill. By construction
4353 we should never have worker mode only. */
4354 broadcast_data_t data;
4356 data.base = oacc_bcast_sym;
4357 data.ptr = 0;
4359 oacc_bcast_size = MAX (oacc_bcast_size, GET_MODE_SIZE (SImode));
4361 data.offset = 0;
4362 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
4363 false),
4364 before);
4365 /* Barrier so other workers can see the write. */
4366 emit_insn_before (nvptx_cta_sync (false), tail);
4367 data.offset = 0;
4368 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
4369 false), tail);
4370 /* This barrier is needed to avoid worker zero clobbering
4371 the broadcast buffer before all the other workers have
4372 had a chance to read this instance of it. */
4373 emit_insn_before (nvptx_cta_sync (false), tail);
4376 extract_insn (tail);
4377 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4378 UNSPEC_BR_UNIFIED);
4379 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4382 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4383 vector_label, worker_label);
4384 if (!seen_label)
4385 verify_neutering_labels (to, vector_label, worker_label);
4388 /* PAR is a parallel that is being skipped in its entirety according to
4389 MASK. Treat this as skipping a superblock starting at forked
4390 and ending at joining. */
4392 static void
4393 nvptx_skip_par (unsigned mask, parallel *par)
4395 basic_block tail = par->join_block;
4396 gcc_assert (tail->preds->length () == 1);
4398 basic_block pre_tail = (*tail->preds)[0]->src;
4399 gcc_assert (pre_tail->succs->length () == 1);
4401 nvptx_single (mask, par->forked_block, pre_tail);
4404 /* If PAR has a single inner parallel and PAR itself only contains
4405 empty entry and exit blocks, swallow the inner PAR. */
4407 static void
4408 nvptx_optimize_inner (parallel *par)
4410 parallel *inner = par->inner;
4412 /* We mustn't be the outer dummy par. */
4413 if (!par->mask)
4414 return;
4416 /* We must have a single inner par. */
4417 if (!inner || inner->next)
4418 return;
4420 /* We must only contain 2 blocks ourselves -- the head and tail of
4421 the inner par. */
4422 if (par->blocks.length () != 2)
4423 return;
4425 /* The partitionings must be disjoint. As we only have vector and
4426 worker partitioning, this is sufficient to guarantee that the pars
4427 have adjacent partitioning. */
4428 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4429 /* This indicates malformed code generation. */
4430 return;
4432 /* The outer forked insn should be immediately followed by the inner
4433 fork insn. */
4434 rtx_insn *forked = par->forked_insn;
4435 rtx_insn *fork = BB_END (par->forked_block);
4437 if (NEXT_INSN (forked) != fork)
4438 return;
4439 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4441 /* The outer joining insn must immediately follow the inner join
4442 insn. */
4443 rtx_insn *joining = par->joining_insn;
4444 rtx_insn *join = inner->join_insn;
4445 if (NEXT_INSN (join) != joining)
4446 return;
4448 /* Preconditions met. Swallow the inner par. */
4449 if (dump_file)
4450 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4451 inner->mask, inner->forked_block->index,
4452 inner->join_block->index,
4453 par->mask, par->forked_block->index, par->join_block->index);
4455 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4457 par->blocks.reserve (inner->blocks.length ());
4458 while (inner->blocks.length ())
4459 par->blocks.quick_push (inner->blocks.pop ());
4461 par->inner = inner->inner;
4462 inner->inner = NULL;
4464 delete inner;
4467 /* Process the parallel PAR and all its contained
4468 parallels. We do everything but the neutering. Return mask of
4469 partitioned modes used within this parallel. */
4471 static unsigned
4472 nvptx_process_pars (parallel *par)
4474 if (nvptx_optimize)
4475 nvptx_optimize_inner (par);
4477 unsigned inner_mask = par->mask;
4479 /* Do the inner parallels first. */
4480 if (par->inner)
4482 par->inner_mask = nvptx_process_pars (par->inner);
4483 inner_mask |= par->inner_mask;
4486 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4488 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4490 nvptx_shared_propagate (false, is_call, par->forked_block,
4491 par->forked_insn, false);
4492 bool empty = nvptx_shared_propagate (true, is_call,
4493 par->forked_block, par->fork_insn,
4494 false);
4496 if (!empty || !is_call)
4498 /* Insert begin and end synchronizations. */
4499 emit_insn_before (nvptx_cta_sync (false), par->forked_insn);
4500 emit_insn_before (nvptx_cta_sync (false), par->join_insn);
4503 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4504 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
4506 /* Now do siblings. */
4507 if (par->next)
4508 inner_mask |= nvptx_process_pars (par->next);
4509 return inner_mask;
4512 /* Neuter the parallel described by PAR. We recurse in depth-first
4513 order. MODES are the partitioning of the execution and OUTER is
4514 the partitioning of the parallels we are contained in. */
4516 static void
4517 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4519 unsigned me = (par->mask
4520 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4521 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4522 unsigned skip_mask = 0, neuter_mask = 0;
4524 if (par->inner)
4525 nvptx_neuter_pars (par->inner, modes, outer | me);
4527 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4529 if ((outer | me) & GOMP_DIM_MASK (mode))
4530 {} /* Mode is partitioned: no neutering. */
4531 else if (!(modes & GOMP_DIM_MASK (mode)))
4532 {} /* Mode is not used: nothing to do. */
4533 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4534 || !par->forked_insn)
4535 /* Partitioned in inner parallels, or we're not partitioned
4536 at all: neuter individual blocks. */
4537 neuter_mask |= GOMP_DIM_MASK (mode);
4538 else if (!par->parent || !par->parent->forked_insn
4539 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4540 /* Parent isn't a parallel, or already contains this partitioning:
4541 skip the parallel at this level. */
4542 skip_mask |= GOMP_DIM_MASK (mode);
4543 else
4544 {} /* Parent will skip this parallel itself. */
4547 if (neuter_mask)
4549 int ix, len;
4551 if (nvptx_optimize)
4553 /* Neuter whole SESE regions. */
4554 bb_pair_vec_t regions;
4556 nvptx_find_sese (par->blocks, regions);
4557 len = regions.length ();
4558 for (ix = 0; ix != len; ix++)
4560 basic_block from = regions[ix].first;
4561 basic_block to = regions[ix].second;
4563 if (from)
4564 nvptx_single (neuter_mask, from, to);
4565 else
4566 gcc_assert (!to);
4569 else
4571 /* Neuter each BB individually. */
4572 len = par->blocks.length ();
4573 for (ix = 0; ix != len; ix++)
4575 basic_block block = par->blocks[ix];
4577 nvptx_single (neuter_mask, block, block);
4582 if (skip_mask)
4583 nvptx_skip_par (skip_mask, par);
4585 if (par->next)
4586 nvptx_neuter_pars (par->next, modes, outer);
4589 static void
4590 populate_offload_attrs (offload_attrs *oa)
4592 tree attr = oacc_get_fn_attrib (current_function_decl);
4593 tree dims = TREE_VALUE (attr);
4594 unsigned ix;
4596 oa->mask = 0;
4598 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4600 tree t = TREE_VALUE (dims);
4601 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4602 tree allowed = TREE_PURPOSE (dims);
4604 if (size != 1 && !(allowed && integer_zerop (allowed)))
4605 oa->mask |= GOMP_DIM_MASK (ix);
4607 switch (ix)
4609 case GOMP_DIM_GANG:
4610 oa->num_gangs = size;
4611 break;
4613 case GOMP_DIM_WORKER:
4614 oa->num_workers = size;
4615 break;
4617 case GOMP_DIM_VECTOR:
4618 oa->vector_length = size;
4619 break;
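/* For example (a sketch): with launch dimensions gang = 1,
   worker = 16, vector = 32 recorded in the "oacc function" attribute,
   the loop above yields num_gangs == 1, num_workers == 16 and
   vector_length == 32, with OA->mask covering only the worker and
   vector axes -- a dimension of size 1 needs no partitioning, so gang
   stays out of the mask.  */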
4624 #if WORKAROUND_PTXJIT_BUG_2
4625 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4626 is needed in the nvptx target because the branches generated for
4627 partitioning are NONJUMP_INSN_P, not JUMP_P. */
4629 static rtx
4630 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4632 rtx pat;
4633 if ((strict && !JUMP_P (insn))
4634 || (!strict && !INSN_P (insn)))
4635 return NULL_RTX;
4636 pat = PATTERN (insn);
4638 /* The set is allowed to appear either as the insn pattern or
4639 the first set in a PARALLEL. */
4640 if (GET_CODE (pat) == PARALLEL)
4641 pat = XVECEXP (pat, 0, 0);
4642 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4643 return pat;
4645 return NULL_RTX;
4648 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4650 static rtx
4651 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4653 rtx x = nvptx_pc_set (insn, strict);
4655 if (!x)
4656 return NULL_RTX;
4657 x = SET_SRC (x);
4658 if (GET_CODE (x) == LABEL_REF)
4659 return x;
4660 if (GET_CODE (x) != IF_THEN_ELSE)
4661 return NULL_RTX;
4662 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4663 return XEXP (x, 1);
4664 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4665 return XEXP (x, 2);
4666 return NULL_RTX;
4669 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4670 insn in between the branch and the label. This works around a JIT bug
4671 observed at driver version 384.111, at -O0 for sm_50. */
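/* A sketch of the shape being patched, with hypothetical predicate
   and label names:

       @%p0 bra $L1;
     $L1:

   becomes

       @%p0 bra $L1;
       <dummy insn from gen_fake_nop>
     $L1:
*/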
4673 static void
4674 prevent_branch_around_nothing (void)
4676 rtx_insn *seen_label = NULL;
4677 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4679 if (INSN_P (insn) && condjump_p (insn))
4681 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4682 continue;
4685 if (seen_label == NULL)
4686 continue;
4688 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4689 continue;
4691 if (INSN_P (insn))
4692 switch (recog_memoized (insn))
4694 case CODE_FOR_nvptx_fork:
4695 case CODE_FOR_nvptx_forked:
4696 case CODE_FOR_nvptx_joining:
4697 case CODE_FOR_nvptx_join:
4698 continue;
4699 default:
4700 seen_label = NULL;
4701 continue;
4704 if (LABEL_P (insn) && insn == seen_label)
4705 emit_insn_before (gen_fake_nop (), insn);
4707 seen_label = NULL;
4710 #endif
4712 #if WORKAROUND_PTXJIT_BUG_3
4713 /* Insert two membar.cta insns in between two subsequent bar.sync insns. This
4714 works around a hang observed at driver version 390.48 for sm_50. */
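/* A sketch of the rewrite, assuming the nvptx_membar_cta pattern
   prints "membar.cta" (illustration only):

     bar.sync 0;          bar.sync 0;
                    -->   membar.cta;
                          membar.cta;
     bar.sync 1;          bar.sync 1;
*/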
4716 static void
4717 workaround_barsyncs (void)
4719 bool seen_barsync = false;
4720 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4722 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4724 if (seen_barsync)
4726 emit_insn_before (gen_nvptx_membar_cta (), insn);
4727 emit_insn_before (gen_nvptx_membar_cta (), insn);
4730 seen_barsync = true;
4731 continue;
4734 if (!seen_barsync)
4735 continue;
4737 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4738 continue;
4739 else if (INSN_P (insn))
4740 switch (recog_memoized (insn))
4742 case CODE_FOR_nvptx_fork:
4743 case CODE_FOR_nvptx_forked:
4744 case CODE_FOR_nvptx_joining:
4745 case CODE_FOR_nvptx_join:
4746 continue;
4747 default:
4748 break;
4751 seen_barsync = false;
4754 #endif
4756 /* PTX-specific reorganization
4757 - Split blocks at fork and join instructions
4758 - Compute live registers
4759 - Mark now-unused registers, so function begin doesn't declare
4760 unused registers.
4761 - Insert state propagation when entering partitioned mode
4762 - Insert neutering instructions when in single mode
4763 - Replace subregs with suitable sequences.
4766 static void
4767 nvptx_reorg (void)
4769 /* We are freeing block_for_insn in the toplev to keep compatibility
4770 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4771 compute_bb_for_insn ();
4773 thread_prologue_and_epilogue_insns ();
4775 /* Split blocks and record interesting unspecs. */
4776 bb_insn_map_t bb_insn_map;
4778 nvptx_split_blocks (&bb_insn_map);
4780 /* Compute live regs */
4781 df_clear_flags (DF_LR_RUN_DCE);
4782 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4783 df_live_add_problem ();
4784 df_live_set_all_dirty ();
4785 df_analyze ();
4786 regstat_init_n_sets_and_refs ();
4788 if (dump_file)
4789 df_dump (dump_file);
4791 /* Mark unused regs as unused. */
4792 int max_regs = max_reg_num ();
4793 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4794 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4795 regno_reg_rtx[i] = const0_rtx;
4797 /* Determine launch dimensions of the function. If it is not an
4798 offloaded function (i.e. this is a regular compilation), the
4799 function has no neutering. */
4800 tree attr = oacc_get_fn_attrib (current_function_decl);
4801 if (attr)
4803 /* If we determined this mask before RTL expansion, we could
4804 elide emission of some levels of forks and joins. */
4805 offload_attrs oa;
4807 populate_offload_attrs (&oa);
4809 /* If there is worker neutering, there must be vector
4810 neutering. Otherwise the hardware will fail. */
4811 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4812 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4814 /* Discover & process partitioned regions. */
4815 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4816 nvptx_process_pars (pars);
4817 nvptx_neuter_pars (pars, oa.mask, 0);
4818 delete pars;
4821 /* Replace subregs. */
4822 nvptx_reorg_subreg ();
4824 if (TARGET_UNIFORM_SIMT)
4825 nvptx_reorg_uniform_simt ();
4827 #if WORKAROUND_PTXJIT_BUG_2
4828 prevent_branch_around_nothing ();
4829 #endif
4831 #if WORKAROUND_PTXJIT_BUG_3
4832 workaround_barsyncs ();
4833 #endif
4835 regstat_free_n_sets_and_refs ();
4837 df_finish_pass (true);
4840 /* Handle a "kernel" attribute; arguments as in
4841 struct attribute_spec.handler. */
4843 static tree
4844 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4845 int ARG_UNUSED (flags), bool *no_add_attrs)
4847 tree decl = *node;
4849 if (TREE_CODE (decl) != FUNCTION_DECL)
4851 error ("%qE attribute only applies to functions", name);
4852 *no_add_attrs = true;
4854 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4856 error ("%qE attribute requires a void return type", name);
4857 *no_add_attrs = true;
4860 return NULL_TREE;
4863 /* Handle a "shared" attribute; arguments as in
4864 struct attribute_spec.handler. */
4866 static tree
4867 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4868 int ARG_UNUSED (flags), bool *no_add_attrs)
4870 tree decl = *node;
4872 if (TREE_CODE (decl) != VAR_DECL)
4874 error ("%qE attribute only applies to variables", name);
4875 *no_add_attrs = true;
4877 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4879 error ("%qE attribute not allowed with auto storage class", name);
4880 *no_add_attrs = true;
4883 return NULL_TREE;
4886 /* Table of valid machine attributes. */
4887 static const struct attribute_spec nvptx_attribute_table[] =
4889 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4890 affects_type_identity, handler, exclude } */
4891 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
4892 NULL },
4893 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
4894 NULL },
4895 { NULL, 0, 0, false, false, false, false, NULL, NULL }
4898 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4900 static HOST_WIDE_INT
4901 nvptx_vector_alignment (const_tree type)
4903 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4905 return MIN (align, BIGGEST_ALIGNMENT);
4908 /* Indicate that INSN cannot be duplicated. */
4910 static bool
4911 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4913 switch (recog_memoized (insn))
4915 case CODE_FOR_nvptx_shufflesi:
4916 case CODE_FOR_nvptx_shufflesf:
4917 case CODE_FOR_nvptx_barsync:
4918 case CODE_FOR_nvptx_fork:
4919 case CODE_FOR_nvptx_forked:
4920 case CODE_FOR_nvptx_joining:
4921 case CODE_FOR_nvptx_join:
4922 return true;
4923 default:
4924 return false;
4928 /* Section anchors do not work. Initialization for flag_section_anchor
4929 probes the existence of the anchoring target hooks and prevents
4930 anchoring if they don't exist. However, we may be used together with
4931 a host-side compiler that does support anchoring, and hence see
4932 the anchor flag set (as it's not recalculated). So provide an
4933 implementation denying anchoring. */
4935 static bool
4936 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4938 return false;
4941 /* Record a symbol for mkoffload to enter into the mapping table. */
4943 static void
4944 nvptx_record_offload_symbol (tree decl)
4946 switch (TREE_CODE (decl))
4948 case VAR_DECL:
4949 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4950 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4951 break;
4953 case FUNCTION_DECL:
4955 tree attr = oacc_get_fn_attrib (decl);
4956 /* OpenMP offloading does not set this attribute. */
4957 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4959 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4960 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4962 for (; dims; dims = TREE_CHAIN (dims))
4964 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4966 gcc_assert (!TREE_PURPOSE (dims));
4967 fprintf (asm_out_file, ", %#x", size);
4970 fprintf (asm_out_file, "\n");
4972 break;
4974 default:
4975 gcc_unreachable ();
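/* With hypothetical symbol names, mkoffload thus sees lines such as:

     //:VAR_MAP "a_mapped_var"
     //:FUNC_MAP "foo._omp_fn.0", 0x1, 0x20, 0x20

   where the trailing numbers are the gang, worker and vector launch
   dimensions of the offloaded function (a sketch, not actual
   compiler output).  */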
4979 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4980 at the start of a file. */
4982 static void
4983 nvptx_file_start (void)
4985 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4986 fputs ("\t.version\t3.1\n", asm_out_file);
4987 if (TARGET_SM35)
4988 fputs ("\t.target\tsm_35\n", asm_out_file);
4989 else
4990 fputs ("\t.target\tsm_30\n", asm_out_file);
4991 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4992 fputs ("// END PREAMBLE\n", asm_out_file);
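/* E.g. for sm_35 with 64-bit addressing the preamble reads:

     // BEGIN PREAMBLE
     	.version	3.1
     	.target	sm_35
     	.address_size 64
     // END PREAMBLE
*/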
4995 /* Emit a declaration for a worker and vector-level buffer in .shared
4996 memory. */
4998 static void
4999 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5001 const char *name = XSTR (sym, 0);
5003 write_var_marker (file, true, false, name);
5004 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5005 align, name, size);
5008 /* Write out the function declarations we've collected and declare storage
5009 for the broadcast buffer. */
5011 static void
5012 nvptx_file_end (void)
5014 hash_table<tree_hasher>::iterator iter;
5015 tree decl;
5016 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5017 nvptx_record_fndecl (decl);
5018 fputs (func_decls.str().c_str(), asm_out_file);
5020 if (oacc_bcast_size)
5021 write_shared_buffer (asm_out_file, oacc_bcast_sym,
5022 oacc_bcast_align, oacc_bcast_size);
5024 if (worker_red_size)
5025 write_shared_buffer (asm_out_file, worker_red_sym,
5026 worker_red_align, worker_red_size);
5028 if (need_softstack_decl)
5030 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5031 /* 32 is the maximum number of warps in a block. Even though it's an
5032 external declaration, emit the array size explicitly; otherwise, it
5033 may fail at PTX JIT time if the definition is later in link order. */
5034 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5035 POINTER_SIZE);
5037 if (need_unisimt_decl)
5039 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5040 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5044 /* Expander for the shuffle builtins. */
5046 static rtx
5047 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5049 if (ignore)
5050 return target;
5052 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5053 NULL_RTX, mode, EXPAND_NORMAL);
5054 if (!REG_P (src))
5055 src = copy_to_mode_reg (mode, src);
5057 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5058 NULL_RTX, SImode, EXPAND_NORMAL);
5059 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5060 NULL_RTX, SImode, EXPAND_NORMAL);
5062 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5063 idx = copy_to_mode_reg (SImode, idx);
5065 rtx pat = nvptx_gen_shuffle (target, src, idx,
5066 (nvptx_shuffle_kind) INTVAL (op));
5067 if (pat)
5068 emit_insn (pat);
5070 return target;
5073 /* Worker reduction address expander. */
5075 static rtx
5076 nvptx_expand_shared_addr (tree exp, rtx target,
5077 machine_mode ARG_UNUSED (mode), int ignore)
5079 if (ignore)
5080 return target;
5082 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5083 worker_red_align = MAX (worker_red_align, align);
5085 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5086 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5087 worker_red_size = MAX (worker_red_size, size + offset);
5089 rtx addr = worker_red_sym;
5090 if (offset)
5092 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5093 addr = gen_rtx_CONST (Pmode, addr);
5096 emit_move_insn (target, addr);
5098 return target;
5101 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5102 not require taking the address of any object, other than the memory
5103 cell being operated on. */
5105 static rtx
5106 nvptx_expand_cmp_swap (tree exp, rtx target,
5107 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5109 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5111 if (!target)
5112 target = gen_reg_rtx (mode);
5114 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5115 NULL_RTX, Pmode, EXPAND_NORMAL);
5116 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5117 NULL_RTX, mode, EXPAND_NORMAL);
5118 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5119 NULL_RTX, mode, EXPAND_NORMAL);
5120 rtx pat;
5122 mem = gen_rtx_MEM (mode, mem);
5123 if (!REG_P (cmp))
5124 cmp = copy_to_mode_reg (mode, cmp);
5125 if (!REG_P (src))
5126 src = copy_to_mode_reg (mode, src);
5128 if (mode == SImode)
5129 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5130 else
5131 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5133 emit_insn (pat);
5135 return target;
5139 /* Codes for all the NVPTX builtins. */
5140 enum nvptx_builtins
5142 NVPTX_BUILTIN_SHUFFLE,
5143 NVPTX_BUILTIN_SHUFFLELL,
5144 NVPTX_BUILTIN_WORKER_ADDR,
5145 NVPTX_BUILTIN_CMP_SWAP,
5146 NVPTX_BUILTIN_CMP_SWAPLL,
5147 NVPTX_BUILTIN_MAX
5150 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5152 /* Return the NVPTX builtin for CODE. */
5154 static tree
5155 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5157 if (code >= NVPTX_BUILTIN_MAX)
5158 return error_mark_node;
5160 return nvptx_builtin_decls[code];
5163 /* Set up all builtin functions for this target. */
5165 static void
5166 nvptx_init_builtins (void)
5168 #define DEF(ID, NAME, T) \
5169 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5170 = add_builtin_function ("__builtin_nvptx_" NAME, \
5171 build_function_type_list T, \
5172 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5173 #define ST sizetype
5174 #define UINT unsigned_type_node
5175 #define LLUINT long_long_unsigned_type_node
5176 #define PTRVOID ptr_type_node
5178 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5179 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5180 DEF (WORKER_ADDR, "worker_addr",
5181 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5182 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5183 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5185 #undef DEF
5186 #undef ST
5187 #undef UINT
5188 #undef LLUINT
5189 #undef PTRVOID
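/* A sketch of how compiler-generated code uses these builtins
   (hypothetical values; the calls are emitted internally during
   OpenACC/OpenMP lowering, e.g. by the reduction expanders below):

     unsigned v = __builtin_nvptx_shuffle (x, 0, SHUFFLE_IDX);
     void *red = __builtin_nvptx_worker_addr (offset, size, align);
     unsigned old = __builtin_nvptx_cmp_swap (ptr, expected, desired);
*/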
5192 /* Expand an expression EXP that calls a built-in function,
5193 with result going to TARGET if that's convenient
5194 (and in mode MODE if that's convenient).
5195 SUBTARGET may be used as the target for computing one of EXP's operands.
5196 IGNORE is nonzero if the value is to be ignored. */
5198 static rtx
5199 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5200 machine_mode mode, int ignore)
5202 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5203 switch (DECL_FUNCTION_CODE (fndecl))
5205 case NVPTX_BUILTIN_SHUFFLE:
5206 case NVPTX_BUILTIN_SHUFFLELL:
5207 return nvptx_expand_shuffle (exp, target, mode, ignore);
5209 case NVPTX_BUILTIN_WORKER_ADDR:
5210 return nvptx_expand_shared_addr (exp, target, mode, ignore);
5212 case NVPTX_BUILTIN_CMP_SWAP:
5213 case NVPTX_BUILTIN_CMP_SWAPLL:
5214 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5216 default: gcc_unreachable ();
5220 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5222 static int
5223 nvptx_simt_vf ()
5225 return PTX_WARP_SIZE;
5228 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5229 DIMS has changed. */
5231 static void
5232 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level)
5234 bool oacc_default_dims_p = false;
5235 bool oacc_min_dims_p = false;
5236 bool offload_region_p = false;
5237 bool routine_p = false;
5238 bool routine_seq_p = false;
5240 if (decl == NULL_TREE)
5242 if (fn_level == -1)
5243 oacc_default_dims_p = true;
5244 else if (fn_level == -2)
5245 oacc_min_dims_p = true;
5246 else
5247 gcc_unreachable ();
5249 else if (fn_level == -1)
5250 offload_region_p = true;
5251 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5253 routine_p = true;
5254 routine_seq_p = fn_level == GOMP_DIM_MAX;
5256 else
5257 gcc_unreachable ();
5259 if (routine_p)
5261 /* OpenACC routines in C arrive here with the following attributes
5262 (omitting the 'omp declare target'):
5263 seq : __attribute__((oacc function (0 1, 0 1, 0 1)))
5264 vector: __attribute__((oacc function (0 1, 0 1, 1 0)))
5265 worker: __attribute__((oacc function (0 1, 1 0, 1 0)))
5266 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5268 If we take f.i. the oacc function attribute of the worker routine
5269 (0 1, 1 0, 1 0), then:
5270 - the slice (0, 1, 1) is interpreted by oacc_fn_attrib_level as
5271 meaning: worker routine, that is:
5272 - can't contain gang loop (0),
5273 - can contain worker loop (1),
5274 - can contain vector loop (1).
5275 - the slice (1, 0, 0) is interpreted by oacc_validate_dims as the
5276 dimensions: gang: 1, worker: 0, vector: 0.
5278 OTOH, routines in Fortran arrive here with these attributes:
5279 seq : __attribute__((oacc function (0 0, 0 0, 0 0)))
5280 vector: __attribute__((oacc function (0 0, 0 0, 1 0)))
5281 worker: __attribute__((oacc function (0 0, 1 0, 1 0)))
5282 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5283 that is, the same as for C but with the dimensions set to 0.
5285 This is due to a bug in the Fortran front-end: PR72741. Work around
5286 this bug by forcing the dimensions to be the same in Fortran as for C,
5287 to be able to handle C and Fortran routines uniformly in this
5288 function. */
5289 dims[GOMP_DIM_VECTOR] = fn_level > GOMP_DIM_VECTOR ? 1 : 0;
5290 dims[GOMP_DIM_WORKER] = fn_level > GOMP_DIM_WORKER ? 1 : 0;
5291 dims[GOMP_DIM_GANG] = fn_level > GOMP_DIM_GANG ? 1 : 0;
5294 if (oacc_min_dims_p)
5296 gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5297 gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5298 gcc_assert (dims[GOMP_DIM_GANG] == 1);
5300 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5301 return;
5304 if (routine_p)
5306 if (!routine_seq_p)
5307 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5309 return;
5312 if (oacc_default_dims_p)
5314 /* -1 : not set
5315 0 : set at runtime, f.i. -fopenacc-dims=-
5316 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
5317 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5318 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5319 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5321 /* But -fopenacc-dims=- is not yet supported on trunk. */
5322 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5323 gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5324 gcc_assert (dims[GOMP_DIM_GANG] != 0);
5327 if (offload_region_p)
5329 /* -1 : not set
5330 0 : set using variable, f.i. num_gangs (n)
5331 >= 1: set using constant, f.i. num_gangs (1). */
5332 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5333 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5334 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5337 if (dims[GOMP_DIM_VECTOR] >= 0
5338 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
5340 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5341 dims[GOMP_DIM_VECTOR]
5342 ? G_("using vector_length (%d), ignoring %d")
5343 : G_("using vector_length (%d), ignoring runtime setting"),
5344 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
5345 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5348 /* Check that the number of workers is not too large. */
5349 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5351 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5352 "using num_workers (%d), ignoring %d",
5353 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
5354 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5357 if (oacc_default_dims_p)
5359 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5360 if (dims[GOMP_DIM_WORKER] < 0)
5361 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5362 if (dims[GOMP_DIM_GANG] < 0)
5363 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5367 /* Validate compute dimensions of an OpenACC offload or routine, fill
5368 in non-unity defaults. FN_LEVEL indicates the level at which a
5369 routine might spawn a loop. It is negative for non-routines. If
5370 DECL is null, we are validating the default dimensions. */
5372 static bool
5373 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
5375 int old_dims[GOMP_DIM_MAX];
5376 unsigned int i;
5378 for (i = 0; i < GOMP_DIM_MAX; ++i)
5379 old_dims[i] = dims[i];
5381 nvptx_goacc_validate_dims_1 (decl, dims, fn_level);
5383 for (i = 0; i < GOMP_DIM_MAX; ++i)
5384 if (old_dims[i] != dims[i])
5385 return true;
5387 return false;
5390 /* Return maximum dimension size, or zero for unbounded. */
5392 static int
5393 nvptx_dim_limit (int axis)
5395 switch (axis)
5397 case GOMP_DIM_VECTOR:
5398 return PTX_VECTOR_LENGTH;
5400 default:
5401 break;
5403 return 0;
5406 /* Determine whether fork & joins are needed. */
5408 static bool
5409 nvptx_goacc_fork_join (gcall *call, const int dims[],
5410 bool ARG_UNUSED (is_fork))
5412 tree arg = gimple_call_arg (call, 2);
5413 unsigned axis = TREE_INT_CST_LOW (arg);
5415 /* We only care about worker and vector partitioning. */
5416 if (axis < GOMP_DIM_WORKER)
5417 return false;
5419 /* If the size is 1, there's no partitioning. */
5420 if (dims[axis] == 1)
5421 return false;
5423 return true;
5426 /* Generate a PTX builtin function call that returns the address in
5427 the worker reduction buffer at OFFSET. TYPE is the type of the
5428 data at that location. */
5430 static tree
5431 nvptx_get_shared_red_addr (tree type, tree offset)
5433 machine_mode mode = TYPE_MODE (type);
5434 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
5435 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5436 tree align = build_int_cst (unsigned_type_node,
5437 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5438 tree call = build_call_expr (fndecl, 3, offset, size, align);
5440 return fold_convert (build_pointer_type (type), call);
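/* The expression built above is essentially (a sketch):

     (TYPE *) __builtin_nvptx_worker_addr (offset, sizeof (TYPE),
					   __alignof__ (TYPE))

   with size and alignment derived from TYPE's machine mode.  */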
5443 /* Emit a SHFL.DOWN using shift amount SHIFT of VAR into DEST_VAR. This
5444 function will cast the variable if necessary. */
5446 static void
5447 nvptx_generate_vector_shuffle (location_t loc,
5448 tree dest_var, tree var, unsigned shift,
5449 gimple_seq *seq)
5451 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5452 tree_code code = NOP_EXPR;
5453 tree arg_type = unsigned_type_node;
5454 tree var_type = TREE_TYPE (var);
5455 tree dest_type = var_type;
5457 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5458 var_type = TREE_TYPE (var_type);
5460 if (TREE_CODE (var_type) == REAL_TYPE)
5461 code = VIEW_CONVERT_EXPR;
5463 if (TYPE_SIZE (var_type)
5464 == TYPE_SIZE (long_long_unsigned_type_node))
5466 fn = NVPTX_BUILTIN_SHUFFLELL;
5467 arg_type = long_long_unsigned_type_node;
5470 tree call = nvptx_builtin_decl (fn, true);
5471 tree bits = build_int_cst (unsigned_type_node, shift);
5472 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5473 tree expr;
5475 if (var_type != dest_type)
5477 /* Do real and imaginary parts separately. */
5478 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5479 real = fold_build1 (code, arg_type, real);
5480 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5481 real = fold_build1 (code, var_type, real);
5483 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5484 imag = fold_build1 (code, arg_type, imag);
5485 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5486 imag = fold_build1 (code, var_type, imag);
5488 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5490 else
5492 expr = fold_build1 (code, arg_type, var);
5493 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5494 expr = fold_build1 (code, dest_type, expr);
5497 gimplify_assign (dest_var, expr, seq);
5500 /* Lazily generate the global lock var decl and return its address. */
5502 static tree
5503 nvptx_global_lock_addr ()
5505 tree v = global_lock_var;
5507 if (!v)
5509 tree name = get_identifier ("__reduction_lock");
5510 tree type = build_qualified_type (unsigned_type_node,
5511 TYPE_QUAL_VOLATILE);
5512 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5513 global_lock_var = v;
5514 DECL_ARTIFICIAL (v) = 1;
5515 DECL_EXTERNAL (v) = 1;
5516 TREE_STATIC (v) = 1;
5517 TREE_PUBLIC (v) = 1;
5518 TREE_USED (v) = 1;
5519 mark_addressable (v);
5520 mark_decl_referenced (v);
5523 return build_fold_addr_expr (v);
5526 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5527 GSI. We use a lockless scheme for nearly all cases, which looks
5528 like:
5529 actual = initval(OP);
5530 do {
5531 guess = actual;
5532 write = guess OP myval;
5533 actual = cmp&swap (ptr, guess, write)
5534 } while (actual bit-different-to guess);
5535 return write;
5537 This relies on a cmp&swap instruction, which is available for 32-
5538 and 64-bit types. Larger types must use a locking scheme. */
5540 static tree
5541 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5542 tree ptr, tree var, tree_code op)
5544 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5545 tree_code code = NOP_EXPR;
5546 tree arg_type = unsigned_type_node;
5547 tree var_type = TREE_TYPE (var);
5549 if (TREE_CODE (var_type) == COMPLEX_TYPE
5550 || TREE_CODE (var_type) == REAL_TYPE)
5551 code = VIEW_CONVERT_EXPR;
5553 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5555 arg_type = long_long_unsigned_type_node;
5556 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5559 tree swap_fn = nvptx_builtin_decl (fn, true);
5561 gimple_seq init_seq = NULL;
5562 tree init_var = make_ssa_name (arg_type);
5563 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5564 init_expr = fold_build1 (code, arg_type, init_expr);
5565 gimplify_assign (init_var, init_expr, &init_seq);
5566 gimple *init_end = gimple_seq_last (init_seq);
5568 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5570 /* Split the block just after the init stmts. */
5571 basic_block pre_bb = gsi_bb (*gsi);
5572 edge pre_edge = split_block (pre_bb, init_end);
5573 basic_block loop_bb = pre_edge->dest;
5574 pre_bb = pre_edge->src;
5575 /* Reset the iterator. */
5576 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5578 tree expect_var = make_ssa_name (arg_type);
5579 tree actual_var = make_ssa_name (arg_type);
5580 tree write_var = make_ssa_name (arg_type);
5582 /* Build and insert the reduction calculation. */
5583 gimple_seq red_seq = NULL;
5584 tree write_expr = fold_build1 (code, var_type, expect_var);
5585 write_expr = fold_build2 (op, var_type, write_expr, var);
5586 write_expr = fold_build1 (code, arg_type, write_expr);
5587 gimplify_assign (write_var, write_expr, &red_seq);
5589 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5591 /* Build & insert the cmp&swap sequence. */
5592 gimple_seq latch_seq = NULL;
5593 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5594 ptr, expect_var, write_var);
5595 gimplify_assign (actual_var, swap_expr, &latch_seq);
5597 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5598 NULL_TREE, NULL_TREE);
5599 gimple_seq_add_stmt (&latch_seq, cond);
5601 gimple *latch_end = gimple_seq_last (latch_seq);
5602 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5604 /* Split the block just after the latch stmts. */
5605 edge post_edge = split_block (loop_bb, latch_end);
5606 basic_block post_bb = post_edge->dest;
5607 loop_bb = post_edge->src;
5608 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5610 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5611 post_edge->probability = profile_probability::even ();
5612 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5613 loop_edge->probability = profile_probability::even ();
5614 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5615 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5617 gphi *phi = create_phi_node (expect_var, loop_bb);
5618 add_phi_arg (phi, init_var, pre_edge, loc);
5619 add_phi_arg (phi, actual_var, loop_edge, loc);
5621 loop *loop = alloc_loop ();
5622 loop->header = loop_bb;
5623 loop->latch = loop_bb;
5624 add_loop (loop, loop_bb->loop_father);
5626 return fold_build1 (code, var_type, write_var);
5629 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5630 GSI. This is necessary for types larger than 64 bits, where there
5631 is no cmp&swap instruction to implement a lockless scheme. We use
5632 a lock variable in global memory.
5634 while (cmp&swap (&lock_var, 0, 1))
5635 continue;
5636 T accum = *ptr;
5637 accum = accum OP var;
5638 *ptr = accum;
5639 cmp&swap (&lock_var, 1, 0);
5640 return accum;
5642 A lock in global memory is necessary to force execution engine
5643 descheduling and avoid resource starvation that can occur if the
5644 lock is in .shared memory. */
5646 static tree
5647 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5648 tree ptr, tree var, tree_code op)
5650 tree var_type = TREE_TYPE (var);
5651 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5652 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5653 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5655 /* Split the block just before the gsi. Insert a gimple nop to make
5656 this easier. */
5657 gimple *nop = gimple_build_nop ();
5658 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5659 basic_block entry_bb = gsi_bb (*gsi);
5660 edge entry_edge = split_block (entry_bb, nop);
5661 basic_block lock_bb = entry_edge->dest;
5662 /* Reset the iterator. */
5663 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5665 /* Build and insert the locking sequence. */
5666 gimple_seq lock_seq = NULL;
5667 tree lock_var = make_ssa_name (unsigned_type_node);
5668 tree lock_expr = nvptx_global_lock_addr ();
5669 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5670 uns_unlocked, uns_locked);
5671 gimplify_assign (lock_var, lock_expr, &lock_seq);
5672 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5673 NULL_TREE, NULL_TREE);
5674 gimple_seq_add_stmt (&lock_seq, cond);
5675 gimple *lock_end = gimple_seq_last (lock_seq);
5676 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5678 /* Split the block just after the lock sequence. */
5679 edge locked_edge = split_block (lock_bb, lock_end);
5680 basic_block update_bb = locked_edge->dest;
5681 lock_bb = locked_edge->src;
5682 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5684 /* Create the lock loop ... */
5685 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5686 locked_edge->probability = profile_probability::even ();
5687 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5688 loop_edge->probability = profile_probability::even ();
5689 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5690 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5692 /* ... and the loop structure. */
5693 loop *lock_loop = alloc_loop ();
5694 lock_loop->header = lock_bb;
5695 lock_loop->latch = lock_bb;
5696 lock_loop->nb_iterations_estimate = 1;
5697 lock_loop->any_estimate = true;
5698 add_loop (lock_loop, entry_bb->loop_father);
5700 /* Build and insert the reduction calculation. */
5701 gimple_seq red_seq = NULL;
5702 tree acc_in = make_ssa_name (var_type);
5703 tree ref_in = build_simple_mem_ref (ptr);
5704 TREE_THIS_VOLATILE (ref_in) = 1;
5705 gimplify_assign (acc_in, ref_in, &red_seq);
5707 tree acc_out = make_ssa_name (var_type);
5708 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5709 gimplify_assign (acc_out, update_expr, &red_seq);
5711 tree ref_out = build_simple_mem_ref (ptr);
5712 TREE_THIS_VOLATILE (ref_out) = 1;
5713 gimplify_assign (ref_out, acc_out, &red_seq);
5715 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5717 /* Build & insert the unlock sequence. */
5718 gimple_seq unlock_seq = NULL;
5719 tree unlock_expr = nvptx_global_lock_addr ();
5720 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5721 uns_locked, uns_unlocked);
5722 gimplify_and_add (unlock_expr, &unlock_seq);
5723 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5725 return acc_out;
5728 /* Emit a sequence to update a reduction accumulator at *PTR with the
5729 value held in VAR using operator OP. Return the updated value.
5731 TODO: optimize for atomic ops and independent complex ops. */
5733 static tree
5734 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5735 tree ptr, tree var, tree_code op)
5737 tree type = TREE_TYPE (var);
5738 tree size = TYPE_SIZE (type);
5740 if (size == TYPE_SIZE (unsigned_type_node)
5741 || size == TYPE_SIZE (long_long_unsigned_type_node))
5742 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5743 else
5744 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
5747 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5749 static void
5750 nvptx_goacc_reduction_setup (gcall *call)
5752 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5753 tree lhs = gimple_call_lhs (call);
5754 tree var = gimple_call_arg (call, 2);
5755 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5756 gimple_seq seq = NULL;
5758 push_gimplify_context (true);
5760 if (level != GOMP_DIM_GANG)
5762 /* Copy the receiver object. */
5763 tree ref_to_res = gimple_call_arg (call, 1);
5765 if (!integer_zerop (ref_to_res))
5766 var = build_simple_mem_ref (ref_to_res);
5769 if (level == GOMP_DIM_WORKER)
5771 /* Store incoming value to worker reduction buffer. */
5772 tree offset = gimple_call_arg (call, 5);
5773 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
5774 tree ptr = make_ssa_name (TREE_TYPE (call));
5776 gimplify_assign (ptr, call, &seq);
5777 tree ref = build_simple_mem_ref (ptr);
5778 TREE_THIS_VOLATILE (ref) = 1;
5779 gimplify_assign (ref, var, &seq);
5782 if (lhs)
5783 gimplify_assign (lhs, var, &seq);
5785 pop_gimplify_context (NULL);
5786 gsi_replace_with_seq (&gsi, seq, true);
5789 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5791 static void
5792 nvptx_goacc_reduction_init (gcall *call)
5794 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5795 tree lhs = gimple_call_lhs (call);
5796 tree var = gimple_call_arg (call, 2);
5797 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5798 enum tree_code rcode
5799 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5800 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5801 TREE_TYPE (var));
5802 gimple_seq seq = NULL;
5804 push_gimplify_context (true);
5806 if (level == GOMP_DIM_VECTOR)
5808 /* Initialize the non-zero vector lanes to INIT_VAL (OP). */
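/* In effect the code below builds (a sketch of the resulting CFG):

     pos = IFN_GOACC_DIM_POS (GOMP_DIM_VECTOR);
     lhs = (pos != 0) ? init : var;
*/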
5809 tree tid = make_ssa_name (integer_type_node);
5810 tree dim_vector = gimple_call_arg (call, 3);
5811 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5812 dim_vector);
5813 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5814 NULL_TREE, NULL_TREE);
5816 gimple_call_set_lhs (tid_call, tid);
5817 gimple_seq_add_stmt (&seq, tid_call);
5818 gimple_seq_add_stmt (&seq, cond_stmt);
5820 /* Split the block just after the call. */
5821 edge init_edge = split_block (gsi_bb (gsi), call);
5822 basic_block init_bb = init_edge->dest;
5823 basic_block call_bb = init_edge->src;
5825 /* Fixup flags from call_bb to init_bb. */
5826 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5827 init_edge->probability = profile_probability::even ();
5829 /* Set the initialization stmts. */
5830 gimple_seq init_seq = NULL;
5831 tree init_var = make_ssa_name (TREE_TYPE (var));
5832 gimplify_assign (init_var, init, &init_seq);
5833 gsi = gsi_start_bb (init_bb);
5834 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5836 /* Split block just after the init stmt. */
5837 gsi_prev (&gsi);
5838 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5839 basic_block dst_bb = inited_edge->dest;
5841 /* Create false edge from call_bb to dst_bb. */
5842 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5843 nop_edge->probability = profile_probability::even ();
5845 /* Create phi node in dst block. */
5846 gphi *phi = create_phi_node (lhs, dst_bb);
5847 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5848 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5850 /* Reset dominator of dst bb. */
5851 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5853 /* Reset the gsi. */
5854 gsi = gsi_for_stmt (call);
5856 else
5858 if (level == GOMP_DIM_GANG)
5860 /* If there's no receiver object, propagate the incoming VAR. */
5861 tree ref_to_res = gimple_call_arg (call, 1);
5862 if (integer_zerop (ref_to_res))
5863 init = var;
5866 gimplify_assign (lhs, init, &seq);
5869 pop_gimplify_context (NULL);
5870 gsi_replace_with_seq (&gsi, seq, true);
5873 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5875 static void
5876 nvptx_goacc_reduction_fini (gcall *call)
5878 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5879 tree lhs = gimple_call_lhs (call);
5880 tree ref_to_res = gimple_call_arg (call, 1);
5881 tree var = gimple_call_arg (call, 2);
5882 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5883 enum tree_code op
5884 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5885 gimple_seq seq = NULL;
5886 tree r = NULL_TREE;
5888 push_gimplify_context (true);
5890 if (level == GOMP_DIM_VECTOR)
5892 /* Emit a binary shuffle tree. TODO: emit this as an actual loop,
5893 but that requires a method of emitting a unified jump at the
5894 gimple level. */
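/* For vector_length 32 the loop unrolls to (a sketch, with OP the
   reduction operator):

     var = var OP __builtin_nvptx_shuffle (var, 16, SHUFFLE_DOWN);
     var = var OP __builtin_nvptx_shuffle (var, 8, SHUFFLE_DOWN);
     ...
     var = var OP __builtin_nvptx_shuffle (var, 1, SHUFFLE_DOWN);

   after which lane 0 holds the fully combined value.  */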
5895 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5897 tree other_var = make_ssa_name (TREE_TYPE (var));
5898 nvptx_generate_vector_shuffle (gimple_location (call),
5899 other_var, var, shfl, &seq);
5901 r = make_ssa_name (TREE_TYPE (var));
5902 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5903 var, other_var), &seq);
5904 var = r;
5907 else
5909 tree accum = NULL_TREE;
5911 if (level == GOMP_DIM_WORKER)
5913 /* Get reduction buffer address. */
5914 tree offset = gimple_call_arg (call, 5);
5915 tree red_addr = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
5916 tree ptr = make_ssa_name (TREE_TYPE (red_addr));
5918 gimplify_assign (ptr, red_addr, &seq);
5919 accum = ptr;
5921 else if (integer_zerop (ref_to_res))
5922 r = var;
5923 else
5924 accum = ref_to_res;
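/* Summarizing the cases above: worker-level reductions fold VAR into
   the shared-memory buffer; gang-level ones fold it into *REF_TO_RES
   when a receiver exists, and otherwise VAR already is the result. */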
5926 if (accum)
5928 /* Update the accumulator. */
5929 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5930 seq = NULL;
5931 r = nvptx_reduction_update (gimple_location (call), &gsi,
5932 accum, var, op);
5936 if (lhs)
5937 gimplify_assign (lhs, r, &seq);
5938 pop_gimplify_context (NULL);
5940 gsi_replace_with_seq (&gsi, seq, true);
5943 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5945 static void
5946 nvptx_goacc_reduction_teardown (gcall *call)
5948 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5949 tree lhs = gimple_call_lhs (call);
5950 tree var = gimple_call_arg (call, 2);
5951 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5952 gimple_seq seq = NULL;
5954 push_gimplify_context (true);
5955 if (level == GOMP_DIM_WORKER)
5957 /* Read the worker reduction buffer. */
5958 tree offset = gimple_call_arg (call, 5);
5959 tree red_addr = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
5960 tree ptr = make_ssa_name (TREE_TYPE (red_addr));
5962 gimplify_assign (ptr, red_addr, &seq);
5963 var = build_simple_mem_ref (ptr);
5964 TREE_THIS_VOLATILE (var) = 1;
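/* Volatile, presumably so a read of data written by other workers is
   neither cached nor reordered by later passes. */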
5967 if (level != GOMP_DIM_GANG)
5969 /* Write to the receiver object. */
5970 tree ref_to_res = gimple_call_arg (call, 1);
5972 if (!integer_zerop (ref_to_res))
5973 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5976 if (lhs)
5977 gimplify_assign (lhs, var, &seq);
5979 pop_gimplify_context (NULL);
5981 gsi_replace_with_seq (&gsi, seq, true);
5984 /* NVPTX reduction expander. */
5986 static void
5987 nvptx_goacc_reduction (gcall *call)
5989 unsigned code = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5991 switch (code)
5993 case IFN_GOACC_REDUCTION_SETUP:
5994 nvptx_goacc_reduction_setup (call);
5995 break;
5997 case IFN_GOACC_REDUCTION_INIT:
5998 nvptx_goacc_reduction_init (call);
5999 break;
6001 case IFN_GOACC_REDUCTION_FINI:
6002 nvptx_goacc_reduction_fini (call);
6003 break;
6005 case IFN_GOACC_REDUCTION_TEARDOWN:
6006 nvptx_goacc_reduction_teardown (call);
6007 break;
6009 default:
6010 gcc_unreachable ();
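/* Implement TARGET_CANNOT_FORCE_CONST_MEM: never force constants into
   a memory pool on this target. */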
6014 static bool
6015 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6016 rtx x ATTRIBUTE_UNUSED)
6018 return true;
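/* Implement TARGET_VECTOR_MODE_SUPPORTED_P: only the two-element
   vector modes V2SImode and V2DImode are supported. */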
6021 static bool
6022 nvptx_vector_mode_supported (machine_mode mode)
6024 return (mode == V2SImode
6025 || mode == V2DImode);
6028 /* Return the preferred mode for vectorizing scalar MODE. */
6030 static machine_mode
6031 nvptx_preferred_simd_mode (scalar_mode mode)
6033 switch (mode)
6035 case E_DImode:
6036 return V2DImode;
6037 case E_SImode:
6038 return V2SImode;
6040 default:
6041 return default_preferred_simd_mode (mode);
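/* Return the alignment for an object of TYPE: integer types as wide
   as TImode get the alignment of the mode TImode is split into
   (presumably reached via the DATA_ALIGNMENT macro in nvptx.h). */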
6045 unsigned int
6046 nvptx_data_alignment (const_tree type, unsigned int basic_align)
6048 if (TREE_CODE (type) == INTEGER_TYPE)
6050 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6051 if (size == GET_MODE_SIZE (TImode))
6052 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6055 return basic_align;
6058 /* Implement TARGET_MODES_TIEABLE_P. */
6060 static bool
6061 nvptx_modes_tieable_p (machine_mode, machine_mode)
6063 return false;
6066 /* Implement TARGET_HARD_REGNO_NREGS. */
6068 static unsigned int
6069 nvptx_hard_regno_nregs (unsigned int, machine_mode)
6071 return 1;
6074 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
6076 static bool
6077 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6079 return false;
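/* Implement TARGET_SET_CURRENT_FUNCTION: record FNDECL, skipping null
   decls and repeats. */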
6082 static GTY(()) tree nvptx_previous_fndecl;
6084 static void
6085 nvptx_set_current_function (tree fndecl)
6087 if (!fndecl || fndecl == nvptx_previous_fndecl)
6088 return;
6090 nvptx_previous_fndecl = fndecl;
6093 #undef TARGET_OPTION_OVERRIDE
6094 #define TARGET_OPTION_OVERRIDE nvptx_option_override
6096 #undef TARGET_ATTRIBUTE_TABLE
6097 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6099 #undef TARGET_LRA_P
6100 #define TARGET_LRA_P hook_bool_void_false
6102 #undef TARGET_LEGITIMATE_ADDRESS_P
6103 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6105 #undef TARGET_PROMOTE_FUNCTION_MODE
6106 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6108 #undef TARGET_FUNCTION_ARG
6109 #define TARGET_FUNCTION_ARG nvptx_function_arg
6110 #undef TARGET_FUNCTION_INCOMING_ARG
6111 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6112 #undef TARGET_FUNCTION_ARG_ADVANCE
6113 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
6114 #undef TARGET_FUNCTION_ARG_BOUNDARY
6115 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
6116 #undef TARGET_PASS_BY_REFERENCE
6117 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6118 #undef TARGET_FUNCTION_VALUE_REGNO_P
6119 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6120 #undef TARGET_FUNCTION_VALUE
6121 #define TARGET_FUNCTION_VALUE nvptx_function_value
6122 #undef TARGET_LIBCALL_VALUE
6123 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
6124 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
6125 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
6126 #undef TARGET_GET_DRAP_RTX
6127 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
6128 #undef TARGET_SPLIT_COMPLEX_ARG
6129 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6130 #undef TARGET_RETURN_IN_MEMORY
6131 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6132 #undef TARGET_OMIT_STRUCT_RETURN_REG
6133 #define TARGET_OMIT_STRUCT_RETURN_REG true
6134 #undef TARGET_STRICT_ARGUMENT_NAMING
6135 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
6136 #undef TARGET_CALL_ARGS
6137 #define TARGET_CALL_ARGS nvptx_call_args
6138 #undef TARGET_END_CALL_ARGS
6139 #define TARGET_END_CALL_ARGS nvptx_end_call_args
6141 #undef TARGET_ASM_FILE_START
6142 #define TARGET_ASM_FILE_START nvptx_file_start
6143 #undef TARGET_ASM_FILE_END
6144 #define TARGET_ASM_FILE_END nvptx_file_end
6145 #undef TARGET_ASM_GLOBALIZE_LABEL
6146 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6147 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6148 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6149 #undef TARGET_PRINT_OPERAND
6150 #define TARGET_PRINT_OPERAND nvptx_print_operand
6151 #undef TARGET_PRINT_OPERAND_ADDRESS
6152 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6153 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6154 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6155 #undef TARGET_ASM_INTEGER
6156 #define TARGET_ASM_INTEGER nvptx_assemble_integer
6157 #undef TARGET_ASM_DECL_END
6158 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6159 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
6160 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6161 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6162 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6163 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6164 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6166 #undef TARGET_MACHINE_DEPENDENT_REORG
6167 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6168 #undef TARGET_NO_REGISTER_ALLOCATION
6169 #define TARGET_NO_REGISTER_ALLOCATION true
6171 #undef TARGET_ENCODE_SECTION_INFO
6172 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
6173 #undef TARGET_RECORD_OFFLOAD_SYMBOL
6174 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6176 #undef TARGET_VECTOR_ALIGNMENT
6177 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6179 #undef TARGET_CANNOT_COPY_INSN_P
6180 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6182 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6183 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6185 #undef TARGET_INIT_BUILTINS
6186 #define TARGET_INIT_BUILTINS nvptx_init_builtins
6187 #undef TARGET_EXPAND_BUILTIN
6188 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6189 #undef TARGET_BUILTIN_DECL
6190 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
6192 #undef TARGET_SIMT_VF
6193 #define TARGET_SIMT_VF nvptx_simt_vf
6195 #undef TARGET_GOACC_VALIDATE_DIMS
6196 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6198 #undef TARGET_GOACC_DIM_LIMIT
6199 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6201 #undef TARGET_GOACC_FORK_JOIN
6202 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6204 #undef TARGET_GOACC_REDUCTION
6205 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6207 #undef TARGET_CANNOT_FORCE_CONST_MEM
6208 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6210 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6211 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6213 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6214 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6215 nvptx_preferred_simd_mode
6217 #undef TARGET_MODES_TIEABLE_P
6218 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6220 #undef TARGET_HARD_REGNO_NREGS
6221 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6223 #undef TARGET_CAN_CHANGE_MODE_CLASS
6224 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6226 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6227 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6229 #undef TARGET_SET_CURRENT_FUNCTION
6230 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6232 struct gcc_target targetm = TARGET_INITIALIZER;
6234 #include "gt-nvptx.h"