PR c/81544 - attribute noreturn and warn_unused_result on the same function accepted
[official-gcc.git] / gcc / config / nvptx / nvptx.c
blob16ff370bb32eee2507afa020082d17a712dc2eee
1 /* Target code for NVPTX.
2 Copyright (C) 2014-2017 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include <sstream>
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "diagnostic.h"
39 #include "alias.h"
40 #include "insn-flags.h"
41 #include "output.h"
42 #include "insn-attr.h"
43 #include "flags.h"
44 #include "dojump.h"
45 #include "explow.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "stmt.h"
49 #include "expr.h"
50 #include "tm-preds.h"
51 #include "tm-constrs.h"
52 #include "langhooks.h"
53 #include "dbxout.h"
54 #include "cfgrtl.h"
55 #include "gimple.h"
56 #include "stor-layout.h"
57 #include "builtins.h"
58 #include "omp-general.h"
59 #include "omp-low.h"
60 #include "gomp-constants.h"
61 #include "dumpfile.h"
62 #include "internal-fn.h"
63 #include "gimple-iterator.h"
64 #include "stringpool.h"
65 #include "attribs.h"
66 #include "tree-vrp.h"
67 #include "tree-ssa-operands.h"
68 #include "tree-ssanames.h"
69 #include "gimplify.h"
70 #include "tree-phinodes.h"
71 #include "cfgloop.h"
72 #include "fold-const.h"
73 #include "intl.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 #define WORKAROUND_PTXJIT_BUG 1
80 /* The various PTX memory areas an object might reside in. */
81 enum nvptx_data_area
83 DATA_AREA_GENERIC,
84 DATA_AREA_GLOBAL,
85 DATA_AREA_SHARED,
86 DATA_AREA_LOCAL,
87 DATA_AREA_CONST,
88 DATA_AREA_PARAM,
89 DATA_AREA_MAX
92 /* We record the data area in the target symbol flags. */
93 #define SYMBOL_DATA_AREA(SYM) \
94 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
95 & 7)
96 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
97 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
99 /* Record the function decls we've written, and the libfuncs and function
100 decls corresponding to them. */
101 static std::stringstream func_decls;
103 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
105 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
106 static bool equal (rtx a, rtx b) { return a == b; }
109 static GTY((cache))
110 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
112 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
114 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
115 static bool equal (tree a, tree b) { return a == b; }
118 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
119 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
121 /* Buffer needed to broadcast across workers. This is used for both
122 worker-neutering and worker broadcasting. It is shared by all
123 functions emitted. The buffer is placed in shared memory. It'd be
124 nice if PTX supported common blocks, because then this could be
125 shared across TUs (taking the largest size). */
126 static unsigned worker_bcast_size;
127 static unsigned worker_bcast_align;
128 static GTY(()) rtx worker_bcast_sym;
130 /* Buffer needed for worker reductions. This has to be distinct from
131 the worker broadcast array, as both may be live concurrently. */
132 static unsigned worker_red_size;
133 static unsigned worker_red_align;
134 static GTY(()) rtx worker_red_sym;
136 /* Global lock variable, needed for 128bit worker & gang reductions. */
137 static GTY(()) tree global_lock_var;
139 /* True if any function references __nvptx_stacks. */
140 static bool need_softstack_decl;
142 /* True if any function references __nvptx_uni. */
143 static bool need_unisimt_decl;
145 /* Allocate a new, cleared machine_function structure. */
147 static struct machine_function *
148 nvptx_init_machine_status (void)
150 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
151 p->return_mode = VOIDmode;
152 return p;
155 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
156 and -fopenacc is also enabled. */
158 static void
159 diagnose_openacc_conflict (bool optval, const char *optname)
161 if (flag_openacc && optval)
162 error ("option %s is not supported together with -fopenacc", optname);
165 /* Implement TARGET_OPTION_OVERRIDE. */
167 static void
168 nvptx_option_override (void)
170 init_machine_status = nvptx_init_machine_status;
172 /* Set toplevel_reorder, unless explicitly disabled. We need
173 reordering so that we emit necessary assembler decls of
174 undeclared variables. */
175 if (!global_options_set.x_flag_toplevel_reorder)
176 flag_toplevel_reorder = 1;
178 /* Set flag_no_common, unless explicitly disabled. We fake common
179 using .weak, and that's not entirely accurate, so avoid it
180 unless forced. */
181 if (!global_options_set.x_flag_no_common)
182 flag_no_common = 1;
184 /* The patch area requires nops, which we don't have. */
185 if (function_entry_patch_area_size > 0)
186 sorry ("not generating patch area, nops not supported");
188 /* Assumes that it will see only hard registers. */
189 flag_var_tracking = 0;
191 if (nvptx_optimize < 0)
192 nvptx_optimize = optimize > 0;
194 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
195 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
196 declared_libfuncs_htab
197 = hash_table<declared_libfunc_hasher>::create_ggc (17);
199 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
200 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
201 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
203 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
204 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
205 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
207 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
208 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
209 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
211 if (TARGET_GOMP)
212 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
215 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
216 deal with ptx ideosyncracies. */
218 const char *
219 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
221 switch (mode)
223 case E_BLKmode:
224 return ".b8";
225 case E_BImode:
226 return ".pred";
227 case E_QImode:
228 if (promote)
229 return ".u32";
230 else
231 return ".u8";
232 case E_HImode:
233 return ".u16";
234 case E_SImode:
235 return ".u32";
236 case E_DImode:
237 return ".u64";
239 case E_SFmode:
240 return ".f32";
241 case E_DFmode:
242 return ".f64";
244 case E_V2SImode:
245 return ".v2.u32";
246 case E_V2DImode:
247 return ".v2.u64";
249 default:
250 gcc_unreachable ();
254 /* Encode the PTX data area that DECL (which might not actually be a
255 _DECL) should reside in. */
257 static void
258 nvptx_encode_section_info (tree decl, rtx rtl, int first)
260 default_encode_section_info (decl, rtl, first);
261 if (first && MEM_P (rtl))
263 nvptx_data_area area = DATA_AREA_GENERIC;
265 if (TREE_CONSTANT (decl))
266 area = DATA_AREA_CONST;
267 else if (TREE_CODE (decl) == VAR_DECL)
269 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
271 area = DATA_AREA_SHARED;
272 if (DECL_INITIAL (decl))
273 error ("static initialization of variable %q+D in %<.shared%>"
274 " memory is not supported", decl);
276 else
277 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
280 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
284 /* Return the PTX name of the data area in which SYM should be
285 placed. The symbol must have already been processed by
286 nvptx_encode_seciton_info, or equivalent. */
288 static const char *
289 section_for_sym (rtx sym)
291 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
292 /* Same order as nvptx_data_area enum. */
293 static char const *const areas[] =
294 {"", ".global", ".shared", ".local", ".const", ".param"};
296 return areas[area];
299 /* Similarly for a decl. */
301 static const char *
302 section_for_decl (const_tree decl)
304 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
307 /* Check NAME for special function names and redirect them by returning a
308 replacement. This applies to malloc, free and realloc, for which we
309 want to use libgcc wrappers, and call, which triggers a bug in
310 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
311 not active in an offload compiler -- the names are all set by the
312 host-side compiler. */
314 static const char *
315 nvptx_name_replacement (const char *name)
317 if (strcmp (name, "call") == 0)
318 return "__nvptx_call";
319 if (strcmp (name, "malloc") == 0)
320 return "__nvptx_malloc";
321 if (strcmp (name, "free") == 0)
322 return "__nvptx_free";
323 if (strcmp (name, "realloc") == 0)
324 return "__nvptx_realloc";
325 return name;
328 /* If MODE should be treated as two registers of an inner mode, return
329 that inner mode. Otherwise return VOIDmode. */
331 static machine_mode
332 maybe_split_mode (machine_mode mode)
334 if (COMPLEX_MODE_P (mode))
335 return GET_MODE_INNER (mode);
337 if (mode == TImode)
338 return DImode;
340 return VOIDmode;
343 /* Return true if mode should be treated as two registers. */
345 static bool
346 split_mode_p (machine_mode mode)
348 return maybe_split_mode (mode) != VOIDmode;
351 /* Output a register, subreg, or register pair (with optional
352 enclosing braces). */
354 static void
355 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
356 int subreg_offset = -1)
358 if (inner_mode == VOIDmode)
360 if (HARD_REGISTER_NUM_P (regno))
361 fprintf (file, "%s", reg_names[regno]);
362 else
363 fprintf (file, "%%r%d", regno);
365 else if (subreg_offset >= 0)
367 output_reg (file, regno, VOIDmode);
368 fprintf (file, "$%d", subreg_offset);
370 else
372 if (subreg_offset == -1)
373 fprintf (file, "{");
374 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
375 fprintf (file, ",");
376 output_reg (file, regno, inner_mode, 0);
377 if (subreg_offset == -1)
378 fprintf (file, "}");
382 /* Emit forking instructions for MASK. */
384 static void
385 nvptx_emit_forking (unsigned mask, bool is_call)
387 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
388 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
389 if (mask)
391 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
393 /* Emit fork at all levels. This helps form SESE regions, as
394 it creates a block with a single successor before entering a
395 partitooned region. That is a good candidate for the end of
396 an SESE region. */
397 if (!is_call)
398 emit_insn (gen_nvptx_fork (op));
399 emit_insn (gen_nvptx_forked (op));
403 /* Emit joining instructions for MASK. */
405 static void
406 nvptx_emit_joining (unsigned mask, bool is_call)
408 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
409 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
410 if (mask)
412 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
414 /* Emit joining for all non-call pars to ensure there's a single
415 predecessor for the block the join insn ends up in. This is
416 needed for skipping entire loops. */
417 if (!is_call)
418 emit_insn (gen_nvptx_joining (op));
419 emit_insn (gen_nvptx_join (op));
424 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
425 returned in memory. Integer and floating types supported by the
426 machine are passed in registers, everything else is passed in
427 memory. Complex types are split. */
429 static bool
430 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
432 if (type)
434 if (AGGREGATE_TYPE_P (type))
435 return true;
436 if (TREE_CODE (type) == VECTOR_TYPE)
437 return true;
440 if (!for_return && COMPLEX_MODE_P (mode))
441 /* Complex types are passed as two underlying args. */
442 mode = GET_MODE_INNER (mode);
444 if (GET_MODE_CLASS (mode) != MODE_INT
445 && GET_MODE_CLASS (mode) != MODE_FLOAT)
446 return true;
448 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
449 return true;
451 return false;
454 /* A non-memory argument of mode MODE is being passed, determine the mode it
455 should be promoted to. This is also used for determining return
456 type promotion. */
458 static machine_mode
459 promote_arg (machine_mode mode, bool prototyped)
461 if (!prototyped && mode == SFmode)
462 /* K&R float promotion for unprototyped functions. */
463 mode = DFmode;
464 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
465 mode = SImode;
467 return mode;
470 /* A non-memory return type of MODE is being returned. Determine the
471 mode it should be promoted to. */
473 static machine_mode
474 promote_return (machine_mode mode)
476 return promote_arg (mode, true);
479 /* Implement TARGET_FUNCTION_ARG. */
481 static rtx
482 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
483 const_tree, bool named)
485 if (mode == VOIDmode || !named)
486 return NULL_RTX;
488 return gen_reg_rtx (mode);
491 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
493 static rtx
494 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
495 const_tree, bool named)
497 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
499 if (mode == VOIDmode || !named)
500 return NULL_RTX;
502 /* No need to deal with split modes here, the only case that can
503 happen is complex modes and those are dealt with by
504 TARGET_SPLIT_COMPLEX_ARG. */
505 return gen_rtx_UNSPEC (mode,
506 gen_rtvec (1, GEN_INT (cum->count)),
507 UNSPEC_ARG_REG);
510 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
512 static void
513 nvptx_function_arg_advance (cumulative_args_t cum_v,
514 machine_mode ARG_UNUSED (mode),
515 const_tree ARG_UNUSED (type),
516 bool ARG_UNUSED (named))
518 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
520 cum->count++;
523 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
525 For nvptx This is only used for varadic args. The type has already
526 been promoted and/or converted to invisible reference. */
528 static unsigned
529 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
531 return GET_MODE_ALIGNMENT (mode);
534 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
536 For nvptx, we know how to handle functions declared as stdarg: by
537 passing an extra pointer to the unnamed arguments. However, the
538 Fortran frontend can produce a different situation, where a
539 function pointer is declared with no arguments, but the actual
540 function and calls to it take more arguments. In that case, we
541 want to ensure the call matches the definition of the function. */
543 static bool
544 nvptx_strict_argument_naming (cumulative_args_t cum_v)
546 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
548 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
551 /* Implement TARGET_LIBCALL_VALUE. */
553 static rtx
554 nvptx_libcall_value (machine_mode mode, const_rtx)
556 if (!cfun || !cfun->machine->doing_call)
557 /* Pretend to return in a hard reg for early uses before pseudos can be
558 generated. */
559 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
561 return gen_reg_rtx (mode);
564 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
565 where function FUNC returns or receives a value of data type TYPE. */
567 static rtx
568 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
569 bool outgoing)
571 machine_mode mode = promote_return (TYPE_MODE (type));
573 if (outgoing)
575 gcc_assert (cfun);
576 cfun->machine->return_mode = mode;
577 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
580 return nvptx_libcall_value (mode, NULL_RTX);
583 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
585 static bool
586 nvptx_function_value_regno_p (const unsigned int regno)
588 return regno == NVPTX_RETURN_REGNUM;
591 /* Types with a mode other than those supported by the machine are passed by
592 reference in memory. */
594 static bool
595 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
596 machine_mode mode, const_tree type,
597 bool ARG_UNUSED (named))
599 return pass_in_memory (mode, type, false);
602 /* Implement TARGET_RETURN_IN_MEMORY. */
604 static bool
605 nvptx_return_in_memory (const_tree type, const_tree)
607 return pass_in_memory (TYPE_MODE (type), type, true);
610 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
612 static machine_mode
613 nvptx_promote_function_mode (const_tree type, machine_mode mode,
614 int *ARG_UNUSED (punsignedp),
615 const_tree funtype, int for_return)
617 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
620 /* Helper for write_arg. Emit a single PTX argument of MODE, either
621 in a prototype, or as copy in a function prologue. ARGNO is the
622 index of this argument in the PTX function. FOR_REG is negative,
623 if we're emitting the PTX prototype. It is zero if we're copying
624 to an argument register and it is greater than zero if we're
625 copying to a specific hard register. */
627 static int
628 write_arg_mode (std::stringstream &s, int for_reg, int argno,
629 machine_mode mode)
631 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
633 if (for_reg < 0)
635 /* Writing PTX prototype. */
636 s << (argno ? ", " : " (");
637 s << ".param" << ptx_type << " %in_ar" << argno;
639 else
641 s << "\t.reg" << ptx_type << " ";
642 if (for_reg)
643 s << reg_names[for_reg];
644 else
645 s << "%ar" << argno;
646 s << ";\n";
647 if (argno >= 0)
649 s << "\tld.param" << ptx_type << " ";
650 if (for_reg)
651 s << reg_names[for_reg];
652 else
653 s << "%ar" << argno;
654 s << ", [%in_ar" << argno << "];\n";
657 return argno + 1;
660 /* Process function parameter TYPE to emit one or more PTX
661 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
662 is true, if this is a prototyped function, rather than an old-style
663 C declaration. Returns the next argument number to use.
665 The promotion behavior here must match the regular GCC function
666 parameter marshalling machinery. */
668 static int
669 write_arg_type (std::stringstream &s, int for_reg, int argno,
670 tree type, bool prototyped)
672 machine_mode mode = TYPE_MODE (type);
674 if (mode == VOIDmode)
675 return argno;
677 if (pass_in_memory (mode, type, false))
678 mode = Pmode;
679 else
681 bool split = TREE_CODE (type) == COMPLEX_TYPE;
683 if (split)
685 /* Complex types are sent as two separate args. */
686 type = TREE_TYPE (type);
687 mode = TYPE_MODE (type);
688 prototyped = true;
691 mode = promote_arg (mode, prototyped);
692 if (split)
693 argno = write_arg_mode (s, for_reg, argno, mode);
696 return write_arg_mode (s, for_reg, argno, mode);
699 /* Emit a PTX return as a prototype or function prologue declaration
700 for MODE. */
702 static void
703 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
705 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
706 const char *pfx = "\t.reg";
707 const char *sfx = ";\n";
709 if (for_proto)
710 pfx = "(.param", sfx = "_out) ";
712 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
715 /* Process a function return TYPE to emit a PTX return as a prototype
716 or function prologue declaration. Returns true if return is via an
717 additional pointer parameter. The promotion behavior here must
718 match the regular GCC function return mashalling. */
720 static bool
721 write_return_type (std::stringstream &s, bool for_proto, tree type)
723 machine_mode mode = TYPE_MODE (type);
725 if (mode == VOIDmode)
726 return false;
728 bool return_in_mem = pass_in_memory (mode, type, true);
730 if (return_in_mem)
732 if (for_proto)
733 return return_in_mem;
735 /* Named return values can cause us to return a pointer as well
736 as expect an argument for the return location. This is
737 optimization-level specific, so no caller can make use of
738 this data, but more importantly for us, we must ensure it
739 doesn't change the PTX prototype. */
740 mode = (machine_mode) cfun->machine->return_mode;
742 if (mode == VOIDmode)
743 return return_in_mem;
745 /* Clear return_mode to inhibit copy of retval to non-existent
746 retval parameter. */
747 cfun->machine->return_mode = VOIDmode;
749 else
750 mode = promote_return (mode);
752 write_return_mode (s, for_proto, mode);
754 return return_in_mem;
757 /* Look for attributes in ATTRS that would indicate we must write a function
758 as a .entry kernel rather than a .func. Return true if one is found. */
760 static bool
761 write_as_kernel (tree attrs)
763 return (lookup_attribute ("kernel", attrs) != NULL_TREE
764 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
765 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
766 /* For OpenMP target regions, the corresponding kernel entry is emitted from
767 write_omp_entry as a separate function. */
770 /* Emit a linker marker for a function decl or defn. */
772 static void
773 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
774 const char *name)
776 s << "\n// BEGIN";
777 if (globalize)
778 s << " GLOBAL";
779 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
780 s << name << "\n";
783 /* Emit a linker marker for a variable decl or defn. */
785 static void
786 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
788 fprintf (file, "\n// BEGIN%s VAR %s: ",
789 globalize ? " GLOBAL" : "",
790 is_defn ? "DEF" : "DECL");
791 assemble_name_raw (file, name);
792 fputs ("\n", file);
795 /* Write a .func or .kernel declaration or definition along with
796 a helper comment for use by ld. S is the stream to write to, DECL
797 the decl for the function with name NAME. For definitions, emit
798 a declaration too. */
800 static const char *
801 write_fn_proto (std::stringstream &s, bool is_defn,
802 const char *name, const_tree decl)
804 if (is_defn)
805 /* Emit a declaration. The PTX assembler gets upset without it. */
806 name = write_fn_proto (s, false, name, decl);
807 else
809 /* Avoid repeating the name replacement. */
810 name = nvptx_name_replacement (name);
811 if (name[0] == '*')
812 name++;
815 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
817 /* PTX declaration. */
818 if (DECL_EXTERNAL (decl))
819 s << ".extern ";
820 else if (TREE_PUBLIC (decl))
821 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
822 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
824 tree fntype = TREE_TYPE (decl);
825 tree result_type = TREE_TYPE (fntype);
827 /* atomic_compare_exchange_$n builtins have an exceptional calling
828 convention. */
829 int not_atomic_weak_arg = -1;
830 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
831 switch (DECL_FUNCTION_CODE (decl))
833 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
834 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
835 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
836 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
837 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
838 /* These atomics skip the 'weak' parm in an actual library
839 call. We must skip it in the prototype too. */
840 not_atomic_weak_arg = 3;
841 break;
843 default:
844 break;
847 /* Declare the result. */
848 bool return_in_mem = write_return_type (s, true, result_type);
850 s << name;
852 int argno = 0;
854 /* Emit argument list. */
855 if (return_in_mem)
856 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
858 /* We get:
859 NULL in TYPE_ARG_TYPES, for old-style functions
860 NULL in DECL_ARGUMENTS, for builtin functions without another
861 declaration.
862 So we have to pick the best one we have. */
863 tree args = TYPE_ARG_TYPES (fntype);
864 bool prototyped = true;
865 if (!args)
867 args = DECL_ARGUMENTS (decl);
868 prototyped = false;
871 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
873 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
875 if (not_atomic_weak_arg)
876 argno = write_arg_type (s, -1, argno, type, prototyped);
877 else
878 gcc_assert (type == boolean_type_node);
881 if (stdarg_p (fntype))
882 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
884 if (DECL_STATIC_CHAIN (decl))
885 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
887 if (!argno && strcmp (name, "main") == 0)
889 argno = write_arg_type (s, -1, argno, integer_type_node, true);
890 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
893 if (argno)
894 s << ")";
896 s << (is_defn ? "\n" : ";\n");
898 return name;
901 /* Construct a function declaration from a call insn. This can be
902 necessary for two reasons - either we have an indirect call which
903 requires a .callprototype declaration, or we have a libcall
904 generated by emit_library_call for which no decl exists. */
906 static void
907 write_fn_proto_from_insn (std::stringstream &s, const char *name,
908 rtx result, rtx pat)
910 if (!name)
912 s << "\t.callprototype ";
913 name = "_";
915 else
917 name = nvptx_name_replacement (name);
918 write_fn_marker (s, false, true, name);
919 s << "\t.extern .func ";
922 if (result != NULL_RTX)
923 write_return_mode (s, true, GET_MODE (result));
925 s << name;
927 int arg_end = XVECLEN (pat, 0);
928 for (int i = 1; i < arg_end; i++)
930 /* We don't have to deal with mode splitting & promotion here,
931 as that was already done when generating the call
932 sequence. */
933 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
935 write_arg_mode (s, -1, i - 1, mode);
937 if (arg_end != 1)
938 s << ")";
939 s << ";\n";
942 /* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
943 table and and write a ptx prototype. These are emitted at end of
944 compilation. */
946 static void
947 nvptx_record_fndecl (tree decl)
949 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
950 if (*slot == NULL)
952 *slot = decl;
953 const char *name = get_fnname_from_decl (decl);
954 write_fn_proto (func_decls, false, name, decl);
958 /* Record a libcall or unprototyped external function. CALLEE is the
959 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
960 declaration for it. */
962 static void
963 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
965 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
966 if (*slot == NULL)
968 *slot = callee;
970 const char *name = XSTR (callee, 0);
971 write_fn_proto_from_insn (func_decls, name, retval, pat);
975 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
976 is prototyped, record it now. Otherwise record it as needed at end
977 of compilation, when we might have more information about it. */
979 void
980 nvptx_record_needed_fndecl (tree decl)
982 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
984 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
985 if (*slot == NULL)
986 *slot = decl;
988 else
989 nvptx_record_fndecl (decl);
992 /* SYM is a SYMBOL_REF. If it refers to an external function, record
993 it as needed. */
995 static void
996 nvptx_maybe_record_fnsym (rtx sym)
998 tree decl = SYMBOL_REF_DECL (sym);
1000 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1001 nvptx_record_needed_fndecl (decl);
1004 /* Emit a local array to hold some part of a conventional stack frame
1005 and initialize REGNO to point to it. If the size is zero, it'll
1006 never be valid to dereference, so we can simply initialize to
1007 zero. */
1009 static void
1010 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1012 if (size)
1013 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1014 align, reg_names[regno], size);
1015 fprintf (file, "\t.reg.u%d %s;\n",
1016 POINTER_SIZE, reg_names[regno]);
1017 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1018 : "\tmov.u%d %s, 0;\n"),
1019 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1022 /* Emit soft stack frame setup sequence. */
1024 static void
1025 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1027 /* Maintain 64-bit stack alignment. */
1028 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1029 size = ROUND_UP (size, keep_align);
1030 int bits = POINTER_SIZE;
1031 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1032 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1033 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1034 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1035 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1036 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1037 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1038 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1039 fprintf (file, "\t{\n");
1040 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1041 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1042 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1043 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1044 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1045 bits == 64 ? ".wide" : ".lo", bits / 8);
1046 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1048 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1049 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1051 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1052 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1053 bits, reg_sspprev, reg_sspslot);
1055 /* Initialize %frame = %sspprev - size. */
1056 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1057 bits, reg_frame, reg_sspprev, size);
1059 /* Apply alignment, if larger than 64. */
1060 if (alignment > keep_align)
1061 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1062 bits, reg_frame, reg_frame, -alignment);
1064 size = crtl->outgoing_args_size;
1065 gcc_assert (size % keep_align == 0);
1067 /* Initialize %stack. */
1068 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1069 bits, reg_stack, reg_frame, size);
1071 if (!crtl->is_leaf)
1072 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1073 bits, reg_sspslot, reg_stack);
1074 fprintf (file, "\t}\n");
1075 cfun->machine->has_softstack = true;
1076 need_softstack_decl = true;
1079 /* Emit code to initialize the REGNO predicate register to indicate
1080 whether we are not lane zero on the NAME axis. */
1082 static void
1083 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1085 fprintf (file, "\t{\n");
1086 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1087 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1088 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1089 fprintf (file, "\t}\n");
1092 /* Emit code to initialize predicate and master lane index registers for
1093 -muniform-simt code generation variant. */
1095 static void
1096 nvptx_init_unisimt_predicate (FILE *file)
1098 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1099 int loc = REGNO (cfun->machine->unisimt_location);
1100 int bits = POINTER_SIZE;
1101 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1102 fprintf (file, "\t{\n");
1103 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1104 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1105 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1106 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1107 bits == 64 ? ".wide" : ".lo");
1108 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1109 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1110 if (cfun->machine->unisimt_predicate)
1112 int master = REGNO (cfun->machine->unisimt_master);
1113 int pred = REGNO (cfun->machine->unisimt_predicate);
1114 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1115 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1116 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1117 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1118 /* Compute predicate as 'tid.x == master'. */
1119 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1121 fprintf (file, "\t}\n");
1122 need_unisimt_decl = true;
1125 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1127 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1128 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1130 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1131 __nvptx_uni[tid.y] = 0;
1132 gomp_nvptx_main (ORIG, arg);
1134 ORIG itself should not be emitted as a PTX .entry function. */
1136 static void
1137 write_omp_entry (FILE *file, const char *name, const char *orig)
1139 static bool gomp_nvptx_main_declared;
1140 if (!gomp_nvptx_main_declared)
1142 gomp_nvptx_main_declared = true;
1143 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1144 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1145 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1147 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1148 #define NTID_Y "%ntid.y"
1149 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1150 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1151 {\n\
1152 .reg.u32 %r<3>;\n\
1153 .reg.u" PS " %R<4>;\n\
1154 mov.u32 %r0, %tid.y;\n\
1155 mov.u32 %r1, " NTID_Y ";\n\
1156 mov.u32 %r2, %ctaid.x;\n\
1157 cvt.u" PS ".u32 %R1, %r0;\n\
1158 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1159 mov.u" PS " %R0, __nvptx_stacks;\n\
1160 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1161 ld.param.u" PS " %R2, [%stack];\n\
1162 ld.param.u" PS " %R3, [%sz];\n\
1163 add.u" PS " %R2, %R2, %R3;\n\
1164 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1165 st.shared.u" PS " [%R0], %R2;\n\
1166 mov.u" PS " %R0, __nvptx_uni;\n\
1167 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1168 mov.u32 %r0, 0;\n\
1169 st.shared.u32 [%R0], %r0;\n\
1170 mov.u" PS " %R0, \0;\n\
1171 ld.param.u" PS " %R1, [%arg];\n\
1172 {\n\
1173 .param.u" PS " %P<2>;\n\
1174 st.param.u" PS " [%P0], %R0;\n\
1175 st.param.u" PS " [%P1], %R1;\n\
1176 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1177 }\n\
1178 ret.uni;\n\
1179 }\n"
1180 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1181 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1182 #undef ENTRY_TEMPLATE
1183 #undef NTID_Y
1184 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1185 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1186 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1187 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1188 need_softstack_decl = need_unisimt_decl = true;
1191 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1192 function, including local var decls and copies from the arguments to
1193 local regs. */
1195 void
1196 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1198 tree fntype = TREE_TYPE (decl);
1199 tree result_type = TREE_TYPE (fntype);
1200 int argno = 0;
1202 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1203 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1205 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1206 sprintf (buf, "%s$impl", name);
1207 write_omp_entry (file, name, buf);
1208 name = buf;
1210 /* We construct the initial part of the function into a string
1211 stream, in order to share the prototype writing code. */
1212 std::stringstream s;
1213 write_fn_proto (s, true, name, decl);
1214 s << "{\n";
1216 bool return_in_mem = write_return_type (s, false, result_type);
1217 if (return_in_mem)
1218 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1220 /* Declare and initialize incoming arguments. */
1221 tree args = TYPE_ARG_TYPES (fntype);
1222 bool prototyped = true;
1223 if (!args)
1225 args = DECL_ARGUMENTS (decl);
1226 prototyped = false;
1229 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1231 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1233 argno = write_arg_type (s, 0, argno, type, prototyped);
1236 if (stdarg_p (fntype))
1237 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1238 true);
1240 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1241 write_arg_type (s, STATIC_CHAIN_REGNUM,
1242 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1243 true);
1245 fprintf (file, "%s", s.str().c_str());
1247 /* Usually 'crtl->is_leaf' is computed during register allocator
1248 initialization (which is not done on NVPTX) or for pressure-sensitive
1249 optimizations. Initialize it here, except if already set. */
1250 if (!crtl->is_leaf)
1251 crtl->is_leaf = leaf_function_p ();
1253 HOST_WIDE_INT sz = get_frame_size ();
1254 bool need_frameptr = sz || cfun->machine->has_chain;
1255 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1256 if (!TARGET_SOFT_STACK)
1258 /* Declare a local var for outgoing varargs. */
1259 if (cfun->machine->has_varadic)
1260 init_frame (file, STACK_POINTER_REGNUM,
1261 UNITS_PER_WORD, crtl->outgoing_args_size);
1263 /* Declare a local variable for the frame. Force its size to be
1264 DImode-compatible. */
1265 if (need_frameptr)
1266 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1267 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1269 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1270 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1271 init_softstack_frame (file, alignment, sz);
1273 if (cfun->machine->has_simtreg)
1275 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1276 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1277 align = MAX (align, GET_MODE_SIZE (DImode));
1278 if (!crtl->is_leaf || cfun->calls_alloca)
1279 simtsz = HOST_WIDE_INT_M1U;
1280 if (simtsz == HOST_WIDE_INT_M1U)
1281 simtsz = nvptx_softstack_size;
1282 if (cfun->machine->has_softstack)
1283 simtsz += POINTER_SIZE / 8;
1284 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1285 if (align > GET_MODE_SIZE (DImode))
1286 simtsz += align - GET_MODE_SIZE (DImode);
1287 if (simtsz)
1288 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1289 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1291 /* Declare the pseudos we have as ptx registers. */
1292 int maxregs = max_reg_num ();
1293 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1295 if (regno_reg_rtx[i] != const0_rtx)
1297 machine_mode mode = PSEUDO_REGNO_MODE (i);
1298 machine_mode split = maybe_split_mode (mode);
1300 if (split_mode_p (mode))
1301 mode = split;
1302 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1303 output_reg (file, i, split, -2);
1304 fprintf (file, ";\n");
1308 /* Emit axis predicates. */
1309 if (cfun->machine->axis_predicate[0])
1310 nvptx_init_axis_predicate (file,
1311 REGNO (cfun->machine->axis_predicate[0]), "y");
1312 if (cfun->machine->axis_predicate[1])
1313 nvptx_init_axis_predicate (file,
1314 REGNO (cfun->machine->axis_predicate[1]), "x");
1315 if (cfun->machine->unisimt_predicate
1316 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1317 nvptx_init_unisimt_predicate (file);
1320 /* Output code for switching uniform-simt state. ENTERING indicates whether
1321 we are entering or leaving non-uniform execution region. */
1323 static void
1324 nvptx_output_unisimt_switch (FILE *file, bool entering)
1326 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1327 return;
1328 fprintf (file, "\t{\n");
1329 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1330 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1331 if (!crtl->is_leaf)
1333 int loc = REGNO (cfun->machine->unisimt_location);
1334 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1336 if (cfun->machine->unisimt_predicate)
1338 int master = REGNO (cfun->machine->unisimt_master);
1339 int pred = REGNO (cfun->machine->unisimt_predicate);
1340 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1341 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1342 master, entering ? "%ustmp2" : "0");
1343 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1345 fprintf (file, "\t}\n");
1348 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1349 ENTERING indicates whether we are entering or leaving non-uniform execution.
1350 PTR is the register pointing to allocated storage, it is assigned to on
1351 entering and used to restore state on leaving. SIZE and ALIGN are used only
1352 on entering. */
1354 static void
1355 nvptx_output_softstack_switch (FILE *file, bool entering,
1356 rtx ptr, rtx size, rtx align)
1358 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1359 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1360 return;
1361 int bits = POINTER_SIZE, regno = REGNO (ptr);
1362 fprintf (file, "\t{\n");
1363 if (entering)
1365 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1366 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1367 cfun->machine->simt_stack_size);
1368 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1369 if (CONST_INT_P (size))
1370 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1371 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1372 else
1373 output_reg (file, REGNO (size), VOIDmode);
1374 fputs (";\n", file);
1375 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1376 fprintf (file,
1377 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1378 bits, regno, regno, UINTVAL (align));
1380 if (cfun->machine->has_softstack)
1382 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1383 if (entering)
1385 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1386 bits, regno, bits / 8, reg_stack);
1387 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1388 bits, reg_stack, regno, bits / 8);
1390 else
1392 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1393 bits, reg_stack, regno, bits / 8);
1395 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1397 fprintf (file, "\t}\n");
1400 /* Output code to enter non-uniform execution region. DEST is a register
1401 to hold a per-lane allocation given by SIZE and ALIGN. */
1403 const char *
1404 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1406 nvptx_output_unisimt_switch (asm_out_file, true);
1407 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1408 return "";
1411 /* Output code to leave non-uniform execution region. SRC is the register
1412 holding per-lane storage previously allocated by omp_simt_enter insn. */
1414 const char *
1415 nvptx_output_simt_exit (rtx src)
1417 nvptx_output_unisimt_switch (asm_out_file, false);
1418 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1419 return "";
1422 /* Output instruction that sets soft stack pointer in shared memory to the
1423 value in register given by SRC_REGNO. */
1425 const char *
1426 nvptx_output_set_softstack (unsigned src_regno)
1428 if (cfun->machine->has_softstack && !crtl->is_leaf)
1430 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1431 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1432 output_reg (asm_out_file, src_regno, VOIDmode);
1433 fprintf (asm_out_file, ";\n");
1435 return "";
1437 /* Output a return instruction. Also copy the return value to its outgoing
1438 location. */
1440 const char *
1441 nvptx_output_return (void)
1443 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1445 if (mode != VOIDmode)
1446 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1447 nvptx_ptx_type_from_mode (mode, false),
1448 reg_names[NVPTX_RETURN_REGNUM],
1449 reg_names[NVPTX_RETURN_REGNUM]);
1451 return "ret;";
1454 /* Terminate a function by writing a closing brace to FILE. */
1456 void
1457 nvptx_function_end (FILE *file)
1459 fprintf (file, "}\n");
1462 /* Decide whether we can make a sibling call to a function. For ptx, we
1463 can't. */
1465 static bool
1466 nvptx_function_ok_for_sibcall (tree, tree)
1468 return false;
1471 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1473 static rtx
1474 nvptx_get_drap_rtx (void)
1476 if (TARGET_SOFT_STACK && stack_realign_drap)
1477 return arg_pointer_rtx;
1478 return NULL_RTX;
1481 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1482 argument to the next call. */
1484 static void
1485 nvptx_call_args (rtx arg, tree fntype)
1487 if (!cfun->machine->doing_call)
1489 cfun->machine->doing_call = true;
1490 cfun->machine->is_varadic = false;
1491 cfun->machine->num_args = 0;
1493 if (fntype && stdarg_p (fntype))
1495 cfun->machine->is_varadic = true;
1496 cfun->machine->has_varadic = true;
1497 cfun->machine->num_args++;
1501 if (REG_P (arg) && arg != pc_rtx)
1503 cfun->machine->num_args++;
1504 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1505 cfun->machine->call_args);
1509 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1510 information we recorded. */
1512 static void
1513 nvptx_end_call_args (void)
1515 cfun->machine->doing_call = false;
1516 free_EXPR_LIST_list (&cfun->machine->call_args);
1519 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1520 track of whether calls involving static chains or varargs were seen
1521 in the current function.
1522 For libcalls, maintain a hash table of decls we have seen, and
1523 record a function decl for later when encountering a new one. */
1525 void
1526 nvptx_expand_call (rtx retval, rtx address)
1528 rtx callee = XEXP (address, 0);
1529 rtx varargs = NULL_RTX;
1530 unsigned parallel = 0;
1532 if (!call_insn_operand (callee, Pmode))
1534 callee = force_reg (Pmode, callee);
1535 address = change_address (address, QImode, callee);
1538 if (GET_CODE (callee) == SYMBOL_REF)
1540 tree decl = SYMBOL_REF_DECL (callee);
1541 if (decl != NULL_TREE)
1543 if (DECL_STATIC_CHAIN (decl))
1544 cfun->machine->has_chain = true;
1546 tree attr = oacc_get_fn_attrib (decl);
1547 if (attr)
1549 tree dims = TREE_VALUE (attr);
1551 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1552 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1554 if (TREE_PURPOSE (dims)
1555 && !integer_zerop (TREE_PURPOSE (dims)))
1556 break;
1557 /* Not on this axis. */
1558 parallel ^= GOMP_DIM_MASK (ix);
1559 dims = TREE_CHAIN (dims);
1565 unsigned nargs = cfun->machine->num_args;
1566 if (cfun->machine->is_varadic)
1568 varargs = gen_reg_rtx (Pmode);
1569 emit_move_insn (varargs, stack_pointer_rtx);
1572 rtvec vec = rtvec_alloc (nargs + 1);
1573 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1574 int vec_pos = 0;
1576 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1577 rtx tmp_retval = retval;
1578 if (retval)
1580 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1581 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1582 call = gen_rtx_SET (tmp_retval, call);
1584 XVECEXP (pat, 0, vec_pos++) = call;
1586 /* Construct the call insn, including a USE for each argument pseudo
1587 register. These will be used when printing the insn. */
1588 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1589 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1591 if (varargs)
1592 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1594 gcc_assert (vec_pos = XVECLEN (pat, 0));
1596 nvptx_emit_forking (parallel, true);
1597 emit_call_insn (pat);
1598 nvptx_emit_joining (parallel, true);
1600 if (tmp_retval != retval)
1601 emit_move_insn (retval, tmp_retval);
1604 /* Emit a comparison COMPARE, and return the new test to be used in the
1605 jump. */
1608 nvptx_expand_compare (rtx compare)
1610 rtx pred = gen_reg_rtx (BImode);
1611 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1612 XEXP (compare, 0), XEXP (compare, 1));
1613 emit_insn (gen_rtx_SET (pred, cmp));
1614 return gen_rtx_NE (BImode, pred, const0_rtx);
1617 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1619 void
1620 nvptx_expand_oacc_fork (unsigned mode)
1622 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1625 void
1626 nvptx_expand_oacc_join (unsigned mode)
1628 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1631 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1632 objects. */
1634 static rtx
1635 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1637 rtx res;
1639 switch (GET_MODE (src))
1641 case E_DImode:
1642 res = gen_unpackdisi2 (dst0, dst1, src);
1643 break;
1644 case E_DFmode:
1645 res = gen_unpackdfsi2 (dst0, dst1, src);
1646 break;
1647 default: gcc_unreachable ();
1649 return res;
1652 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1653 object. */
1655 static rtx
1656 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1658 rtx res;
1660 switch (GET_MODE (dst))
1662 case E_DImode:
1663 res = gen_packsidi2 (dst, src0, src1);
1664 break;
1665 case E_DFmode:
1666 res = gen_packsidf2 (dst, src0, src1);
1667 break;
1668 default: gcc_unreachable ();
1670 return res;
1673 /* Generate an instruction or sequence to broadcast register REG
1674 across the vectors of a single warp. */
1677 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1679 rtx res;
1681 switch (GET_MODE (dst))
1683 case E_SImode:
1684 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1685 break;
1686 case E_SFmode:
1687 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1688 break;
1689 case E_DImode:
1690 case E_DFmode:
1692 rtx tmp0 = gen_reg_rtx (SImode);
1693 rtx tmp1 = gen_reg_rtx (SImode);
1695 start_sequence ();
1696 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1697 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1698 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1699 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1700 res = get_insns ();
1701 end_sequence ();
1703 break;
1704 case E_BImode:
1706 rtx tmp = gen_reg_rtx (SImode);
1708 start_sequence ();
1709 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1710 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1711 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1712 res = get_insns ();
1713 end_sequence ();
1715 break;
1716 case E_QImode:
1717 case E_HImode:
1719 rtx tmp = gen_reg_rtx (SImode);
1721 start_sequence ();
1722 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1723 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1724 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1725 tmp)));
1726 res = get_insns ();
1727 end_sequence ();
1729 break;
1731 default:
1732 gcc_unreachable ();
1734 return res;
1737 /* Generate an instruction or sequence to broadcast register REG
1738 across the vectors of a single warp. */
1740 static rtx
1741 nvptx_gen_vcast (rtx reg)
1743 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1746 /* Structure used when generating a worker-level spill or fill. */
1748 struct wcast_data_t
1750 rtx base; /* Register holding base addr of buffer. */
1751 rtx ptr; /* Iteration var, if needed. */
1752 unsigned offset; /* Offset into worker buffer. */
1755 /* Direction of the spill/fill and looping setup/teardown indicator. */
1757 enum propagate_mask
1759 PM_read = 1 << 0,
1760 PM_write = 1 << 1,
1761 PM_loop_begin = 1 << 2,
1762 PM_loop_end = 1 << 3,
1764 PM_read_write = PM_read | PM_write
1767 /* Generate instruction(s) to spill or fill register REG to/from the
1768 worker broadcast array. PM indicates what is to be done, REP
1769 how many loop iterations will be executed (0 for not a loop). */
1771 static rtx
1772 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1774 rtx res;
1775 machine_mode mode = GET_MODE (reg);
1777 switch (mode)
1779 case E_BImode:
1781 rtx tmp = gen_reg_rtx (SImode);
1783 start_sequence ();
1784 if (pm & PM_read)
1785 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1786 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1787 if (pm & PM_write)
1788 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1789 res = get_insns ();
1790 end_sequence ();
1792 break;
1794 default:
1796 rtx addr = data->ptr;
1798 if (!addr)
1800 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1802 if (align > worker_bcast_align)
1803 worker_bcast_align = align;
1804 data->offset = (data->offset + align - 1) & ~(align - 1);
1805 addr = data->base;
1806 if (data->offset)
1807 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1810 addr = gen_rtx_MEM (mode, addr);
1811 if (pm == PM_read)
1812 res = gen_rtx_SET (addr, reg);
1813 else if (pm == PM_write)
1814 res = gen_rtx_SET (reg, addr);
1815 else
1816 gcc_unreachable ();
1818 if (data->ptr)
1820 /* We're using a ptr, increment it. */
1821 start_sequence ();
1823 emit_insn (res);
1824 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1825 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1826 res = get_insns ();
1827 end_sequence ();
1829 else
1830 rep = 1;
1831 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1833 break;
1835 return res;
1838 /* Returns true if X is a valid address for use in a memory reference. */
1840 static bool
1841 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1843 enum rtx_code code = GET_CODE (x);
1845 switch (code)
1847 case REG:
1848 return true;
1850 case PLUS:
1851 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1852 return true;
1853 return false;
1855 case CONST:
1856 case SYMBOL_REF:
1857 case LABEL_REF:
1858 return true;
1860 default:
1861 return false;
1865 /* Machinery to output constant initializers. When beginning an
1866 initializer, we decide on a fragment size (which is visible in ptx
1867 in the type used), and then all initializer data is buffered until
1868 a fragment is filled and ready to be written out. */
1870 static struct
1872 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1873 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1874 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1875 out. */
1876 unsigned size; /* Fragment size to accumulate. */
1877 unsigned offset; /* Offset within current fragment. */
1878 bool started; /* Whether we've output any initializer. */
1879 } init_frag;
1881 /* The current fragment is full, write it out. SYM may provide a
1882 symbolic reference we should output, in which case the fragment
1883 value is the addend. */
1885 static void
1886 output_init_frag (rtx sym)
1888 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1889 unsigned HOST_WIDE_INT val = init_frag.val;
1891 init_frag.started = true;
1892 init_frag.val = 0;
1893 init_frag.offset = 0;
1894 init_frag.remaining--;
1896 if (sym)
1898 fprintf (asm_out_file, "generic(");
1899 output_address (VOIDmode, sym);
1900 fprintf (asm_out_file, val ? ") + " : ")");
1903 if (!sym || val)
1904 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1907 /* Add value VAL of size SIZE to the data we're emitting, and keep
1908 writing out chunks as they fill up. */
1910 static void
1911 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1913 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1915 for (unsigned part = 0; size; size -= part)
1917 val >>= part * BITS_PER_UNIT;
1918 part = init_frag.size - init_frag.offset;
1919 if (part > size)
1920 part = size;
1922 unsigned HOST_WIDE_INT partial
1923 = val << (init_frag.offset * BITS_PER_UNIT);
1924 init_frag.val |= partial & init_frag.mask;
1925 init_frag.offset += part;
1927 if (init_frag.offset == init_frag.size)
1928 output_init_frag (NULL);
1932 /* Target hook for assembling integer object X of size SIZE. */
1934 static bool
1935 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1937 HOST_WIDE_INT val = 0;
1939 switch (GET_CODE (x))
1941 default:
1942 /* Let the generic machinery figure it out, usually for a
1943 CONST_WIDE_INT. */
1944 return false;
1946 case CONST_INT:
1947 nvptx_assemble_value (INTVAL (x), size);
1948 break;
1950 case CONST:
1951 x = XEXP (x, 0);
1952 gcc_assert (GET_CODE (x) == PLUS);
1953 val = INTVAL (XEXP (x, 1));
1954 x = XEXP (x, 0);
1955 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1956 /* FALLTHROUGH */
1958 case SYMBOL_REF:
1959 gcc_assert (size == init_frag.size);
1960 if (init_frag.offset)
1961 sorry ("cannot emit unaligned pointers in ptx assembly");
1963 nvptx_maybe_record_fnsym (x);
1964 init_frag.val = val;
1965 output_init_frag (x);
1966 break;
1969 return true;
1972 /* Output SIZE zero bytes. We ignore the FILE argument since the
1973 functions we're calling to perform the output just use
1974 asm_out_file. */
1976 void
1977 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1979 /* Finish the current fragment, if it's started. */
1980 if (init_frag.offset)
1982 unsigned part = init_frag.size - init_frag.offset;
1983 if (part > size)
1984 part = (unsigned) size;
1985 size -= part;
1986 nvptx_assemble_value (0, part);
1989 /* If this skip doesn't terminate the initializer, write as many
1990 remaining pieces as possible directly. */
1991 if (size < init_frag.remaining * init_frag.size)
1993 while (size >= init_frag.size)
1995 size -= init_frag.size;
1996 output_init_frag (NULL_RTX);
1998 if (size)
1999 nvptx_assemble_value (0, size);
2003 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2004 ignore the FILE arg. */
2006 void
2007 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2009 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2010 nvptx_assemble_value (str[i], 1);
2013 /* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, the object size is SIZE and the alignment
   is ALIGN.  The caller has already emitted any indentation and
   linkage specifier.  It is responsible for any initializer,
   terminating ; and newline.  SIZE is in bytes, ALIGN is in bits --
   confusingly, the opposite way round from what PTX wants!  */
2021 static void
2022 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2023 const_tree type, HOST_WIDE_INT size, unsigned align)
2025 while (TREE_CODE (type) == ARRAY_TYPE)
2026 type = TREE_TYPE (type);
2028 if (TREE_CODE (type) == VECTOR_TYPE
2029 || TREE_CODE (type) == COMPLEX_TYPE)
2030 /* Neither vector nor complex types can contain the other. */
2031 type = TREE_TYPE (type);
2033 unsigned elt_size = int_size_in_bytes (type);
  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if they'll contain pointer constants, so we have to
     choose the pointer size; otherwise we can choose DImode.  */
2038 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2040 elt_size |= GET_MODE_SIZE (elt_mode);
2041 elt_size &= -elt_size; /* Extract LSB set. */
2043 init_frag.size = elt_size;
2044 /* Avoid undefined shift behavior by using '2'. */
2045 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2046 << (elt_size * BITS_PER_UNIT - 1)) - 1;
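  /* For example, elt_size == 4 gives (2 << 31) - 1 == 0xffffffff;
     for elt_size == 8 the unsigned arithmetic wraps to all-ones,
     whereas the naive '1 << 64' would be undefined behavior.  */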
2047 init_frag.val = 0;
2048 init_frag.offset = 0;
2049 init_frag.started = false;
  /* SIZE might not be a multiple of the element size, if there's an
     initialized trailing struct array whose element type is smaller
     than elt_size.  */
2053 init_frag.remaining = (size + elt_size - 1) / elt_size;
2055 fprintf (file, "%s .align %d .u%d ",
2056 section, align / BITS_PER_UNIT,
2057 elt_size * BITS_PER_UNIT);
2058 assemble_name (file, name);
2060 if (size)
2061 /* We make everything an array, to simplify any initialization
2062 emission. */
2063 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2066 /* Called when the initializer for a decl has been completely output through
2067 combinations of the three functions above. */
2069 static void
2070 nvptx_assemble_decl_end (void)
2072 if (init_frag.offset)
/* This can happen with a packed struct with a trailing array member.  */
2074 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2075 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2078 /* Output an uninitialized common or file-scope variable. */
2080 void
2081 nvptx_output_aligned_decl (FILE *file, const char *name,
2082 const_tree decl, HOST_WIDE_INT size, unsigned align)
2084 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2086 /* If this is public, it is common. The nearest thing we have to
2087 common is weak. */
2088 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2090 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2091 TREE_TYPE (decl), size, align);
2092 nvptx_assemble_decl_end ();
2095 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2096 writing a constant variable EXP with NAME and SIZE and its
2097 initializer to FILE. */
2099 static void
2100 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2101 const_tree exp, HOST_WIDE_INT obj_size)
2103 write_var_marker (file, true, false, name);
2105 fprintf (file, "\t");
2107 tree type = TREE_TYPE (exp);
2108 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2109 TYPE_ALIGN (type));
2112 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2113 a variable DECL with NAME to FILE. */
2115 void
2116 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2118 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2120 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2121 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2123 tree type = TREE_TYPE (decl);
2124 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2125 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2126 type, obj_size, DECL_ALIGN (decl));
2129 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2131 static void
2132 nvptx_globalize_label (FILE *, const char *)
2136 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2137 declaration only for variable DECL with NAME to FILE. */
2139 static void
2140 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2142 /* The middle end can place constant pool decls into the varpool as
2143 undefined. Until that is fixed, catch the problem here. */
2144 if (DECL_IN_CONSTANT_POOL (decl))
2145 return;
/* We support weak definitions, and hence have the right
2148 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2149 if (DECL_WEAK (decl))
2150 error_at (DECL_SOURCE_LOCATION (decl),
2151 "PTX does not support weak declarations"
2152 " (only weak definitions)");
2153 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2155 fprintf (file, "\t.extern ");
2156 tree size = DECL_SIZE_UNIT (decl);
2157 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2158 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2159 DECL_ALIGN (decl));
2160 nvptx_assemble_decl_end ();
2163 /* Output a pattern for a move instruction. */
2165 const char *
2166 nvptx_output_mov_insn (rtx dst, rtx src)
2168 machine_mode dst_mode = GET_MODE (dst);
2169 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2170 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2171 machine_mode src_inner = (GET_CODE (src) == SUBREG
2172 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2174 rtx sym = src;
2175 if (GET_CODE (sym) == CONST)
2176 sym = XEXP (XEXP (sym, 0), 0);
2177 if (SYMBOL_REF_P (sym))
2179 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2180 return "%.\tcvta%D1%t0\t%0, %1;";
2181 nvptx_maybe_record_fnsym (sym);
2184 if (src_inner == dst_inner)
2185 return "%.\tmov%t0\t%0, %1;";
2187 if (CONSTANT_P (src))
2188 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2189 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2190 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2192 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2194 if (GET_MODE_BITSIZE (dst_mode) == 128
2195 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2197 /* mov.b128 is not supported. */
2198 if (dst_inner == V2DImode && src_inner == TImode)
2199 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2200 else if (dst_inner == TImode && src_inner == V2DImode)
2201 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2203 gcc_unreachable ();
2205 return "%.\tmov.b%T0\t%0, %1;";
2208 return "%.\tcvt%t0%t1\t%0, %1;";
2211 static void nvptx_print_operand (FILE *, rtx, int);
2213 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2214 involves writing .param declarations and in/out copies into them. For
2215 indirect calls, also write the .callprototype. */
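/* As a rough, invented illustration (register numbers and exact types
   will differ in real output), a direct call "x = foo (i)" for int
   values is wrapped like so:

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32 %r24, [%value_in];
	}

   (a sketch only; the types follow nvptx_ptx_type_from_mode).  */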
2217 const char *
2218 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2220 char buf[16];
2221 static int labelno;
2222 bool needs_tgt = register_operand (callee, Pmode);
2223 rtx pat = PATTERN (insn);
2224 if (GET_CODE (pat) == COND_EXEC)
2225 pat = COND_EXEC_CODE (pat);
2226 int arg_end = XVECLEN (pat, 0);
2227 tree decl = NULL_TREE;
2229 fprintf (asm_out_file, "\t{\n");
2230 if (result != NULL)
2231 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2232 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2233 reg_names[NVPTX_RETURN_REGNUM]);
2235 /* Ensure we have a ptx declaration in the output if necessary. */
2236 if (GET_CODE (callee) == SYMBOL_REF)
2238 decl = SYMBOL_REF_DECL (callee);
2239 if (!decl
2240 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2241 nvptx_record_libfunc (callee, result, pat);
2242 else if (DECL_EXTERNAL (decl))
2243 nvptx_record_fndecl (decl);
2246 if (needs_tgt)
2248 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2249 labelno++;
2250 ASM_OUTPUT_LABEL (asm_out_file, buf);
2251 std::stringstream s;
2252 write_fn_proto_from_insn (s, NULL, result, pat);
2253 fputs (s.str().c_str(), asm_out_file);
2256 for (int argno = 1; argno < arg_end; argno++)
2258 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2259 machine_mode mode = GET_MODE (t);
2260 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2262 /* Mode splitting has already been done. */
2263 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2264 "\t\tst.param%s [%%out_arg%d], ",
2265 ptx_type, argno, ptx_type, argno);
2266 output_reg (asm_out_file, REGNO (t), VOIDmode);
2267 fprintf (asm_out_file, ";\n");
2270 /* The '.' stands for the call's predicate, if any. */
2271 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2272 fprintf (asm_out_file, "\t\tcall ");
2273 if (result != NULL_RTX)
2274 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2276 if (decl)
2278 const char *name = get_fnname_from_decl (decl);
2279 name = nvptx_name_replacement (name);
2280 assemble_name (asm_out_file, name);
2282 else
2283 output_address (VOIDmode, callee);
2285 const char *open = "(";
2286 for (int argno = 1; argno < arg_end; argno++)
2288 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2289 open = "";
2291 if (decl && DECL_STATIC_CHAIN (decl))
2293 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2294 open = "";
2296 if (!open[0])
2297 fprintf (asm_out_file, ")");
2299 if (needs_tgt)
2301 fprintf (asm_out_file, ", ");
2302 assemble_name (asm_out_file, buf);
2304 fprintf (asm_out_file, ";\n");
2306 if (find_reg_note (insn, REG_NORETURN, NULL))
      /* Noreturn functions confuse the PTX JIT, as it doesn't realize
	 the flow-control barrier they imply.  It can segfault if it
	 encounters what looks like an unexitable loop.  Emit a trailing
	 trap and exit, which it does grok.  */
2312 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2313 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2316 if (result)
2318 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2320 if (!rval[0])
2321 /* We must escape the '%' that starts RETURN_REGNUM. */
2322 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2323 reg_names[NVPTX_RETURN_REGNUM]);
2324 return rval;
2327 return "}";
2330 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2332 static bool
2333 nvptx_print_operand_punct_valid_p (unsigned char c)
  return c == '.' || c == '#';
2338 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2340 static void
2341 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2343 rtx off;
2344 if (GET_CODE (x) == CONST)
2345 x = XEXP (x, 0);
2346 switch (GET_CODE (x))
2348 case PLUS:
2349 off = XEXP (x, 1);
2350 output_address (VOIDmode, XEXP (x, 0));
2351 fprintf (file, "+");
2352 output_address (VOIDmode, off);
2353 break;
2355 case SYMBOL_REF:
2356 case LABEL_REF:
2357 output_addr_const (file, x);
2358 break;
2360 default:
2361 gcc_assert (GET_CODE (x) != MEM);
2362 nvptx_print_operand (file, x, 0);
2363 break;
2367 /* Write assembly language output for the address ADDR to FILE. */
2369 static void
2370 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2372 nvptx_print_address_operand (file, addr, mode);
2375 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2377 Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
	unconditional one.
2380 # -- print a rounding mode for the instruction
2382 A -- print a data area for a MEM
2383 c -- print an opcode suffix for a comparison operator, including a type code
2384 D -- print a data area for a MEM operand
   H -- print the upper half of a multi-word register operand
   j -- print '@' followed by the predicate operand (branch if true)
   J -- print '@!' followed by the predicate operand (branch if false)
   L -- print the lower half of a multi-word register operand
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.  */
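/* For instance (a hedged sketch; register numbers invented), the
   template "%.\tmov%t0\t%0, %1;" from nvptx_output_mov_insn would
   print, for an SImode move predicated on %r15, as

	@%r15	mov.u32	%r1, %r2;  */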
2390 static void
2391 nvptx_print_operand (FILE *file, rtx x, int code)
2393 if (code == '.')
2395 x = current_insn_predicate;
2396 if (x)
2398 fputs ("@", file);
2399 if (GET_CODE (x) == EQ)
2400 fputs ("!", file);
2401 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2403 return;
2405 else if (code == '#')
2407 fputs (".rn", file);
2408 return;
2411 enum rtx_code x_code = GET_CODE (x);
2412 machine_mode mode = GET_MODE (x);
2414 switch (code)
2416 case 'A':
2417 x = XEXP (x, 0);
2418 /* FALLTHROUGH. */
2420 case 'D':
2421 if (GET_CODE (x) == CONST)
2422 x = XEXP (x, 0);
2423 if (GET_CODE (x) == PLUS)
2424 x = XEXP (x, 0);
2426 if (GET_CODE (x) == SYMBOL_REF)
2427 fputs (section_for_sym (x), file);
2428 break;
2430 case 't':
2431 case 'u':
2432 if (x_code == SUBREG)
2434 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2435 if (VECTOR_MODE_P (inner_mode)
2436 && (GET_MODE_SIZE (mode)
2437 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2438 mode = GET_MODE_INNER (inner_mode);
2439 else if (split_mode_p (inner_mode))
2440 mode = maybe_split_mode (inner_mode);
2441 else
2442 mode = inner_mode;
2444 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2445 break;
2447 case 'H':
2448 case 'L':
2450 rtx inner_x = SUBREG_REG (x);
2451 machine_mode inner_mode = GET_MODE (inner_x);
2452 machine_mode split = maybe_split_mode (inner_mode);
2454 output_reg (file, REGNO (inner_x), split,
2455 (code == 'H'
2456 ? GET_MODE_SIZE (inner_mode) / 2
2457 : 0));
2459 break;
2461 case 'S':
2463 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2464 /* Same order as nvptx_shuffle_kind. */
2465 static const char *const kinds[] =
2466 {".up", ".down", ".bfly", ".idx"};
2467 fputs (kinds[kind], file);
2469 break;
2471 case 'T':
2472 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2473 break;
2475 case 'j':
2476 fprintf (file, "@");
2477 goto common;
2479 case 'J':
2480 fprintf (file, "@!");
2481 goto common;
2483 case 'c':
2484 mode = GET_MODE (XEXP (x, 0));
2485 switch (x_code)
2487 case EQ:
2488 fputs (".eq", file);
2489 break;
2490 case NE:
2491 if (FLOAT_MODE_P (mode))
2492 fputs (".neu", file);
2493 else
2494 fputs (".ne", file);
2495 break;
2496 case LE:
2497 case LEU:
2498 fputs (".le", file);
2499 break;
2500 case GE:
2501 case GEU:
2502 fputs (".ge", file);
2503 break;
2504 case LT:
2505 case LTU:
2506 fputs (".lt", file);
2507 break;
2508 case GT:
2509 case GTU:
2510 fputs (".gt", file);
2511 break;
2512 case LTGT:
2513 fputs (".ne", file);
2514 break;
2515 case UNEQ:
2516 fputs (".equ", file);
2517 break;
2518 case UNLE:
2519 fputs (".leu", file);
2520 break;
2521 case UNGE:
2522 fputs (".geu", file);
2523 break;
2524 case UNLT:
2525 fputs (".ltu", file);
2526 break;
2527 case UNGT:
2528 fputs (".gtu", file);
2529 break;
2530 case UNORDERED:
2531 fputs (".nan", file);
2532 break;
2533 case ORDERED:
2534 fputs (".num", file);
2535 break;
2536 default:
2537 gcc_unreachable ();
2539 if (FLOAT_MODE_P (mode)
2540 || x_code == EQ || x_code == NE
2541 || x_code == GEU || x_code == GTU
2542 || x_code == LEU || x_code == LTU)
2543 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2544 else
2545 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2546 break;
2547 default:
2548 common:
2549 switch (x_code)
2551 case SUBREG:
2553 rtx inner_x = SUBREG_REG (x);
2554 machine_mode inner_mode = GET_MODE (inner_x);
2555 machine_mode split = maybe_split_mode (inner_mode);
2557 if (VECTOR_MODE_P (inner_mode)
2558 && (GET_MODE_SIZE (mode)
2559 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2561 output_reg (file, REGNO (inner_x), VOIDmode);
2562 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2564 else if (split_mode_p (inner_mode)
2565 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2566 output_reg (file, REGNO (inner_x), split);
2567 else
2568 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2570 break;
2572 case REG:
2573 output_reg (file, REGNO (x), maybe_split_mode (mode));
2574 break;
2576 case MEM:
2577 fputc ('[', file);
2578 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2579 fputc (']', file);
2580 break;
2582 case CONST_INT:
2583 output_addr_const (file, x);
2584 break;
2586 case CONST:
2587 case SYMBOL_REF:
2588 case LABEL_REF:
2589 /* We could use output_addr_const, but that can print things like
2590 "x-8", which breaks ptxas. Need to ensure it is output as
2591 "x+-8". */
2592 nvptx_print_address_operand (file, x, VOIDmode);
2593 break;
2595 case CONST_DOUBLE:
2596 long vals[2];
2597 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2598 vals[0] &= 0xffffffff;
2599 vals[1] &= 0xffffffff;
2600 if (mode == SFmode)
2601 fprintf (file, "0f%08lx", vals[0]);
2602 else
2603 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2604 break;
2606 case CONST_VECTOR:
2608 unsigned n = CONST_VECTOR_NUNITS (x);
2609 fprintf (file, "{ ");
2610 for (unsigned i = 0; i < n; ++i)
2612 if (i != 0)
2613 fprintf (file, ", ");
2615 rtx elem = CONST_VECTOR_ELT (x, i);
2616 output_addr_const (file, elem);
2618 fprintf (file, " }");
2620 break;
2622 default:
2623 output_addr_const (file, x);
2628 /* Record replacement regs used to deal with subreg operands. */
2629 struct reg_replace
2631 rtx replacement[MAX_RECOG_OPERANDS];
2632 machine_mode mode;
2633 int n_allocated;
2634 int n_in_use;
2637 /* Allocate or reuse a replacement in R and return the rtx. */
2639 static rtx
2640 get_replacement (struct reg_replace *r)
2642 if (r->n_allocated == r->n_in_use)
2643 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2644 return r->replacement[r->n_in_use++];
2647 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2648 the presence of subregs would break the rules for most instructions.
2649 Replace them with a suitable new register of the right size, plus
2650 conversion copyin/copyout instructions. */
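/* Sketch of the rewrite (register numbers invented): a read operand
   (subreg:SI (reg:QI 40) 0) is replaced by a fresh (reg:SI 41) and a
   copy-in is emitted before the insn:

	(set (reg:SI 41) (zero_extend:SI (reg:QI 40)))

   A written operand instead gets a truncating copy-out after the
   insn.  */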
2652 static void
2653 nvptx_reorg_subreg (void)
2655 struct reg_replace qiregs, hiregs, siregs, diregs;
2656 rtx_insn *insn, *next;
2658 qiregs.n_allocated = 0;
2659 hiregs.n_allocated = 0;
2660 siregs.n_allocated = 0;
2661 diregs.n_allocated = 0;
2662 qiregs.mode = QImode;
2663 hiregs.mode = HImode;
2664 siregs.mode = SImode;
2665 diregs.mode = DImode;
2667 for (insn = get_insns (); insn; insn = next)
2669 next = NEXT_INSN (insn);
2670 if (!NONDEBUG_INSN_P (insn)
2671 || asm_noperands (PATTERN (insn)) >= 0
2672 || GET_CODE (PATTERN (insn)) == USE
2673 || GET_CODE (PATTERN (insn)) == CLOBBER)
2674 continue;
2676 qiregs.n_in_use = 0;
2677 hiregs.n_in_use = 0;
2678 siregs.n_in_use = 0;
2679 diregs.n_in_use = 0;
2680 extract_insn (insn);
2681 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2683 for (int i = 0; i < recog_data.n_operands; i++)
2685 rtx op = recog_data.operand[i];
2686 if (GET_CODE (op) != SUBREG)
2687 continue;
2689 rtx inner = SUBREG_REG (op);
2691 machine_mode outer_mode = GET_MODE (op);
2692 machine_mode inner_mode = GET_MODE (inner);
2693 gcc_assert (s_ok);
2694 if (s_ok
2695 && (GET_MODE_PRECISION (inner_mode)
2696 >= GET_MODE_PRECISION (outer_mode)))
2697 continue;
2698 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2699 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2700 : outer_mode == HImode ? &hiregs
2701 : outer_mode == SImode ? &siregs
2702 : &diregs);
2703 rtx new_reg = get_replacement (r);
2705 if (recog_data.operand_type[i] != OP_OUT)
2707 enum rtx_code code;
2708 if (GET_MODE_PRECISION (inner_mode)
2709 < GET_MODE_PRECISION (outer_mode))
2710 code = ZERO_EXTEND;
2711 else
2712 code = TRUNCATE;
2714 rtx pat = gen_rtx_SET (new_reg,
2715 gen_rtx_fmt_e (code, outer_mode, inner));
2716 emit_insn_before (pat, insn);
2719 if (recog_data.operand_type[i] != OP_IN)
2721 enum rtx_code code;
2722 if (GET_MODE_PRECISION (inner_mode)
2723 < GET_MODE_PRECISION (outer_mode))
2724 code = TRUNCATE;
2725 else
2726 code = ZERO_EXTEND;
2728 rtx pat = gen_rtx_SET (inner,
2729 gen_rtx_fmt_e (code, inner_mode, new_reg));
2730 emit_insn_after (pat, insn);
2732 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2737 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2738 first use. */
2740 static rtx
2741 nvptx_get_unisimt_master ()
2743 rtx &master = cfun->machine->unisimt_master;
2744 return master ? master : master = gen_reg_rtx (SImode);
2747 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2749 static rtx
2750 nvptx_get_unisimt_predicate ()
2752 rtx &pred = cfun->machine->unisimt_predicate;
2753 return pred ? pred : pred = gen_reg_rtx (BImode);
2756 /* Return true if given call insn references one of the functions provided by
2757 the CUDA runtime: malloc, free, vprintf. */
2759 static bool
2760 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2762 rtx pat = PATTERN (insn);
2763 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2764 pat = XVECEXP (pat, 0, 0);
2765 if (GET_CODE (pat) == SET)
2766 pat = SET_SRC (pat);
2767 gcc_checking_assert (GET_CODE (pat) == CALL
2768 && GET_CODE (XEXP (pat, 0)) == MEM);
2769 rtx addr = XEXP (XEXP (pat, 0), 0);
2770 if (GET_CODE (addr) != SYMBOL_REF)
2771 return false;
2772 const char *name = XSTR (addr, 0);
/* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2774 references with forced assembler name refer to PTX syscalls. For vprintf,
2775 accept both normal and forced-assembler-name references. */
2776 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2777 || !strcmp (name, "*malloc")
2778 || !strcmp (name, "*free"));
2781 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2782 propagate its value from lane MASTER to current lane. */
2784 static void
2785 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2787 rtx reg;
2788 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2789 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2792 /* Adjust code for uniform-simt code generation variant by making atomics and
2793 "syscalls" conditionally executed, and inserting shuffle-based propagation
2794 for registers being set. */
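/* Schematically (an RTL-level sketch): an atomic PARALLEL whose first
   element is (set (reg:SI 30) (...)) becomes

	(cond_exec (ne (reg:BI <pred>) (const_int 0)) <original PARALLEL>)

   followed by a SHUFFLE_IDX insn broadcasting (reg:SI 30) from the
   master lane to the other lanes.  */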
2796 static void
2797 nvptx_reorg_uniform_simt ()
2799 rtx_insn *insn, *next;
2801 for (insn = get_insns (); insn; insn = next)
2803 next = NEXT_INSN (insn);
2804 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2805 && !(NONJUMP_INSN_P (insn)
2806 && GET_CODE (PATTERN (insn)) == PARALLEL
2807 && get_attr_atomic (insn)))
2808 continue;
2809 rtx pat = PATTERN (insn);
2810 rtx master = nvptx_get_unisimt_master ();
2811 for (int i = 0; i < XVECLEN (pat, 0); i++)
2812 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2813 rtx pred = nvptx_get_unisimt_predicate ();
2814 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2815 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2816 validate_change (insn, &PATTERN (insn), pat, false);
2820 /* Loop structure of the function. The entire function is described as
2821 a NULL loop. */
2823 struct parallel
2825 /* Parent parallel. */
2826 parallel *parent;
2828 /* Next sibling parallel. */
2829 parallel *next;
2831 /* First child parallel. */
2832 parallel *inner;
2834 /* Partitioning mask of the parallel. */
2835 unsigned mask;
2837 /* Partitioning used within inner parallels. */
2838 unsigned inner_mask;
  /* Location of the parallel's forked and join.  The forked is the
     first block in the parallel and the join is the first block
     after the partition.  */
2843 basic_block forked_block;
2844 basic_block join_block;
2846 rtx_insn *forked_insn;
2847 rtx_insn *join_insn;
2849 rtx_insn *fork_insn;
2850 rtx_insn *joining_insn;
2852 /* Basic blocks in this parallel, but not in child parallels. The
2853 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2854 blocks are not. */
2855 auto_vec<basic_block> blocks;
2857 public:
2858 parallel (parallel *parent, unsigned mode);
2859 ~parallel ();
/* Constructor links the new parallel into its parent's chain of
2863 children. */
2865 parallel::parallel (parallel *parent_, unsigned mask_)
2866 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2868 forked_block = join_block = 0;
2869 forked_insn = join_insn = 0;
2870 fork_insn = joining_insn = 0;
2872 if (parent)
2874 next = parent->inner;
2875 parent->inner = this;
2879 parallel::~parallel ()
2881 delete inner;
2882 delete next;
/* Map of basic blocks to insns.  */
2886 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2888 /* A tuple of an insn of interest and the BB in which it resides. */
2889 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2890 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
/* Split basic blocks so that each forked and join unspec is at the
   start of its basic block.  Thus afterwards each block will
2894 have a single partitioning mode. We also do the same for return
2895 insns, as they are executed by every thread. Return the
2896 partitioning mode of the function as a whole. Populate MAP with
2897 head and tail blocks. We also clear the BB visited flag, which is
2898 used when finding partitions. */
2900 static void
2901 nvptx_split_blocks (bb_insn_map_t *map)
2903 insn_bb_vec_t worklist;
2904 basic_block block;
2905 rtx_insn *insn;
2907 /* Locate all the reorg instructions of interest. */
2908 FOR_ALL_BB_FN (block, cfun)
2910 bool seen_insn = false;
      /* Clear visited flag, for use by parallel locator.  */
2913 block->flags &= ~BB_VISITED;
2915 FOR_BB_INSNS (block, insn)
2917 if (!INSN_P (insn))
2918 continue;
2919 switch (recog_memoized (insn))
2921 default:
2922 seen_insn = true;
2923 continue;
2924 case CODE_FOR_nvptx_forked:
2925 case CODE_FOR_nvptx_join:
2926 break;
2928 case CODE_FOR_return:
2929 /* We also need to split just before return insns, as
2930 that insn needs executing by all threads, but the
2931 block it is in probably does not. */
2932 break;
2935 if (seen_insn)
2936 /* We've found an instruction that must be at the start of
2937 a block, but isn't. Add it to the worklist. */
2938 worklist.safe_push (insn_bb_t (insn, block));
2939 else
2940 /* It was already the first instruction. Just add it to
2941 the map. */
2942 map->get_or_insert (block) = insn;
2943 seen_insn = true;
2947 /* Split blocks on the worklist. */
2948 unsigned ix;
2949 insn_bb_t *elt;
2950 basic_block remap = 0;
2951 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2953 if (remap != elt->second)
2955 block = elt->second;
2956 remap = block;
      /* Split block before insn.  The insn is in the new block.  */
2960 edge e = split_block (block, PREV_INSN (elt->first));
2962 block = e->dest;
2963 map->get_or_insert (block) = elt->first;
2967 /* BLOCK is a basic block containing a head or tail instruction.
2968 Locate the associated prehead or pretail instruction, which must be
2969 in the single predecessor block. */
2971 static rtx_insn *
2972 nvptx_discover_pre (basic_block block, int expected)
2974 gcc_assert (block->preds->length () == 1);
2975 basic_block pre_block = (*block->preds)[0]->src;
2976 rtx_insn *pre_insn;
2978 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2979 pre_insn = PREV_INSN (pre_insn))
2980 gcc_assert (pre_insn != BB_HEAD (pre_block));
2982 gcc_assert (recog_memoized (pre_insn) == expected);
2983 return pre_insn;
2986 /* Dump this parallel and all its inner parallels. */
2988 static void
2989 nvptx_dump_pars (parallel *par, unsigned depth)
2991 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2992 depth, par->mask,
2993 par->forked_block ? par->forked_block->index : -1,
2994 par->join_block ? par->join_block->index : -1);
2996 fprintf (dump_file, " blocks:");
2998 basic_block block;
2999 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3000 fprintf (dump_file, " %d", block->index);
3001 fprintf (dump_file, "\n");
3002 if (par->inner)
3003 nvptx_dump_pars (par->inner, depth + 1);
3005 if (par->next)
3006 nvptx_dump_pars (par->next, depth);
3009 /* If BLOCK contains a fork/join marker, process it to create or
3010 terminate a loop structure. Add this block to the current loop,
3011 and then walk successor blocks. */
3013 static parallel *
3014 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3016 if (block->flags & BB_VISITED)
3017 return par;
3018 block->flags |= BB_VISITED;
3020 if (rtx_insn **endp = map->get (block))
3022 rtx_insn *end = *endp;
3024 /* This is a block head or tail, or return instruction. */
3025 switch (recog_memoized (end))
3027 case CODE_FOR_return:
3028 /* Return instructions are in their own block, and we
3029 don't need to do anything more. */
3030 return par;
3032 case CODE_FOR_nvptx_forked:
3033 /* Loop head, create a new inner loop and add it into
3034 our parent's child list. */
3036 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3038 gcc_assert (mask);
3039 par = new parallel (par, mask);
3040 par->forked_block = block;
3041 par->forked_insn = end;
3042 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3043 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3044 par->fork_insn
3045 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3047 break;
3049 case CODE_FOR_nvptx_join:
3050 /* A loop tail. Finish the current loop and return to
3051 parent. */
3053 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3055 gcc_assert (par->mask == mask);
3056 par->join_block = block;
3057 par->join_insn = end;
3058 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3059 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3060 par->joining_insn
3061 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3062 par = par->parent;
3064 break;
3066 default:
3067 gcc_unreachable ();
3071 if (par)
3072 /* Add this block onto the current loop's list of blocks. */
3073 par->blocks.safe_push (block);
3074 else
3075 /* This must be the entry block. Create a NULL parallel. */
3076 par = new parallel (0, 0);
3078 /* Walk successor blocks. */
3079 edge e;
3080 edge_iterator ei;
3082 FOR_EACH_EDGE (e, ei, block->succs)
3083 nvptx_find_par (map, par, e->dest);
3085 return par;
3088 /* DFS walk the CFG looking for fork & join markers. Construct
3089 loop structures as we go. MAP is a mapping of basic blocks
3090 to head & tail markers, discovered when splitting blocks. This
3091 speeds up the discovery. We rely on the BB visited flag having
3092 been cleared when splitting blocks. */
3094 static parallel *
3095 nvptx_discover_pars (bb_insn_map_t *map)
3097 basic_block block;
3099 /* Mark exit blocks as visited. */
3100 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3101 block->flags |= BB_VISITED;
3103 /* And entry block as not. */
3104 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3105 block->flags &= ~BB_VISITED;
3107 parallel *par = nvptx_find_par (map, 0, block);
3109 if (dump_file)
3111 fprintf (dump_file, "\nLoops\n");
3112 nvptx_dump_pars (par, 0);
3113 fprintf (dump_file, "\n");
3116 return par;
3119 /* Analyse a group of BBs within a partitioned region and create N
3120 Single-Entry-Single-Exit regions. Some of those regions will be
3121 trivial ones consisting of a single BB. The blocks of a
3122 partitioned region might form a set of disjoint graphs -- because
   the region encloses a differently partitioned sub-region.
3125 We use the linear time algorithm described in 'Finding Regions Fast:
   Single Entry Single Exit and Control Regions in Linear Time'
3127 Johnson, Pearson & Pingali. That algorithm deals with complete
3128 CFGs, where a back edge is inserted from END to START, and thus the
3129 problem becomes one of finding equivalent loops.
3131 In this case we have a partial CFG. We complete it by redirecting
3132 any incoming edge to the graph to be from an arbitrary external BB,
3133 and similarly redirecting any outgoing edge to be to that BB.
3134 Thus we end up with a closed graph.
3136 The algorithm works by building a spanning tree of an undirected
3137 graph and keeping track of back edges from nodes further from the
3138 root in the tree to nodes nearer to the root in the tree. In the
3139 description below, the root is up and the tree grows downwards.
3141 We avoid having to deal with degenerate back-edges to the same
3142 block, by splitting each BB into 3 -- one for input edges, one for
3143 the node itself and one for the output edges. Such back edges are
3144 referred to as 'Brackets'. Cycle equivalent nodes will have the
3145 same set of brackets.
3147 Determining bracket equivalency is done by maintaining a list of
3148 brackets in such a manner that the list length and final bracket
3149 uniquely identify the set.
3151 We use coloring to mark all BBs with cycle equivalency with the
3152 same color. This is the output of the 'Finding Regions Fast'
3153 algorithm. Notice it doesn't actually find the set of nodes within
   a particular region, just unordered sets of nodes that are the
3155 entries and exits of SESE regions.
3157 After determining cycle equivalency, we need to find the minimal
3158 set of SESE regions. Do this with a DFS coloring walk of the
3159 complete graph. We're either 'looking' or 'coloring'. When
3160 looking, and we're in the subgraph, we start coloring the color of
3161 the current node, and remember that node as the start of the
3162 current color's SESE region. Every time we go to a new node, we
   decrement the count of nodes with that color.  If it reaches zero,
3164 we remember that node as the end of the current color's SESE region
3165 and return to 'looking'. Otherwise we color the node the current
3166 color.
3168 This way we end up with coloring the inside of non-trivial SESE
3169 regions with the color of that region. */
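/* A small invented example: in a diamond CFG A->B, A->C, B->D, C->D,
   completed with a back edge D->A, A and D end up with matching
   bracket sets and hence one color, delimiting a single SESE region,
   while B and C each form trivial single-block regions.  */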
3171 /* A pair of BBs. We use this to represent SESE regions. */
3172 typedef std::pair<basic_block, basic_block> bb_pair_t;
3173 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3175 /* A node in the undirected CFG. The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
3177 typedef std::pair<basic_block, int> pseudo_node_t;
3179 /* A bracket indicates an edge towards the root of the spanning tree of the
3180 undirected graph. Each bracket has a color, determined
   from the current set of brackets.  */
3182 struct bracket
3184 pseudo_node_t back; /* Back target */
3186 /* Current color and size of set. */
3187 unsigned color;
3188 unsigned size;
3190 bracket (pseudo_node_t back_)
3191 : back (back_), color (~0u), size (~0u)
3195 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3197 if (length != size)
3199 size = length;
3200 color = color_counts.length ();
3201 color_counts.quick_push (0);
3203 color_counts[color]++;
3204 return color;
3208 typedef auto_vec<bracket> bracket_vec_t;
3210 /* Basic block info for finding SESE regions. */
3212 struct bb_sese
3214 int node; /* Node number in spanning tree. */
3215 int parent; /* Parent node number. */
3217 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3218 edges arrive at pseudo-node Ai and the outgoing edges leave at
3219 pseudo-node Ao. We have to remember which way we arrived at a
3220 particular node when generating the spanning tree. dir > 0 means
3221 we arrived at Ai, dir < 0 means we arrived at Ao. */
3222 int dir;
  /* Lowest numbered pseudo-node reached via a backedge from this
3225 node, or any descendant. */
3226 pseudo_node_t high;
3228 int color; /* Cycle-equivalence color */
3230 /* Stack of brackets for this node. */
3231 bracket_vec_t brackets;
3233 bb_sese (unsigned node_, unsigned p, int dir_)
3234 :node (node_), parent (p), dir (dir_)
3237 ~bb_sese ();
3239 /* Push a bracket ending at BACK. */
3240 void push (const pseudo_node_t &back)
3242 if (dump_file)
3243 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3244 back.first ? back.first->index : 0, back.second);
3245 brackets.safe_push (bracket (back));
3248 void append (bb_sese *child);
3249 void remove (const pseudo_node_t &);
3251 /* Set node's color. */
3252 void set_color (auto_vec<unsigned> &color_counts)
3254 color = brackets.last ().get_color (color_counts, brackets.length ());
3258 bb_sese::~bb_sese ()
3262 /* Destructively append CHILD's brackets. */
3264 void
3265 bb_sese::append (bb_sese *child)
3267 if (int len = child->brackets.length ())
3269 int ix;
3271 if (dump_file)
3273 for (ix = 0; ix < len; ix++)
3275 const pseudo_node_t &pseudo = child->brackets[ix].back;
3276 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3277 child->node, pseudo.first ? pseudo.first->index : 0,
3278 pseudo.second);
3281 if (!brackets.length ())
3282 std::swap (brackets, child->brackets);
3283 else
3285 brackets.reserve (len);
3286 for (ix = 0; ix < len; ix++)
3287 brackets.quick_push (child->brackets[ix]);
3292 /* Remove brackets that terminate at PSEUDO. */
3294 void
3295 bb_sese::remove (const pseudo_node_t &pseudo)
3297 unsigned removed = 0;
3298 int len = brackets.length ();
3300 for (int ix = 0; ix < len; ix++)
3302 if (brackets[ix].back == pseudo)
3304 if (dump_file)
3305 fprintf (dump_file, "Removing backedge %d:%+d\n",
3306 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3307 removed++;
3309 else if (removed)
3310 brackets[ix-removed] = brackets[ix];
3312 while (removed--)
3313 brackets.pop ();
3316 /* Accessors for BB's aux pointer. */
3317 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3318 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3320 /* DFS walk creating SESE data structures. Only cover nodes with
3321 BB_VISITED set. Append discovered blocks to LIST. We number in
3322 increments of 3 so that the above and below pseudo nodes can be
3323 implicitly numbered too. */
3325 static int
3326 nvptx_sese_number (int n, int p, int dir, basic_block b,
3327 auto_vec<basic_block> *list)
3329 if (BB_GET_SESE (b))
3330 return n;
3332 if (dump_file)
3333 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3334 b->index, n, p, dir);
3336 BB_SET_SESE (b, new bb_sese (n, p, dir));
3337 p = n;
3339 n += 3;
3340 list->quick_push (b);
3342 /* First walk the nodes on the 'other side' of this node, then walk
3343 the nodes on the same side. */
3344 for (unsigned ix = 2; ix; ix--)
3346 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3347 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3348 : offsetof (edge_def, src));
3349 edge e;
      edge_iterator ei;
3352 FOR_EACH_EDGE (e, ei, edges)
3354 basic_block target = *(basic_block *)((char *)e + offset);
3356 if (target->flags & BB_VISITED)
3357 n = nvptx_sese_number (n, p, dir, target, list);
3359 dir = -dir;
3361 return n;
3364 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3365 EDGES are the outgoing edges and OFFSET is the offset to the src
3366 or dst block on the edges. */
3368 static void
3369 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3370 vec<edge, va_gc> *edges, size_t offset)
3372 edge e;
  edge_iterator ei;
3374 int hi_back = depth;
3375 pseudo_node_t node_back (0, depth);
3376 int hi_child = depth;
3377 pseudo_node_t node_child (0, depth);
3378 basic_block child = NULL;
3379 unsigned num_children = 0;
3380 int usd = -dir * sese->dir;
3382 if (dump_file)
3383 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3384 me->index, sese->node, dir);
3386 if (dir < 0)
3388 /* This is the above pseudo-child. It has the BB itself as an
3389 additional child node. */
3390 node_child = sese->high;
3391 hi_child = node_child.second;
3392 if (node_child.first)
3393 hi_child += BB_GET_SESE (node_child.first)->node;
3394 num_children++;
3397 /* Examine each edge.
3398 - if it is a child (a) append its bracket list and (b) record
3399 whether it is the child with the highest reaching bracket.
     - if it is an edge to an ancestor, record whether it's the highest
3401 reaching backlink. */
3402 FOR_EACH_EDGE (e, ei, edges)
3404 basic_block target = *(basic_block *)((char *)e + offset);
3406 if (bb_sese *t_sese = BB_GET_SESE (target))
3408 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3410 /* Child node. Append its bracket list. */
3411 num_children++;
3412 sese->append (t_sese);
	  /* Compare its hi value.  */
3415 int t_hi = t_sese->high.second;
3417 if (basic_block child_hi_block = t_sese->high.first)
3418 t_hi += BB_GET_SESE (child_hi_block)->node;
3420 if (hi_child > t_hi)
3422 hi_child = t_hi;
3423 node_child = t_sese->high;
3424 child = target;
3427 else if (t_sese->node < sese->node + dir
3428 && !(dir < 0 && sese->parent == t_sese->node))
3430 /* Non-parental ancestor node -- a backlink. */
3431 int d = usd * t_sese->dir;
3432 int back = t_sese->node + d;
3434 if (hi_back > back)
3436 hi_back = back;
3437 node_back = pseudo_node_t (target, d);
3441 else
3442 { /* Fallen off graph, backlink to entry node. */
3443 hi_back = 0;
3444 node_back = pseudo_node_t (0, 0);
3448 /* Remove any brackets that terminate at this pseudo node. */
3449 sese->remove (pseudo_node_t (me, dir));
3451 /* Now push any backlinks from this pseudo node. */
3452 FOR_EACH_EDGE (e, ei, edges)
3454 basic_block target = *(basic_block *)((char *)e + offset);
3455 if (bb_sese *t_sese = BB_GET_SESE (target))
3457 if (t_sese->node < sese->node + dir
3458 && !(dir < 0 && sese->parent == t_sese->node))
3459 /* Non-parental ancestor node - backedge from me. */
3460 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3462 else
	/* Back edge to entry node.  */
3465 sese->push (pseudo_node_t (0, 0));
3469 /* If this node leads directly or indirectly to a no-return region of
3470 the graph, then fake a backedge to entry node. */
3471 if (!sese->brackets.length () || !edges || !edges->length ())
3473 hi_back = 0;
3474 node_back = pseudo_node_t (0, 0);
3475 sese->push (node_back);
3478 /* Record the highest reaching backedge from us or a descendant. */
3479 sese->high = hi_back < hi_child ? node_back : node_child;
3481 if (num_children > 1)
3483 /* There is more than one child -- this is a Y shaped piece of
3484 spanning tree. We have to insert a fake backedge from this
3485 node to the highest ancestor reached by not-the-highest
3486 reaching child. Note that there may be multiple children
3487 with backedges to the same highest node. That's ok and we
3488 insert the edge to that highest node. */
3489 hi_child = depth;
3490 if (dir < 0 && child)
3492 node_child = sese->high;
3493 hi_child = node_child.second;
3494 if (node_child.first)
3495 hi_child += BB_GET_SESE (node_child.first)->node;
3498 FOR_EACH_EDGE (e, ei, edges)
3500 basic_block target = *(basic_block *)((char *)e + offset);
3502 if (target == child)
3503 /* Ignore the highest child. */
3504 continue;
3506 bb_sese *t_sese = BB_GET_SESE (target);
3507 if (!t_sese)
3508 continue;
3509 if (t_sese->parent != sese->node)
3510 /* Not a child. */
3511 continue;
3513 /* Compare its hi value. */
3514 int t_hi = t_sese->high.second;
3516 if (basic_block child_hi_block = t_sese->high.first)
3517 t_hi += BB_GET_SESE (child_hi_block)->node;
3519 if (hi_child > t_hi)
3521 hi_child = t_hi;
3522 node_child = t_sese->high;
3526 sese->push (node_child);
3531 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3532 proceed to successors. Set SESE entry and exit nodes of
3533 REGIONS. */
3535 static void
3536 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3537 basic_block block, int coloring)
3539 bb_sese *sese = BB_GET_SESE (block);
3541 if (block->flags & BB_VISITED)
3543 /* If we've already encountered this block, either we must not
3544 be coloring, or it must have been colored the current color. */
3545 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3546 return;
3549 block->flags |= BB_VISITED;
3551 if (sese)
3553 if (coloring < 0)
3555 /* Start coloring a region. */
3556 regions[sese->color].first = block;
3557 coloring = sese->color;
3560 if (!--color_counts[sese->color] && sese->color == coloring)
3562 /* Found final block of SESE region. */
3563 regions[sese->color].second = block;
3564 coloring = -1;
3566 else
3567 /* Color the node, so we can assert on revisiting the node
3568 that the graph is indeed SESE. */
3569 sese->color = coloring;
3571 else
3572 /* Fallen off the subgraph, we cannot be coloring. */
3573 gcc_assert (coloring < 0);
3575 /* Walk each successor block. */
3576 if (block->succs && block->succs->length ())
3578 edge e;
3579 edge_iterator ei;
3581 FOR_EACH_EDGE (e, ei, block->succs)
3582 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3584 else
3585 gcc_assert (coloring < 0);
3588 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3589 end up with NULL entries in it. */
3591 static void
3592 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3594 basic_block block;
3595 int ix;
3597 /* First clear each BB of the whole function. */
3598 FOR_ALL_BB_FN (block, cfun)
3600 block->flags &= ~BB_VISITED;
3601 BB_SET_SESE (block, 0);
3604 /* Mark blocks in the function that are in this graph. */
3605 for (ix = 0; blocks.iterate (ix, &block); ix++)
3606 block->flags |= BB_VISITED;
3608 /* Counts of nodes assigned to each color. There cannot be more
3609 colors than blocks (and hopefully there will be fewer). */
3610 auto_vec<unsigned> color_counts;
3611 color_counts.reserve (blocks.length ());
3613 /* Worklist of nodes in the spanning tree. Again, there cannot be
3614 more nodes in the tree than blocks (there will be fewer if the
3615 CFG of blocks is disjoint). */
3616 auto_vec<basic_block> spanlist;
3617 spanlist.reserve (blocks.length ());
3619 /* Make sure every block has its cycle class determined. */
3620 for (ix = 0; blocks.iterate (ix, &block); ix++)
3622 if (BB_GET_SESE (block))
3623 /* We already met this block in an earlier graph solve. */
3624 continue;
3626 if (dump_file)
3627 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
      /* Number the nodes reachable from BLOCK in initial DFS order.  */
3630 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3632 /* Now walk in reverse DFS order to find cycle equivalents. */
3633 while (spanlist.length ())
3635 block = spanlist.pop ();
3636 bb_sese *sese = BB_GET_SESE (block);
3638 /* Do the pseudo node below. */
3639 nvptx_sese_pseudo (block, sese, depth, +1,
3640 sese->dir > 0 ? block->succs : block->preds,
3641 (sese->dir > 0 ? offsetof (edge_def, dest)
3642 : offsetof (edge_def, src)));
3643 sese->set_color (color_counts);
3644 /* Do the pseudo node above. */
3645 nvptx_sese_pseudo (block, sese, depth, -1,
3646 sese->dir < 0 ? block->succs : block->preds,
3647 (sese->dir < 0 ? offsetof (edge_def, dest)
3648 : offsetof (edge_def, src)));
3650 if (dump_file)
3651 fprintf (dump_file, "\n");
3654 if (dump_file)
3656 unsigned count;
3657 const char *comma = "";
3659 fprintf (dump_file, "Found %d cycle equivalents\n",
3660 color_counts.length ());
3661 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3663 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3665 comma = "";
3666 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3667 if (BB_GET_SESE (block)->color == ix)
3669 block->flags |= BB_VISITED;
3670 fprintf (dump_file, "%s%d", comma, block->index);
	      comma = ",";
3673 fprintf (dump_file, "}");
3674 comma = ", ";
3676 fprintf (dump_file, "\n");
3679 /* Now we've colored every block in the subgraph. We now need to
3680 determine the minimal set of SESE regions that cover that
3681 subgraph. Do this with a DFS walk of the complete function.
3682 During the walk we're either 'looking' or 'coloring'. When we
3683 reach the last node of a particular color, we stop coloring and
3684 return to looking. */
3686 /* There cannot be more SESE regions than colors. */
3687 regions.reserve (color_counts.length ());
3688 for (ix = color_counts.length (); ix--;)
3689 regions.quick_push (bb_pair_t (0, 0));
3691 for (ix = 0; blocks.iterate (ix, &block); ix++)
3692 block->flags &= ~BB_VISITED;
3694 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3696 if (dump_file)
3698 const char *comma = "";
3699 int len = regions.length ();
3701 fprintf (dump_file, "SESE regions:");
3702 for (ix = 0; ix != len; ix++)
3704 basic_block from = regions[ix].first;
3705 basic_block to = regions[ix].second;
3707 if (from)
3709 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3710 if (to != from)
3711 fprintf (dump_file, "->%d", to->index);
3713 int color = BB_GET_SESE (from)->color;
3715 /* Print the blocks within the region (excluding ends). */
3716 FOR_EACH_BB_FN (block, cfun)
3718 bb_sese *sese = BB_GET_SESE (block);
3720 if (sese && sese->color == color
3721 && block != from && block != to)
3722 fprintf (dump_file, ".%d", block->index);
3724 fprintf (dump_file, "}");
3726 comma = ",";
3728 fprintf (dump_file, "\n\n");
3731 for (ix = 0; blocks.iterate (ix, &block); ix++)
3732 delete BB_GET_SESE (block);
3735 #undef BB_SET_SESE
3736 #undef BB_GET_SESE
3738 /* Propagate live state at the start of a partitioned region. BLOCK
3739 provides the live register information, and might not contain
3740 INSN. Propagation is inserted just after INSN. RW indicates whether
   we are reading and/or writing state.  This separation is needed
   for worker-level propagation where we essentially do a spill &
   fill.  FN is the underlying worker function to generate the
   propagation instructions for a single register.  DATA is user
   data.
3747 We propagate the live register set and the entire frame. We could
3748 do better by (a) propagating just the live set that is used within
3749 the partitioned regions and (b) only propagating stack entries that
3750 are used. The latter might be quite hard to determine. */
3752 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
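/* A rough sketch of the frame-copying loop generated below, for a
   frame of FS > 1 DImode slots (FN is the propagator callback):

	ptr = frame_pointer;
	idx = FS;
     loop:
	idx -= 1;
	tmp = *ptr;	// if RW includes PM_read
	fn (tmp, ...);	// spill, fill or broadcast TMP
	*ptr = tmp;	// if RW includes PM_write
	ptr += 8;
	if (idx != 0) goto loop;  */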
3754 static void
3755 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3756 propagator_fn fn, void *data)
3758 bitmap live = DF_LIVE_IN (block);
3759 bitmap_iterator iterator;
3760 unsigned ix;
3762 /* Copy the frame array. */
3763 HOST_WIDE_INT fs = get_frame_size ();
3764 if (fs)
3766 rtx tmp = gen_reg_rtx (DImode);
3767 rtx idx = NULL_RTX;
3768 rtx ptr = gen_reg_rtx (Pmode);
3769 rtx pred = NULL_RTX;
3770 rtx_code_label *label = NULL;
3772 /* The frame size might not be DImode compatible, but the frame
3773 array's declaration will be. So it's ok to round up here. */
3774 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3775 /* Detect single iteration loop. */
3776 if (fs == 1)
3777 fs = 0;
3779 start_sequence ();
3780 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3781 if (fs)
3783 idx = gen_reg_rtx (SImode);
3784 pred = gen_reg_rtx (BImode);
3785 label = gen_label_rtx ();
3787 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3788 /* Allow worker function to initialize anything needed. */
3789 rtx init = fn (tmp, PM_loop_begin, fs, data);
3790 if (init)
3791 emit_insn (init);
3792 emit_label (label);
3793 LABEL_NUSES (label)++;
3794 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3796 if (rw & PM_read)
3797 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3798 emit_insn (fn (tmp, rw, fs, data));
3799 if (rw & PM_write)
3800 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3801 if (fs)
3803 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3804 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3805 emit_insn (gen_br_true_uni (pred, label));
3806 rtx fini = fn (tmp, PM_loop_end, fs, data);
3807 if (fini)
3808 emit_insn (fini);
3809 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3811 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3812 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3813 rtx cpy = get_insns ();
3814 end_sequence ();
3815 insn = emit_insn_after (cpy, insn);
3818 /* Copy live registers. */
3819 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3821 rtx reg = regno_reg_rtx[ix];
3823 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3825 rtx bcast = fn (reg, rw, 0, data);
3827 insn = emit_insn_after (bcast, insn);
3832 /* Worker for nvptx_vpropagate. */
3834 static rtx
3835 vprop_gen (rtx reg, propagate_mask pm,
3836 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3838 if (!(pm & PM_read_write))
3839 return 0;
3841 return nvptx_gen_vcast (reg);
3844 /* Propagate state that is live at start of BLOCK across the vectors
3845 of a single warp. Propagation is inserted just after INSN. */
3847 static void
3848 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3850 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3853 /* Worker for nvptx_wpropagate. */
3855 static rtx
3856 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3858 wcast_data_t *data = (wcast_data_t *)data_;
3860 if (pm & PM_loop_begin)
3862 /* Starting a loop, initialize pointer. */
3863 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3865 if (align > worker_bcast_align)
3866 worker_bcast_align = align;
3867 data->offset = (data->offset + align - 1) & ~(align - 1);
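      /* E.g. an offset of 5 with align 4 rounds up to 8:
	 (5 + 3) & ~3 == 8.  */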
3869 data->ptr = gen_reg_rtx (Pmode);
3871 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3873 else if (pm & PM_loop_end)
3875 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3876 data->ptr = NULL_RTX;
3877 return clobber;
3879 else
3880 return nvptx_gen_wcast (reg, pm, rep, data);
3883 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3884 indicates if this is just before partitioned mode (do spill), or
3885 just after it starts (do fill). Sequence is inserted just after
3886 INSN. */
3888 static void
3889 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3891 wcast_data_t data;
3893 data.base = gen_reg_rtx (Pmode);
3894 data.offset = 0;
3895 data.ptr = NULL_RTX;
3897 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3898 if (data.offset)
3900 /* Stuff was emitted, initialize the base pointer now. */
3901 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3902 emit_insn_after (init, insn);
3904 if (worker_bcast_size < data.offset)
3905 worker_bcast_size = data.offset;
3909 /* Emit a worker-level synchronization barrier. We use different
3910 markers for before and after synchronizations. */
3912 static rtx
3913 nvptx_wsync (bool after)
3915 return gen_nvptx_barsync (GEN_INT (after));
3918 #if WORKAROUND_PTXJIT_BUG
3919 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3920 real insns. */
3922 static rtx_insn *
3923 bb_first_real_insn (basic_block bb)
3925 rtx_insn *insn;
  /* Find the first real insn in BB.  */
3928 FOR_BB_INSNS (bb, insn)
3929 if (INSN_P (insn))
3930 return insn;
3932 return 0;
3934 #endif
3936 /* Single neutering according to MASK. FROM is the incoming block and
3937 TO is the outgoing block. These may be the same block. Insert at
3938 start of FROM:
3940 if (tid.<axis>) goto end.
3942 and insert before ending branch of TO (if there is such an insn):
3944 end:
3945 <possibly-broadcast-cond>
3946 <branch>
   We currently only use different FROM and TO when skipping an entire
3949 loop. We could do more if we detected superblocks. */
3951 static void
3952 nvptx_single (unsigned mask, basic_block from, basic_block to)
3954 rtx_insn *head = BB_HEAD (from);
3955 rtx_insn *tail = BB_END (to);
3956 unsigned skip_mask = mask;
3958 while (true)
3960 /* Find first insn of from block. */
3961 while (head != BB_END (from) && !INSN_P (head))
3962 head = NEXT_INSN (head);
3964 if (from == to)
3965 break;
3967 if (!(JUMP_P (head) && single_succ_p (from)))
3968 break;
3970 basic_block jump_target = single_succ (from);
3971 if (!single_pred_p (jump_target))
3972 break;
3974 from = jump_target;
3975 head = BB_HEAD (from);
3978 /* Find last insn of to block */
3979 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3980 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3981 tail = PREV_INSN (tail);
3983 /* Detect if tail is a branch. */
3984 rtx tail_branch = NULL_RTX;
3985 rtx cond_branch = NULL_RTX;
3986 if (tail && INSN_P (tail))
3988 tail_branch = PATTERN (tail);
3989 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3990 tail_branch = NULL_RTX;
3991 else
3993 cond_branch = SET_SRC (tail_branch);
3994 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3995 cond_branch = NULL_RTX;
3999 if (tail == head)
4001 /* If this is empty, do nothing. */
4002 if (!head || !INSN_P (head))
4003 return;
4005 /* If this is a dummy insn, do nothing. */
4006 switch (recog_memoized (head))
4008 default:
4009 break;
4010 case CODE_FOR_nvptx_fork:
4011 case CODE_FOR_nvptx_forked:
4012 case CODE_FOR_nvptx_joining:
4013 case CODE_FOR_nvptx_join:
4014 return;
4017 if (cond_branch)
4019 /* If we're only doing vector single, there's no need to
4020 emit skip code because we'll not insert anything. */
4021 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4022 skip_mask = 0;
4024 else if (tail_branch)
4025 /* Block with only unconditional branch. Nothing to do. */
4026 return;
4029 /* Insert the vector test inside the worker test. */
4030 unsigned mode;
4031 rtx_insn *before = tail;
4032 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4033 if (GOMP_DIM_MASK (mode) & skip_mask)
4035 rtx_code_label *label = gen_label_rtx ();
4036 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4038 if (!pred)
4040 pred = gen_reg_rtx (BImode);
4041 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4044 rtx br;
4045 if (mode == GOMP_DIM_VECTOR)
4046 br = gen_br_true (pred, label);
4047 else
4048 br = gen_br_true_uni (pred, label);
4049 emit_insn_before (br, head);
4051 LABEL_NUSES (label)++;
4052 if (tail_branch)
4053 before = emit_label_before (label, before);
4054 else
4055 emit_label_after (label, tail);
4058 /* Now deal with propagating the branch condition. */
4059 if (cond_branch)
4061 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4063 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4065 /* Vector mode only, do a shuffle. */
4066 #if WORKAROUND_PTXJIT_BUG
4067 /* The branch condition %rcond is propagated like this:
4070 .reg .u32 %x;
4071 mov.u32 %x,%tid.x;
4072 setp.ne.u32 %rnotvzero,%x,0;
4075 @%rnotvzero bra Lskip;
4076 setp.<op>.<type> %rcond,op1,op2;
4077 Lskip:
4078 selp.u32 %rcondu32,1,0,%rcond;
4079 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4080 setp.ne.u32 %rcond,%rcondu32,0;
4082 There seems to be a bug in the ptx JIT compiler (observed at driver
4083 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4084 unless %rcond is initialized to something before 'bra Lskip'. The
4085 bug is not observed with ptxas from cuda 8.0.61.
4087 It is true that the code is non-trivial: at Lskip, %rcond is
4088 uninitialized in threads 1-31, and after the selp the same holds
4089 for %rcondu32. But shfl propagates the defined value in thread 0
4090 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4091 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4093 There is nothing in the PTX spec to suggest that this is wrong, or
4094 to explain why the extra initialization is needed. So, we classify
4095 it as a JIT bug, and the extra initialization as workaround. */
4096 emit_insn_before (gen_movbi (pvar, const0_rtx),
4097 bb_first_real_insn (from));
4098 #endif
4099 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4101 else
4103 /* Includes worker mode, do spill & fill. By construction
4104 we should never have worker mode only. */
4105 wcast_data_t data;
4107 data.base = worker_bcast_sym;
4108 data.ptr = 0;
4110 if (worker_bcast_size < GET_MODE_SIZE (SImode))
4111 worker_bcast_size = GET_MODE_SIZE (SImode);
4113 data.offset = 0;
4114 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4115 before);
4116 /* Barrier so other workers can see the write. */
4117 emit_insn_before (nvptx_wsync (false), tail);
4118 data.offset = 0;
4119 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4120 /* This barrier is needed to avoid worker zero clobbering
4121 the broadcast buffer before all the other workers have
4122 had a chance to read this instance of it. */
4123 emit_insn_before (nvptx_wsync (true), tail);
4126 extract_insn (tail);
4127 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4128 UNSPEC_BR_UNIFIED);
4129 validate_change (tail, recog_data.operand_loc[0], unsp, false);
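/* Illustrative expansion (pseudo-code, not emitted verbatim) of the
   skip/broadcast scheme realized above, for worker-level neutering of
   a block ending in a conditional branch:

     if (tid.y) goto end;       // worker axis; only worker zero runs
       ...single-mode code...
       cond = <compare>;
     end:
       <broadcast cond from worker zero to all workers>
     if (cond) goto target;     // the branch is now uniform

   Vector mode broadcasts COND with a shuffle; worker mode spills it
   to the .shared buffer between two barriers, as emitted above.  */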
4133 /* PAR is a parallel that is being skipped in its entirety according to
4134 MASK. Treat this as skipping a superblock starting at forked
4135 and ending at joining. */
4137 static void
4138 nvptx_skip_par (unsigned mask, parallel *par)
4140 basic_block tail = par->join_block;
4141 gcc_assert (tail->preds->length () == 1);
4143 basic_block pre_tail = (*tail->preds)[0]->src;
4144 gcc_assert (pre_tail->succs->length () == 1);
4146 nvptx_single (mask, par->forked_block, pre_tail);
4149 /* If PAR has a single inner parallel and PAR itself only contains
4150 empty entry and exit blocks, swallow the inner PAR. */
4152 static void
4153 nvptx_optimize_inner (parallel *par)
4155 parallel *inner = par->inner;
4157 /* We mustn't be the outer dummy par. */
4158 if (!par->mask)
4159 return;
4161 /* We must have a single inner par. */
4162 if (!inner || inner->next)
4163 return;
4165 /* We must only contain 2 blocks ourselves -- the head and tail of
4166 the inner par. */
4167 if (par->blocks.length () != 2)
4168 return;
4170 /* We must be disjoint partitioning. As we only have vector and
4171 worker partitioning, this is sufficient to guarantee the pars
4172 have adjacent partitioning. */
4173 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4174 /* This indicates malformed code generation. */
4175 return;
4177 /* The outer forked insn should be immediately followed by the inner
4178 fork insn. */
4179 rtx_insn *forked = par->forked_insn;
4180 rtx_insn *fork = BB_END (par->forked_block);
4182 if (NEXT_INSN (forked) != fork)
4183 return;
4184 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4186 /* The outer joining insn must immediately follow the inner join
4187 insn. */
4188 rtx_insn *joining = par->joining_insn;
4189 rtx_insn *join = inner->join_insn;
4190 if (NEXT_INSN (join) != joining)
4191 return;
4193 /* Preconditions met. Swallow the inner par. */
4194 if (dump_file)
4195 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4196 inner->mask, inner->forked_block->index,
4197 inner->join_block->index,
4198 par->mask, par->forked_block->index, par->join_block->index);
4200 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4202 par->blocks.reserve (inner->blocks.length ());
4203 while (inner->blocks.length ())
4204 par->blocks.quick_push (inner->blocks.pop ());
4206 par->inner = inner->inner;
4207 inner->inner = NULL;
4209 delete inner;
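/* For example: a worker-partitioned par (mask ==
   GOMP_DIM_MASK (GOMP_DIM_WORKER)) whose sole inner par is
   vector-partitioned (mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)) has
   disjoint masks, so the check above passes and the merged par gets
   mask == GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR),
   letting the later neutering handle both levels at once.  */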
4212 /* Process the parallel PAR and all its contained
4213 parallels. We do everything but the neutering. Return mask of
4214 partitioned modes used within this parallel. */
4216 static unsigned
4217 nvptx_process_pars (parallel *par)
4219 if (nvptx_optimize)
4220 nvptx_optimize_inner (par);
4222 unsigned inner_mask = par->mask;
4224 /* Do the inner parallels first. */
4225 if (par->inner)
4227 par->inner_mask = nvptx_process_pars (par->inner);
4228 inner_mask |= par->inner_mask;
4231 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
4232 /* No propagation needed for a call. */;
4233 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4235 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
4236 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
4237 /* Insert begin and end synchronizations. */
4238 emit_insn_after (nvptx_wsync (false), par->forked_insn);
4239 emit_insn_before (nvptx_wsync (true), par->joining_insn);
4241 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4242 nvptx_vpropagate (par->forked_block, par->forked_insn);
4244 /* Now do siblings. */
4245 if (par->next)
4246 inner_mask |= nvptx_process_pars (par->next);
4247 return inner_mask;
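/* Schematically, the insns around a worker-partitioned region then
   read (PTX mnemonic shown for illustration; the actual insn is the
   nvptx_barsync pattern):

     <spill live state to the .shared buffer>    // worker zero only
     nvptx_forked
     bar.sync                                    // nvptx_wsync (false)
     <fill live state from the .shared buffer>   // all workers
       ...partitioned worker code...
     bar.sync                                    // nvptx_wsync (true)
     nvptx_joining  */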
4250 /* Neuter the parallel described by PAR. We recurse in depth-first
4251 order. MODES are the partitioning of the execution and OUTER is
4252 the partitioning of the parallels we are contained in. */
4254 static void
4255 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4257 unsigned me = (par->mask
4258 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4259 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4260 unsigned skip_mask = 0, neuter_mask = 0;
4262 if (par->inner)
4263 nvptx_neuter_pars (par->inner, modes, outer | me);
4265 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4267 if ((outer | me) & GOMP_DIM_MASK (mode))
4268 {} /* Mode is partitioned: no neutering. */
4269 else if (!(modes & GOMP_DIM_MASK (mode)))
4270 {} /* Mode is not used: nothing to do. */
4271 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4272 || !par->forked_insn)
4273 /* Partitioned in inner parallels, or we're not partitioned
4274 at all: neuter individual blocks. */
4275 neuter_mask |= GOMP_DIM_MASK (mode);
4276 else if (!par->parent || !par->parent->forked_insn
4277 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4278 /* Parent isn't a parallel, or already contains this partitioning:
4279 skip the parallel at this level. */
4280 skip_mask |= GOMP_DIM_MASK (mode);
4281 else
4282 {} /* Parent will skip this parallel itself. */
4285 if (neuter_mask)
4287 int ix, len;
4289 if (nvptx_optimize)
4291 /* Neuter whole SESE regions. */
4292 bb_pair_vec_t regions;
4294 nvptx_find_sese (par->blocks, regions);
4295 len = regions.length ();
4296 for (ix = 0; ix != len; ix++)
4298 basic_block from = regions[ix].first;
4299 basic_block to = regions[ix].second;
4301 if (from)
4302 nvptx_single (neuter_mask, from, to);
4303 else
4304 gcc_assert (!to);
4307 else
4309 /* Neuter each BB individually. */
4310 len = par->blocks.length ();
4311 for (ix = 0; ix != len; ix++)
4313 basic_block block = par->blocks[ix];
4315 nvptx_single (neuter_mask, block, block);
4320 if (skip_mask)
4321 nvptx_skip_par (skip_mask, par);
4323 if (par->next)
4324 nvptx_neuter_pars (par->next, modes, outer);
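/* Summary of the per-mode decision above, for a mode that is used (in
   MODES) and not already partitioned here or outside (OUTER | me):

     partitioned in an inner par, or no forked insn   -> neuter blocks
     no parent / parent not a true parallel /
       parent's inner pars partition this mode        -> skip whole par
     otherwise                                        -> parent skips it  */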
4327 /* PTX-specific reorganization
4328 - Split blocks at fork and join instructions
4329 - Compute live registers
4330 - Mark now-unused registers, so function begin doesn't declare
4331 unused registers.
4332 - Insert state propagation when entering partitioned mode
4333 - Insert neutering instructions when in single mode
4334 - Replace subregs with suitable sequences.
4337 static void
4338 nvptx_reorg (void)
4340 /* We are freeing block_for_insn in the toplev to keep compatibility
4341 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4342 compute_bb_for_insn ();
4344 thread_prologue_and_epilogue_insns ();
4346 /* Split blocks and record interesting unspecs. */
4347 bb_insn_map_t bb_insn_map;
4349 nvptx_split_blocks (&bb_insn_map);
4351 /* Compute live regs */
4352 df_clear_flags (DF_LR_RUN_DCE);
4353 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4354 df_live_add_problem ();
4355 df_live_set_all_dirty ();
4356 df_analyze ();
4357 regstat_init_n_sets_and_refs ();
4359 if (dump_file)
4360 df_dump (dump_file);
4362 /* Mark unused regs as unused. */
4363 int max_regs = max_reg_num ();
4364 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4365 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4366 regno_reg_rtx[i] = const0_rtx;
4368 /* Determine launch dimensions of the function. If it is not an
4369 offloaded function (i.e. this is a regular compilation), the
4370 function has no neutering. */
4371 tree attr = oacc_get_fn_attrib (current_function_decl);
4372 if (attr)
4374 /* If we determined this mask before RTL expansion, we could
4375 elide emission of some levels of forks and joins. */
4376 unsigned mask = 0;
4377 tree dims = TREE_VALUE (attr);
4378 unsigned ix;
4380 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4382 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4383 tree allowed = TREE_PURPOSE (dims);
4385 if (size != 1 && !(allowed && integer_zerop (allowed)))
4386 mask |= GOMP_DIM_MASK (ix);
4388 /* If there is worker neutering, there must be vector
4389 neutering. Otherwise the hardware will fail. */
4390 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4391 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4393 /* Discover & process partitioned regions. */
4394 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4395 nvptx_process_pars (pars);
4396 nvptx_neuter_pars (pars, mask, 0);
4397 delete pars;
4400 /* Replace subregs. */
4401 nvptx_reorg_subreg ();
4403 if (TARGET_UNIFORM_SIMT)
4404 nvptx_reorg_uniform_simt ();
4406 regstat_free_n_sets_and_refs ();
4408 df_finish_pass (true);
4411 /* Handle a "kernel" attribute; arguments as in
4412 struct attribute_spec.handler. */
4414 static tree
4415 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4416 int ARG_UNUSED (flags), bool *no_add_attrs)
4418 tree decl = *node;
4420 if (TREE_CODE (decl) != FUNCTION_DECL)
4422 error ("%qE attribute only applies to functions", name);
4423 *no_add_attrs = true;
4425 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4427 error ("%qE attribute requires a void return type", name);
4428 *no_add_attrs = true;
4431 return NULL_TREE;
4434 /* Handle a "shared" attribute; arguments as in
4435 struct attribute_spec.handler. */
4437 static tree
4438 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4439 int ARG_UNUSED (flags), bool *no_add_attrs)
4441 tree decl = *node;
4443 if (TREE_CODE (decl) != VAR_DECL)
4445 error ("%qE attribute only applies to variables", name);
4446 *no_add_attrs = true;
4448 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4450 error ("%qE attribute not allowed with auto storage class", name);
4451 *no_add_attrs = true;
4454 return NULL_TREE;
4457 /* Table of valid machine attributes. */
4458 static const struct attribute_spec nvptx_attribute_table[] =
4460 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
4461 affects_type_identity, exclusions } */
4462 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false,
4463 NULL },
4464 { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false,
4465 NULL },
4466 { NULL, 0, 0, false, false, false, NULL, false, NULL }
4469 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4471 static HOST_WIDE_INT
4472 nvptx_vector_alignment (const_tree type)
4474 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4476 return MIN (align, BIGGEST_ALIGNMENT);
4479 /* Indicate that INSN cannot be duplicated. */
4481 static bool
4482 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4484 switch (recog_memoized (insn))
4486 case CODE_FOR_nvptx_shufflesi:
4487 case CODE_FOR_nvptx_shufflesf:
4488 case CODE_FOR_nvptx_barsync:
4489 case CODE_FOR_nvptx_fork:
4490 case CODE_FOR_nvptx_forked:
4491 case CODE_FOR_nvptx_joining:
4492 case CODE_FOR_nvptx_join:
4493 return true;
4494 default:
4495 return false;
4499 /* Section anchors do not work. Initialization for flag_section_anchor
4500 probes the existence of the anchoring target hooks and prevents
4501 anchoring if they don't exist. However, we may be used with
4502 a host-side compiler that does support anchoring, and hence see
4503 the anchor flag set (as it's not recalculated). So provide an
4504 implementation denying anchoring. */
4506 static bool
4507 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4509 return false;
4512 /* Record a symbol for mkoffload to enter into the mapping table. */
4514 static void
4515 nvptx_record_offload_symbol (tree decl)
4517 switch (TREE_CODE (decl))
4519 case VAR_DECL:
4520 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4521 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4522 break;
4524 case FUNCTION_DECL:
4526 tree attr = oacc_get_fn_attrib (decl);
4527 /* OpenMP offloading does not set this attribute. */
4528 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4530 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4531 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4533 for (; dims; dims = TREE_CHAIN (dims))
4535 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4537 gcc_assert (!TREE_PURPOSE (dims));
4538 fprintf (asm_out_file, ", %#x", size);
4541 fprintf (asm_out_file, "\n");
4543 break;
4545 default:
4546 gcc_unreachable ();
4550 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4551 at the start of a file. */
4553 static void
4554 nvptx_file_start (void)
4556 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4557 fputs ("\t.version\t3.1\n", asm_out_file);
4558 fputs ("\t.target\tsm_30\n", asm_out_file);
4559 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4560 fputs ("// END PREAMBLE\n", asm_out_file);
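/* For a 64-bit compile the preamble printed above reads:

     // BEGIN PREAMBLE
	     .version	3.1
	     .target	sm_30
	     .address_size 64
     // END PREAMBLE

   (the last line prints 32 when Pmode is SImode).  */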
4563 /* Emit a declaration for a worker-level buffer in .shared memory. */
4565 static void
4566 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4568 const char *name = XSTR (sym, 0);
4570 write_var_marker (file, true, false, name);
4571 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4572 align, name, size);
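/* For example, a 256-byte buffer with 8-byte alignment produces a
   line of the form (symbol name illustrative):

     .shared .align 8 .u8 __worker_bcast[256];  */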
4575 /* Write out the function declarations we've collected and declare storage
4576 for the broadcast buffer. */
4578 static void
4579 nvptx_file_end (void)
4581 hash_table<tree_hasher>::iterator iter;
4582 tree decl;
4583 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4584 nvptx_record_fndecl (decl);
4585 fputs (func_decls.str().c_str(), asm_out_file);
4587 if (worker_bcast_size)
4588 write_worker_buffer (asm_out_file, worker_bcast_sym,
4589 worker_bcast_align, worker_bcast_size);
4591 if (worker_red_size)
4592 write_worker_buffer (asm_out_file, worker_red_sym,
4593 worker_red_align, worker_red_size);
4595 if (need_softstack_decl)
4597 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4598 /* 32 is the maximum number of warps in a block. Even though it's an
4599 external declaration, emit the array size explicitly; otherwise, it
4600 may fail at PTX JIT time if the definition is later in link order. */
4601 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4602 POINTER_SIZE);
4604 if (need_unisimt_decl)
4606 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4607 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4611 /* Expander for the shuffle builtins. */
4613 static rtx
4614 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4616 if (ignore)
4617 return target;
4619 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4620 NULL_RTX, mode, EXPAND_NORMAL);
4621 if (!REG_P (src))
4622 src = copy_to_mode_reg (mode, src);
4624 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4625 NULL_RTX, SImode, EXPAND_NORMAL);
4626 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4627 NULL_RTX, SImode, EXPAND_NORMAL);
4629 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4630 idx = copy_to_mode_reg (SImode, idx);
4632 rtx pat = nvptx_gen_shuffle (target, src, idx,
4633 (nvptx_shuffle_kind) INTVAL (op));
4634 if (pat)
4635 emit_insn (pat);
4637 return target;
4640 /* Worker reduction address expander. */
4642 static rtx
4643 nvptx_expand_worker_addr (tree exp, rtx target,
4644 machine_mode ARG_UNUSED (mode), int ignore)
4646 if (ignore)
4647 return target;
4649 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4650 if (align > worker_red_align)
4651 worker_red_align = align;
4653 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4654 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4655 if (size + offset > worker_red_size)
4656 worker_red_size = size + offset;
4658 rtx addr = worker_red_sym;
4659 if (offset)
4661 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4662 addr = gen_rtx_CONST (Pmode, addr);
4665 emit_move_insn (target, addr);
4667 return target;
4670 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4671 not require taking the address of any object, other than the memory
4672 cell being operated on. */
4674 static rtx
4675 nvptx_expand_cmp_swap (tree exp, rtx target,
4676 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4678 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4680 if (!target)
4681 target = gen_reg_rtx (mode);
4683 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4684 NULL_RTX, Pmode, EXPAND_NORMAL);
4685 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4686 NULL_RTX, mode, EXPAND_NORMAL);
4687 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4688 NULL_RTX, mode, EXPAND_NORMAL);
4689 rtx pat;
4691 mem = gen_rtx_MEM (mode, mem);
4692 if (!REG_P (cmp))
4693 cmp = copy_to_mode_reg (mode, cmp);
4694 if (!REG_P (src))
4695 src = copy_to_mode_reg (mode, src);
4697 if (mode == SImode)
4698 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4699 else
4700 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4702 emit_insn (pat);
4704 return target;
4708 /* Codes for all the NVPTX builtins. */
4709 enum nvptx_builtins
4711 NVPTX_BUILTIN_SHUFFLE,
4712 NVPTX_BUILTIN_SHUFFLELL,
4713 NVPTX_BUILTIN_WORKER_ADDR,
4714 NVPTX_BUILTIN_CMP_SWAP,
4715 NVPTX_BUILTIN_CMP_SWAPLL,
4716 NVPTX_BUILTIN_MAX
4719 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4721 /* Return the NVPTX builtin for CODE. */
4723 static tree
4724 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4726 if (code >= NVPTX_BUILTIN_MAX)
4727 return error_mark_node;
4729 return nvptx_builtin_decls[code];
4732 /* Set up all builtin functions for this target. */
4734 static void
4735 nvptx_init_builtins (void)
4737 #define DEF(ID, NAME, T) \
4738 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4739 = add_builtin_function ("__builtin_nvptx_" NAME, \
4740 build_function_type_list T, \
4741 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4742 #define ST sizetype
4743 #define UINT unsigned_type_node
4744 #define LLUINT long_long_unsigned_type_node
4745 #define PTRVOID ptr_type_node
4747 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4748 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4749 DEF (WORKER_ADDR, "worker_addr",
4750 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4751 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4752 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4754 #undef DEF
4755 #undef ST
4756 #undef UINT
4757 #undef LLUINT
4758 #undef PTRVOID
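/* A usage sketch of the builtins declared above, as calls might look
   in offloaded code; the shuffle-kind value and the operands are
   illustrative, not canonical:

     unsigned v = __builtin_nvptx_shuffle (x, 16, kind);
     void *r = __builtin_nvptx_worker_addr (offset, size, align);
     unsigned old = __builtin_nvptx_cmp_swap (ptr, expected, desired);  */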
4761 /* Expand an expression EXP that calls a built-in function,
4762 with result going to TARGET if that's convenient
4763 (and in mode MODE if that's convenient).
4764 SUBTARGET may be used as the target for computing one of EXP's operands.
4765 IGNORE is nonzero if the value is to be ignored. */
4767 static rtx
4768 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4769 machine_mode mode, int ignore)
4771 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4772 switch (DECL_FUNCTION_CODE (fndecl))
4774 case NVPTX_BUILTIN_SHUFFLE:
4775 case NVPTX_BUILTIN_SHUFFLELL:
4776 return nvptx_expand_shuffle (exp, target, mode, ignore);
4778 case NVPTX_BUILTIN_WORKER_ADDR:
4779 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4781 case NVPTX_BUILTIN_CMP_SWAP:
4782 case NVPTX_BUILTIN_CMP_SWAPLL:
4783 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4785 default: gcc_unreachable ();
4789 /* Define dimension sizes for known hardware. */
4790 #define PTX_VECTOR_LENGTH 32
4791 #define PTX_WORKER_LENGTH 32
4792 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
4794 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4796 static int
4797 nvptx_simt_vf ()
4799 return PTX_VECTOR_LENGTH;
4802 /* Validate compute dimensions of an OpenACC offload or routine, fill
4803 in non-unity defaults. FN_LEVEL indicates the level at which a
4804 routine might spawn a loop. It is negative for non-routines. If
4805 DECL is null, we are validating the default dimensions. */
4807 static bool
4808 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4810 bool changed = false;
4812 /* The vector size must be 32, unless this is a SEQ routine. */
4813 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4814 && dims[GOMP_DIM_VECTOR] >= 0
4815 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4817 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4818 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4819 dims[GOMP_DIM_VECTOR]
4820 ? G_("using vector_length (%d), ignoring %d")
4821 : G_("using vector_length (%d), ignoring runtime setting"),
4822 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4823 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4824 changed = true;
4827 /* Check that num_workers is not too large. */
4828 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4830 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4831 "using num_workers (%d), ignoring %d",
4832 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4833 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4834 changed = true;
4837 if (!decl)
4839 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4840 if (dims[GOMP_DIM_WORKER] < 0)
4841 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4842 if (dims[GOMP_DIM_GANG] < 0)
4843 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4844 changed = true;
4847 return changed;
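/* Worked example: validating the defaults (DECL == NULL) with
   dims[] == { -1, -1, -1 } in { gang, worker, vector } order yields

     dims[GOMP_DIM_VECTOR] == 32   // PTX_VECTOR_LENGTH
     dims[GOMP_DIM_WORKER] == 32   // PTX_WORKER_LENGTH
     dims[GOMP_DIM_GANG]   == 0    // PTX_GANG_DEFAULT, defer to runtime

   and the function returns true.  */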
4850 /* Return maximum dimension size, or zero for unbounded. */
4852 static int
4853 nvptx_dim_limit (int axis)
4855 switch (axis)
4857 case GOMP_DIM_WORKER:
4858 return PTX_WORKER_LENGTH;
4860 case GOMP_DIM_VECTOR:
4861 return PTX_VECTOR_LENGTH;
4863 default:
4864 break;
4866 return 0;
4869 /* Determine whether fork & joins are needed. */
4871 static bool
4872 nvptx_goacc_fork_join (gcall *call, const int dims[],
4873 bool ARG_UNUSED (is_fork))
4875 tree arg = gimple_call_arg (call, 2);
4876 unsigned axis = TREE_INT_CST_LOW (arg);
4878 /* We only care about worker and vector partitioning. */
4879 if (axis < GOMP_DIM_WORKER)
4880 return false;
4882 /* If the size is 1, there's no partitioning. */
4883 if (dims[axis] == 1)
4884 return false;
4886 return true;
4889 /* Generate a PTX builtin function call that returns the address in
4890 the worker reduction buffer at OFFSET. TYPE is the type of the
4891 data at that location. */
4893 static tree
4894 nvptx_get_worker_red_addr (tree type, tree offset)
4896 machine_mode mode = TYPE_MODE (type);
4897 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4898 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4899 tree align = build_int_cst (unsigned_type_node,
4900 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4901 tree call = build_call_expr (fndecl, 3, offset, size, align);
4903 return fold_convert (build_pointer_type (type), call);
4906 /* Emit a SHFL.DOWN of VAR by SHIFT lanes into DEST_VAR. This function
4907 will cast the variable if necessary. */
4909 static void
4910 nvptx_generate_vector_shuffle (location_t loc,
4911 tree dest_var, tree var, unsigned shift,
4912 gimple_seq *seq)
4914 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4915 tree_code code = NOP_EXPR;
4916 tree arg_type = unsigned_type_node;
4917 tree var_type = TREE_TYPE (var);
4918 tree dest_type = var_type;
4920 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4921 var_type = TREE_TYPE (var_type);
4923 if (TREE_CODE (var_type) == REAL_TYPE)
4924 code = VIEW_CONVERT_EXPR;
4926 if (TYPE_SIZE (var_type)
4927 == TYPE_SIZE (long_long_unsigned_type_node))
4929 fn = NVPTX_BUILTIN_SHUFFLELL;
4930 arg_type = long_long_unsigned_type_node;
4933 tree call = nvptx_builtin_decl (fn, true);
4934 tree bits = build_int_cst (unsigned_type_node, shift);
4935 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4936 tree expr;
4938 if (var_type != dest_type)
4940 /* Do real and imaginary parts separately. */
4941 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4942 real = fold_build1 (code, arg_type, real);
4943 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4944 real = fold_build1 (code, var_type, real);
4946 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4947 imag = fold_build1 (code, arg_type, imag);
4948 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4949 imag = fold_build1 (code, var_type, imag);
4951 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4953 else
4955 expr = fold_build1 (code, arg_type, var);
4956 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4957 expr = fold_build1 (code, dest_type, expr);
4960 gimplify_assign (dest_var, expr, seq);
4963 /* Lazily generate the global lock var decl and return its address. */
4965 static tree
4966 nvptx_global_lock_addr ()
4968 tree v = global_lock_var;
4970 if (!v)
4972 tree name = get_identifier ("__reduction_lock");
4973 tree type = build_qualified_type (unsigned_type_node,
4974 TYPE_QUAL_VOLATILE);
4975 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4976 global_lock_var = v;
4977 DECL_ARTIFICIAL (v) = 1;
4978 DECL_EXTERNAL (v) = 1;
4979 TREE_STATIC (v) = 1;
4980 TREE_PUBLIC (v) = 1;
4981 TREE_USED (v) = 1;
4982 mark_addressable (v);
4983 mark_decl_referenced (v);
4986 return build_fold_addr_expr (v);
4989 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4990 GSI. We use a lockless scheme for nearly all cases, which looks
4991 like:
4992 actual = initval(OP);
4993 do {
4994 guess = actual;
4995 write = guess OP myval;
4996 actual = cmp&swap (ptr, guess, write)
4997 } while (actual bit-different-to guess);
4998 return write;
5000 This relies on a cmp&swap instruction, which is available for 32-
5001 and 64-bit types. Larger types must use a locking scheme. */
5003 static tree
5004 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5005 tree ptr, tree var, tree_code op)
5007 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5008 tree_code code = NOP_EXPR;
5009 tree arg_type = unsigned_type_node;
5010 tree var_type = TREE_TYPE (var);
5012 if (TREE_CODE (var_type) == COMPLEX_TYPE
5013 || TREE_CODE (var_type) == REAL_TYPE)
5014 code = VIEW_CONVERT_EXPR;
5016 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5018 arg_type = long_long_unsigned_type_node;
5019 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5022 tree swap_fn = nvptx_builtin_decl (fn, true);
5024 gimple_seq init_seq = NULL;
5025 tree init_var = make_ssa_name (arg_type);
5026 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5027 init_expr = fold_build1 (code, arg_type, init_expr);
5028 gimplify_assign (init_var, init_expr, &init_seq);
5029 gimple *init_end = gimple_seq_last (init_seq);
5031 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5033 /* Split the block just after the init stmts. */
5034 basic_block pre_bb = gsi_bb (*gsi);
5035 edge pre_edge = split_block (pre_bb, init_end);
5036 basic_block loop_bb = pre_edge->dest;
5037 pre_bb = pre_edge->src;
5038 /* Reset the iterator. */
5039 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5041 tree expect_var = make_ssa_name (arg_type);
5042 tree actual_var = make_ssa_name (arg_type);
5043 tree write_var = make_ssa_name (arg_type);
5045 /* Build and insert the reduction calculation. */
5046 gimple_seq red_seq = NULL;
5047 tree write_expr = fold_build1 (code, var_type, expect_var);
5048 write_expr = fold_build2 (op, var_type, write_expr, var);
5049 write_expr = fold_build1 (code, arg_type, write_expr);
5050 gimplify_assign (write_var, write_expr, &red_seq);
5052 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5054 /* Build & insert the cmp&swap sequence. */
5055 gimple_seq latch_seq = NULL;
5056 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5057 ptr, expect_var, write_var);
5058 gimplify_assign (actual_var, swap_expr, &latch_seq);
5060 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5061 NULL_TREE, NULL_TREE);
5062 gimple_seq_add_stmt (&latch_seq, cond);
5064 gimple *latch_end = gimple_seq_last (latch_seq);
5065 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5067 /* Split the block just after the latch stmts. */
5068 edge post_edge = split_block (loop_bb, latch_end);
5069 basic_block post_bb = post_edge->dest;
5070 loop_bb = post_edge->src;
5071 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5073 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5074 post_edge->probability = profile_probability::even ();
5075 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5076 loop_edge->probability = profile_probability::even ();
5077 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5078 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5080 gphi *phi = create_phi_node (expect_var, loop_bb);
5081 add_phi_arg (phi, init_var, pre_edge, loc);
5082 add_phi_arg (phi, actual_var, loop_edge, loc);
5084 loop *loop = alloc_loop ();
5085 loop->header = loop_bb;
5086 loop->latch = loop_bb;
5087 add_loop (loop, loop_bb->loop_father);
5089 return fold_build1 (code, var_type, write_var);
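/* For REAL_TYPE the VIEW_CONVERT_EXPRs above make the loop operate on
   the value's bit pattern.  A minimal C sketch for float addition,
   where bits_of/float_of_bits stand for the view-converts and
   cmp_swap for the builtin (all three hypothetical helpers):

     unsigned guess, write, actual = bits_of (initval);
     do
       {
	 guess = actual;
	 float f = float_of_bits (guess);
	 f += myval;
	 write = bits_of (f);
	 actual = cmp_swap (ptr, guess, write);
       }
     while (actual != guess);  */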
5092 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5093 GSI. This is necessary for types larger than 64 bits, where there
5094 is no cmp&swap instruction to implement a lockless scheme. We use
5095 a lock variable in global memory.
5097 while (cmp&swap (&lock_var, 0, 1))
5098 continue;
5099 T accum = *ptr;
5100 accum = accum OP var;
5101 *ptr = accum;
5102 cmp&swap (&lock_var, 1, 0);
5103 return accum;
5105 A lock in global memory is necessary to force execution engine
5106 descheduling and avoid resource starvation that can occur if the
5107 lock is in .shared memory. */
5109 static tree
5110 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5111 tree ptr, tree var, tree_code op)
5113 tree var_type = TREE_TYPE (var);
5114 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5115 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5116 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5118 /* Split the block just before the gsi. Insert a gimple nop to make
5119 this easier. */
5120 gimple *nop = gimple_build_nop ();
5121 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5122 basic_block entry_bb = gsi_bb (*gsi);
5123 edge entry_edge = split_block (entry_bb, nop);
5124 basic_block lock_bb = entry_edge->dest;
5125 /* Reset the iterator. */
5126 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5128 /* Build and insert the locking sequence. */
5129 gimple_seq lock_seq = NULL;
5130 tree lock_var = make_ssa_name (unsigned_type_node);
5131 tree lock_expr = nvptx_global_lock_addr ();
5132 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5133 uns_unlocked, uns_locked);
5134 gimplify_assign (lock_var, lock_expr, &lock_seq);
5135 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5136 NULL_TREE, NULL_TREE);
5137 gimple_seq_add_stmt (&lock_seq, cond);
5138 gimple *lock_end = gimple_seq_last (lock_seq);
5139 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5141 /* Split the block just after the lock sequence. */
5142 edge locked_edge = split_block (lock_bb, lock_end);
5143 basic_block update_bb = locked_edge->dest;
5144 lock_bb = locked_edge->src;
5145 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5147 /* Create the lock loop ... */
5148 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5149 locked_edge->probability = profile_probability::even ();
5150 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5151 loop_edge->probability = profile_probability::even ();
5152 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5153 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5155 /* ... and the loop structure. */
5156 loop *lock_loop = alloc_loop ();
5157 lock_loop->header = lock_bb;
5158 lock_loop->latch = lock_bb;
5159 lock_loop->nb_iterations_estimate = 1;
5160 lock_loop->any_estimate = true;
5161 add_loop (lock_loop, entry_bb->loop_father);
5163 /* Build and insert the reduction calculation. */
5164 gimple_seq red_seq = NULL;
5165 tree acc_in = make_ssa_name (var_type);
5166 tree ref_in = build_simple_mem_ref (ptr);
5167 TREE_THIS_VOLATILE (ref_in) = 1;
5168 gimplify_assign (acc_in, ref_in, &red_seq);
5170 tree acc_out = make_ssa_name (var_type);
5171 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5172 gimplify_assign (acc_out, update_expr, &red_seq);
5174 tree ref_out = build_simple_mem_ref (ptr);
5175 TREE_THIS_VOLATILE (ref_out) = 1;
5176 gimplify_assign (ref_out, acc_out, &red_seq);
5178 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5180 /* Build & insert the unlock sequence. */
5181 gimple_seq unlock_seq = NULL;
5182 tree unlock_expr = nvptx_global_lock_addr ();
5183 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5184 uns_locked, uns_unlocked);
5185 gimplify_and_add (unlock_expr, &unlock_seq);
5186 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5188 return acc_out;
5191 /* Emit a sequence to update a reduction accumulator at *PTR with the
5192 value held in VAR using operator OP. Return the updated value.
5194 TODO: optimize for atomic ops and independent complex ops. */
5196 static tree
5197 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5198 tree ptr, tree var, tree_code op)
5200 tree type = TREE_TYPE (var);
5201 tree size = TYPE_SIZE (type);
5203 if (size == TYPE_SIZE (unsigned_type_node)
5204 || size == TYPE_SIZE (long_long_unsigned_type_node))
5205 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5206 else
5207 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
5210 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5212 static void
5213 nvptx_goacc_reduction_setup (gcall *call)
5215 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5216 tree lhs = gimple_call_lhs (call);
5217 tree var = gimple_call_arg (call, 2);
5218 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5219 gimple_seq seq = NULL;
5221 push_gimplify_context (true);
5223 if (level != GOMP_DIM_GANG)
5225 /* Copy the receiver object. */
5226 tree ref_to_res = gimple_call_arg (call, 1);
5228 if (!integer_zerop (ref_to_res))
5229 var = build_simple_mem_ref (ref_to_res);
5232 if (level == GOMP_DIM_WORKER)
5234 /* Store incoming value to worker reduction buffer. */
5235 tree offset = gimple_call_arg (call, 5);
5236 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5237 tree ptr = make_ssa_name (TREE_TYPE (call));
5239 gimplify_assign (ptr, call, &seq);
5240 tree ref = build_simple_mem_ref (ptr);
5241 TREE_THIS_VOLATILE (ref) = 1;
5242 gimplify_assign (ref, var, &seq);
5245 if (lhs)
5246 gimplify_assign (lhs, var, &seq);
5248 pop_gimplify_context (NULL);
5249 gsi_replace_with_seq (&gsi, seq, true);
5252 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5254 static void
5255 nvptx_goacc_reduction_init (gcall *call)
5257 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5258 tree lhs = gimple_call_lhs (call);
5259 tree var = gimple_call_arg (call, 2);
5260 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5261 enum tree_code rcode
5262 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5263 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5264 TREE_TYPE (var));
5265 gimple_seq seq = NULL;
5267 push_gimplify_context (true);
5269 if (level == GOMP_DIM_VECTOR)
5271 /* Initialize the non-zero vector lanes to INIT_VAL (OP). */
5272 tree tid = make_ssa_name (integer_type_node);
5273 tree dim_vector = gimple_call_arg (call, 3);
5274 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5275 dim_vector);
5276 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5277 NULL_TREE, NULL_TREE);
5279 gimple_call_set_lhs (tid_call, tid);
5280 gimple_seq_add_stmt (&seq, tid_call);
5281 gimple_seq_add_stmt (&seq, cond_stmt);
5283 /* Split the block just after the call. */
5284 edge init_edge = split_block (gsi_bb (gsi), call);
5285 basic_block init_bb = init_edge->dest;
5286 basic_block call_bb = init_edge->src;
5288 /* Fixup flags from call_bb to init_bb. */
5289 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5290 init_edge->probability = profile_probability::even ();
5292 /* Set the initialization stmts. */
5293 gimple_seq init_seq = NULL;
5294 tree init_var = make_ssa_name (TREE_TYPE (var));
5295 gimplify_assign (init_var, init, &init_seq);
5296 gsi = gsi_start_bb (init_bb);
5297 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5299 /* Split block just after the init stmt. */
5300 gsi_prev (&gsi);
5301 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5302 basic_block dst_bb = inited_edge->dest;
5304 /* Create false edge from call_bb to dst_bb. */
5305 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5306 nop_edge->probability = profile_probability::even ();
5308 /* Create phi node in dst block. */
5309 gphi *phi = create_phi_node (lhs, dst_bb);
5310 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5311 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5313 /* Reset dominator of dst bb. */
5314 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5316 /* Reset the gsi. */
5317 gsi = gsi_for_stmt (call);
5319 else
5321 if (level == GOMP_DIM_GANG)
5323 /* If there's no receiver object, propagate the incoming VAR. */
5324 tree ref_to_res = gimple_call_arg (call, 1);
5325 if (integer_zerop (ref_to_res))
5326 init = var;
5329 gimplify_assign (lhs, init, &seq);
5332 pop_gimplify_context (NULL);
5333 gsi_replace_with_seq (&gsi, seq, true);
5336 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5338 static void
5339 nvptx_goacc_reduction_fini (gcall *call)
5341 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5342 tree lhs = gimple_call_lhs (call);
5343 tree ref_to_res = gimple_call_arg (call, 1);
5344 tree var = gimple_call_arg (call, 2);
5345 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5346 enum tree_code op
5347 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5348 gimple_seq seq = NULL;
5349 tree r = NULL_TREE;
5351 push_gimplify_context (true);
5353 if (level == GOMP_DIM_VECTOR)
5355 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
5356 but that requires a method of emitting a unified jump at the
5357 gimple level. */
5358 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5360 tree other_var = make_ssa_name (TREE_TYPE (var));
5361 nvptx_generate_vector_shuffle (gimple_location (call),
5362 other_var, var, shfl, &seq);
5364 r = make_ssa_name (TREE_TYPE (var));
5365 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5366 var, other_var), &seq);
5367 var = r;
5370 else
5372 tree accum = NULL_TREE;
5374 if (level == GOMP_DIM_WORKER)
5376 /* Get reduction buffer address. */
5377 tree offset = gimple_call_arg (call, 5);
5378 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5379 tree ptr = make_ssa_name (TREE_TYPE (call));
5381 gimplify_assign (ptr, call, &seq);
5382 accum = ptr;
5384 else if (integer_zerop (ref_to_res))
5385 r = var;
5386 else
5387 accum = ref_to_res;
5389 if (accum)
5391 /* UPDATE the accumulator. */
5392 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5393 seq = NULL;
5394 r = nvptx_reduction_update (gimple_location (call), &gsi,
5395 accum, var, op);
5399 if (lhs)
5400 gimplify_assign (lhs, r, &seq);
5401 pop_gimplify_context (NULL);
5403 gsi_replace_with_seq (&gsi, seq, true);
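/* The vector case above is a log2-depth tree reduction across the 32
   lanes of a warp.  Schematic equivalent, with OP the reduction
   operator and shuffle_down standing for the SHUFFLE builtin with
   SHUFFLE_DOWN:

     for (int shfl = 16; shfl > 0; shfl >>= 1)
       var = var OP shuffle_down (var, shfl);
     // lane 0 holds the complete reduction after 5 steps  */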
5406 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5408 static void
5409 nvptx_goacc_reduction_teardown (gcall *call)
5411 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5412 tree lhs = gimple_call_lhs (call);
5413 tree var = gimple_call_arg (call, 2);
5414 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5415 gimple_seq seq = NULL;
5417 push_gimplify_context (true);
5418 if (level == GOMP_DIM_WORKER)
5420 /* Read the worker reduction buffer. */
5421 tree offset = gimple_call_arg (call, 5);
5422 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5423 tree ptr = make_ssa_name (TREE_TYPE (call));
5425 gimplify_assign (ptr, call, &seq);
5426 var = build_simple_mem_ref (ptr);
5427 TREE_THIS_VOLATILE (var) = 1;
5430 if (level != GOMP_DIM_GANG)
5432 /* Write to the receiver object. */
5433 tree ref_to_res = gimple_call_arg (call, 1);
5435 if (!integer_zerop (ref_to_res))
5436 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5439 if (lhs)
5440 gimplify_assign (lhs, var, &seq);
5442 pop_gimplify_context (NULL);
5444 gsi_replace_with_seq (&gsi, seq, true);
5447 /* NVPTX reduction expander. */
5449 static void
5450 nvptx_goacc_reduction (gcall *call)
5452 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5454 switch (code)
5456 case IFN_GOACC_REDUCTION_SETUP:
5457 nvptx_goacc_reduction_setup (call);
5458 break;
5460 case IFN_GOACC_REDUCTION_INIT:
5461 nvptx_goacc_reduction_init (call);
5462 break;
5464 case IFN_GOACC_REDUCTION_FINI:
5465 nvptx_goacc_reduction_fini (call);
5466 break;
5468 case IFN_GOACC_REDUCTION_TEARDOWN:
5469 nvptx_goacc_reduction_teardown (call);
5470 break;
5472 default:
5473 gcc_unreachable ();
5477 static bool
5478 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
5479 rtx x ATTRIBUTE_UNUSED)
5481 return true;
5484 static bool
5485 nvptx_vector_mode_supported (machine_mode mode)
5487 return (mode == V2SImode
5488 || mode == V2DImode);
5491 /* Return the preferred mode for vectorizing scalar MODE. */
5493 static machine_mode
5494 nvptx_preferred_simd_mode (scalar_mode mode)
5496 switch (mode)
5498 case E_DImode:
5499 return V2DImode;
5500 case E_SImode:
5501 return V2SImode;
5503 default:
5504 return default_preferred_simd_mode (mode);
5508 unsigned int
5509 nvptx_data_alignment (const_tree type, unsigned int basic_align)
5511 if (TREE_CODE (type) == INTEGER_TYPE)
5513 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5514 if (size == GET_MODE_SIZE (TImode))
5515 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
5518 return basic_align;
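/* E.g. a 128-bit integer type is not aligned to 128 bits but to the
   bit size of the mode TImode splits into; assuming
   maybe_split_mode (TImode) == DImode (as defined earlier in this
   file), that is 64-bit alignment.  */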
5521 /* Implement TARGET_MODES_TIEABLE_P. */
5523 static bool
5524 nvptx_modes_tieable_p (machine_mode, machine_mode)
5526 return false;
5529 /* Implement TARGET_HARD_REGNO_NREGS. */
5531 static unsigned int
5532 nvptx_hard_regno_nregs (unsigned int, machine_mode)
5534 return 1;
5537 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
5539 static bool
5540 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
5542 return false;
5545 #undef TARGET_OPTION_OVERRIDE
5546 #define TARGET_OPTION_OVERRIDE nvptx_option_override
5548 #undef TARGET_ATTRIBUTE_TABLE
5549 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5551 #undef TARGET_LRA_P
5552 #define TARGET_LRA_P hook_bool_void_false
5554 #undef TARGET_LEGITIMATE_ADDRESS_P
5555 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
5557 #undef TARGET_PROMOTE_FUNCTION_MODE
5558 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
5560 #undef TARGET_FUNCTION_ARG
5561 #define TARGET_FUNCTION_ARG nvptx_function_arg
5562 #undef TARGET_FUNCTION_INCOMING_ARG
5563 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
5564 #undef TARGET_FUNCTION_ARG_ADVANCE
5565 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
5566 #undef TARGET_FUNCTION_ARG_BOUNDARY
5567 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
5568 #undef TARGET_PASS_BY_REFERENCE
5569 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
5570 #undef TARGET_FUNCTION_VALUE_REGNO_P
5571 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
5572 #undef TARGET_FUNCTION_VALUE
5573 #define TARGET_FUNCTION_VALUE nvptx_function_value
5574 #undef TARGET_LIBCALL_VALUE
5575 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
5576 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
5577 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
5578 #undef TARGET_GET_DRAP_RTX
5579 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
5580 #undef TARGET_SPLIT_COMPLEX_ARG
5581 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
5582 #undef TARGET_RETURN_IN_MEMORY
5583 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
5584 #undef TARGET_OMIT_STRUCT_RETURN_REG
5585 #define TARGET_OMIT_STRUCT_RETURN_REG true
5586 #undef TARGET_STRICT_ARGUMENT_NAMING
5587 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
5588 #undef TARGET_CALL_ARGS
5589 #define TARGET_CALL_ARGS nvptx_call_args
5590 #undef TARGET_END_CALL_ARGS
5591 #define TARGET_END_CALL_ARGS nvptx_end_call_args
5593 #undef TARGET_ASM_FILE_START
5594 #define TARGET_ASM_FILE_START nvptx_file_start
5595 #undef TARGET_ASM_FILE_END
5596 #define TARGET_ASM_FILE_END nvptx_file_end
5597 #undef TARGET_ASM_GLOBALIZE_LABEL
5598 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
5599 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
5600 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
5601 #undef TARGET_PRINT_OPERAND
5602 #define TARGET_PRINT_OPERAND nvptx_print_operand
5603 #undef TARGET_PRINT_OPERAND_ADDRESS
5604 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
5605 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
5606 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
5607 #undef TARGET_ASM_INTEGER
5608 #define TARGET_ASM_INTEGER nvptx_assemble_integer
5609 #undef TARGET_ASM_DECL_END
5610 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
5611 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
5612 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
5613 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
5614 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
5615 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
5616 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
5618 #undef TARGET_MACHINE_DEPENDENT_REORG
5619 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
5620 #undef TARGET_NO_REGISTER_ALLOCATION
5621 #define TARGET_NO_REGISTER_ALLOCATION true
5623 #undef TARGET_ENCODE_SECTION_INFO
5624 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
5625 #undef TARGET_RECORD_OFFLOAD_SYMBOL
5626 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
5628 #undef TARGET_VECTOR_ALIGNMENT
5629 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
5631 #undef TARGET_CANNOT_COPY_INSN_P
5632 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
5634 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
5635 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
5637 #undef TARGET_INIT_BUILTINS
5638 #define TARGET_INIT_BUILTINS nvptx_init_builtins
5639 #undef TARGET_EXPAND_BUILTIN
5640 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
5641 #undef TARGET_BUILTIN_DECL
5642 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
5644 #undef TARGET_SIMT_VF
5645 #define TARGET_SIMT_VF nvptx_simt_vf
5647 #undef TARGET_GOACC_VALIDATE_DIMS
5648 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
5650 #undef TARGET_GOACC_DIM_LIMIT
5651 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
5653 #undef TARGET_GOACC_FORK_JOIN
5654 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
5656 #undef TARGET_GOACC_REDUCTION
5657 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
5659 #undef TARGET_CANNOT_FORCE_CONST_MEM
5660 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
5662 #undef TARGET_VECTOR_MODE_SUPPORTED_P
5663 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
5665 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
5666 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
5667 nvptx_preferred_simd_mode
5669 #undef TARGET_MODES_TIEABLE_P
5670 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
5672 #undef TARGET_HARD_REGNO_NREGS
5673 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
5675 #undef TARGET_CAN_CHANGE_MODE_CLASS
5676 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
5678 struct gcc_target targetm = TARGET_INITIALIZER;
5680 #include "gt-nvptx.h"