[nvptx] Fix bug in jit bug workaround
gcc/config/nvptx/nvptx.c
1 /* Target code for NVPTX.
2 Copyright (C) 2014-2018 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #include <sstream>
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "cfghooks.h"
32 #include "df.h"
33 #include "memmodel.h"
34 #include "tm_p.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "alias.h"
42 #include "insn-flags.h"
43 #include "output.h"
44 #include "insn-attr.h"
45 #include "flags.h"
46 #include "dojump.h"
47 #include "explow.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "tm-preds.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
55 #include "dbxout.h"
56 #include "cfgrtl.h"
57 #include "gimple.h"
58 #include "stor-layout.h"
59 #include "builtins.h"
60 #include "omp-general.h"
61 #include "omp-low.h"
62 #include "gomp-constants.h"
63 #include "dumpfile.h"
64 #include "internal-fn.h"
65 #include "gimple-iterator.h"
66 #include "stringpool.h"
67 #include "attribs.h"
68 #include "tree-vrp.h"
69 #include "tree-ssa-operands.h"
70 #include "tree-ssanames.h"
71 #include "gimplify.h"
72 #include "tree-phinodes.h"
73 #include "cfgloop.h"
74 #include "fold-const.h"
75 #include "intl.h"
77 /* This file should be included last. */
78 #include "target-def.h"
80 #define WORKAROUND_PTXJIT_BUG 1
82 /* The various PTX memory areas an object might reside in. */
83 enum nvptx_data_area
85 DATA_AREA_GENERIC,
86 DATA_AREA_GLOBAL,
87 DATA_AREA_SHARED,
88 DATA_AREA_LOCAL,
89 DATA_AREA_CONST,
90 DATA_AREA_PARAM,
91 DATA_AREA_MAX
94 /* We record the data area in the target symbol flags. */
95 #define SYMBOL_DATA_AREA(SYM) \
96 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
97 & 7)
98 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
99 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
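/* For instance, tagging a symbol for shared memory and querying the
   tag back might look like this (illustrative sketch; "__my_buf" is a
   made-up name, mirroring the worker_bcast_sym setup further down):

     rtx sym = gen_rtx_SYMBOL_REF (Pmode, "__my_buf");
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);  */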
101 /* Record the function decls we've written, and the libfuncs and function
102 decls corresponding to them. */
103 static std::stringstream func_decls;
105 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
107 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
108 static bool equal (rtx a, rtx b) { return a == b; }
111 static GTY((cache))
112 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
114 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
116 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
117 static bool equal (tree a, tree b) { return a == b; }
120 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
121 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
123 /* Buffer needed to broadcast across workers. This is used for both
124 worker-neutering and worker broadcasting. It is shared by all
125 functions emitted. The buffer is placed in shared memory. It'd be
126 nice if PTX supported common blocks, because then this could be
127 shared across TUs (taking the largest size). */
128 static unsigned worker_bcast_size;
129 static unsigned worker_bcast_align;
130 static GTY(()) rtx worker_bcast_sym;
132 /* Buffer needed for worker reductions. This has to be distinct from
133 the worker broadcast array, as both may be live concurrently. */
134 static unsigned worker_red_size;
135 static unsigned worker_red_align;
136 static GTY(()) rtx worker_red_sym;
138 /* Global lock variable, needed for 128bit worker & gang reductions. */
139 static GTY(()) tree global_lock_var;
141 /* True if any function references __nvptx_stacks. */
142 static bool need_softstack_decl;
144 /* True if any function references __nvptx_uni. */
145 static bool need_unisimt_decl;
147 /* Allocate a new, cleared machine_function structure. */
149 static struct machine_function *
150 nvptx_init_machine_status (void)
152 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
153 p->return_mode = VOIDmode;
154 return p;
157 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
158 and -fopenacc is also enabled. */
160 static void
161 diagnose_openacc_conflict (bool optval, const char *optname)
163 if (flag_openacc && optval)
164 error ("option %s is not supported together with -fopenacc", optname);
167 /* Implement TARGET_OPTION_OVERRIDE. */
169 static void
170 nvptx_option_override (void)
172 init_machine_status = nvptx_init_machine_status;
174 /* Set toplevel_reorder, unless explicitly disabled. We need
175 reordering so that we emit necessary assembler decls of
176 undeclared variables. */
177 if (!global_options_set.x_flag_toplevel_reorder)
178 flag_toplevel_reorder = 1;
180 debug_nonbind_markers_p = 0;
182 /* Set flag_no_common, unless explicitly disabled. We fake common
183 using .weak, and that's not entirely accurate, so avoid it
184 unless forced. */
185 if (!global_options_set.x_flag_no_common)
186 flag_no_common = 1;
188 /* The patch area requires nops, which we don't have. */
189 if (function_entry_patch_area_size > 0)
190 sorry ("not generating patch area, nops not supported");
192 /* Assumes that it will see only hard registers. */
193 flag_var_tracking = 0;
195 if (nvptx_optimize < 0)
196 nvptx_optimize = optimize > 0;
198 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
199 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
200 declared_libfuncs_htab
201 = hash_table<declared_libfunc_hasher>::create_ggc (17);
203 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
204 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
205 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
207 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
208 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
209 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
211 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
212 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
213 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
215 if (TARGET_GOMP)
216 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
219 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
220 deal with ptx idiosyncrasies. */
222 const char *
223 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
225 switch (mode)
227 case E_BLKmode:
228 return ".b8";
229 case E_BImode:
230 return ".pred";
231 case E_QImode:
232 if (promote)
233 return ".u32";
234 else
235 return ".u8";
236 case E_HImode:
237 return ".u16";
238 case E_SImode:
239 return ".u32";
240 case E_DImode:
241 return ".u64";
243 case E_SFmode:
244 return ".f32";
245 case E_DFmode:
246 return ".f64";
248 case E_V2SImode:
249 return ".v2.u32";
250 case E_V2DImode:
251 return ".v2.u64";
253 default:
254 gcc_unreachable ();
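/* Example mappings implied by the switch above (illustrative):

     nvptx_ptx_type_from_mode (SImode, false)  -> ".u32"
     nvptx_ptx_type_from_mode (QImode, false)  -> ".u8"
     nvptx_ptx_type_from_mode (QImode, true)   -> ".u32"

   i.e. with PROMOTE set, QImode values live in 32-bit registers.  */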
258 /* Encode the PTX data area that DECL (which might not actually be a
259 _DECL) should reside in. */
261 static void
262 nvptx_encode_section_info (tree decl, rtx rtl, int first)
264 default_encode_section_info (decl, rtl, first);
265 if (first && MEM_P (rtl))
267 nvptx_data_area area = DATA_AREA_GENERIC;
269 if (TREE_CONSTANT (decl))
270 area = DATA_AREA_CONST;
271 else if (TREE_CODE (decl) == VAR_DECL)
273 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
275 area = DATA_AREA_SHARED;
276 if (DECL_INITIAL (decl))
277 error ("static initialization of variable %q+D in %<.shared%>"
278 " memory is not supported", decl);
280 else
281 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
284 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
288 /* Return the PTX name of the data area in which SYM should be
289 placed. The symbol must have already been processed by
290 nvptx_encode_section_info, or equivalent. */
292 static const char *
293 section_for_sym (rtx sym)
295 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
296 /* Same order as nvptx_data_area enum. */
297 static char const *const areas[] =
298 {"", ".global", ".shared", ".local", ".const", ".param"};
300 return areas[area];
303 /* Similarly for a decl. */
305 static const char *
306 section_for_decl (const_tree decl)
308 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
311 /* Check NAME for special function names and redirect them by returning a
312 replacement. This applies to malloc, free and realloc, for which we
313 want to use libgcc wrappers, and call, which triggers a bug in
314 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
315 not active in an offload compiler -- the names are all set by the
316 host-side compiler. */
318 static const char *
319 nvptx_name_replacement (const char *name)
321 if (strcmp (name, "call") == 0)
322 return "__nvptx_call";
323 if (strcmp (name, "malloc") == 0)
324 return "__nvptx_malloc";
325 if (strcmp (name, "free") == 0)
326 return "__nvptx_free";
327 if (strcmp (name, "realloc") == 0)
328 return "__nvptx_realloc";
329 return name;
332 /* If MODE should be treated as two registers of an inner mode, return
333 that inner mode. Otherwise return VOIDmode. */
335 static machine_mode
336 maybe_split_mode (machine_mode mode)
338 if (COMPLEX_MODE_P (mode))
339 return GET_MODE_INNER (mode);
341 if (mode == TImode)
342 return DImode;
344 return VOIDmode;
347 /* Return true if mode should be treated as two registers. */
349 static bool
350 split_mode_p (machine_mode mode)
352 return maybe_split_mode (mode) != VOIDmode;
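/* Examples (illustrative): maybe_split_mode (TImode) is DImode and
   maybe_split_mode (DCmode) is DFmode, so both are handled as a pair
   of registers of the inner mode, while maybe_split_mode (SImode) is
   VOIDmode and the value stays in a single register.  */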
355 /* Output a register, subreg, or register pair (with optional
356 enclosing braces). */
358 static void
359 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
360 int subreg_offset = -1)
362 if (inner_mode == VOIDmode)
364 if (HARD_REGISTER_NUM_P (regno))
365 fprintf (file, "%s", reg_names[regno]);
366 else
367 fprintf (file, "%%r%d", regno);
369 else if (subreg_offset >= 0)
371 output_reg (file, regno, VOIDmode);
372 fprintf (file, "$%d", subreg_offset);
374 else
376 if (subreg_offset == -1)
377 fprintf (file, "{");
378 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
379 fprintf (file, ",");
380 output_reg (file, regno, inner_mode, 0);
381 if (subreg_offset == -1)
382 fprintf (file, "}");
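/* For a hypothetical pseudo register 23 (illustrative): VOIDmode
   prints "%r23"; an SImode INNER_MODE with the default SUBREG_OFFSET
   prints the braced pair "{%r23$4,%r23$0}"; passing -2 (as the
   pseudo declaration loop in nvptx_declare_function_name does) prints
   the same pair without braces.  */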
386 /* Emit forking instructions for MASK. */
388 static void
389 nvptx_emit_forking (unsigned mask, bool is_call)
391 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
392 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
393 if (mask)
395 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
397 /* Emit fork at all levels. This helps form SESE regions, as
398 it creates a block with a single successor before entering a
399 partitioned region. That is a good candidate for the end of
400 an SESE region. */
401 if (!is_call)
402 emit_insn (gen_nvptx_fork (op));
403 emit_insn (gen_nvptx_forked (op));
407 /* Emit joining instructions for MASK. */
409 static void
410 nvptx_emit_joining (unsigned mask, bool is_call)
412 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
413 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
414 if (mask)
416 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
418 /* Emit joining for all non-call pars to ensure there's a single
419 predecessor for the block the join insn ends up in. This is
420 needed for skipping entire loops. */
421 if (!is_call)
422 emit_insn (gen_nvptx_joining (op));
423 emit_insn (gen_nvptx_join (op));
428 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
429 returned in memory. Integer and floating types supported by the
430 machine are passed in registers, everything else is passed in
431 memory. Complex types are split. */
433 static bool
434 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
436 if (type)
438 if (AGGREGATE_TYPE_P (type))
439 return true;
440 if (TREE_CODE (type) == VECTOR_TYPE)
441 return true;
444 if (!for_return && COMPLEX_MODE_P (mode))
445 /* Complex types are passed as two underlying args. */
446 mode = GET_MODE_INNER (mode);
448 if (GET_MODE_CLASS (mode) != MODE_INT
449 && GET_MODE_CLASS (mode) != MODE_FLOAT)
450 return true;
452 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
453 return true;
455 return false;
458 /* A non-memory argument of mode MODE is being passed, determine the mode it
459 should be promoted to. This is also used for determining return
460 type promotion. */
462 static machine_mode
463 promote_arg (machine_mode mode, bool prototyped)
465 if (!prototyped && mode == SFmode)
466 /* K&R float promotion for unprototyped functions. */
467 mode = DFmode;
468 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
469 mode = SImode;
471 return mode;
474 /* A non-memory return type of MODE is being returned. Determine the
475 mode it should be promoted to. */
477 static machine_mode
478 promote_return (machine_mode mode)
480 return promote_arg (mode, true);
483 /* Implement TARGET_FUNCTION_ARG. */
485 static rtx
486 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
487 const_tree, bool named)
489 if (mode == VOIDmode || !named)
490 return NULL_RTX;
492 return gen_reg_rtx (mode);
495 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
497 static rtx
498 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
499 const_tree, bool named)
501 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
503 if (mode == VOIDmode || !named)
504 return NULL_RTX;
506 /* No need to deal with split modes here, the only case that can
507 happen is complex modes and those are dealt with by
508 TARGET_SPLIT_COMPLEX_ARG. */
509 return gen_rtx_UNSPEC (mode,
510 gen_rtvec (1, GEN_INT (cum->count)),
511 UNSPEC_ARG_REG);
514 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
516 static void
517 nvptx_function_arg_advance (cumulative_args_t cum_v,
518 machine_mode ARG_UNUSED (mode),
519 const_tree ARG_UNUSED (type),
520 bool ARG_UNUSED (named))
522 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
524 cum->count++;
527 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
529 For nvptx, this is only used for variadic args. The type has already
530 been promoted and/or converted to invisible reference. */
532 static unsigned
533 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
535 return GET_MODE_ALIGNMENT (mode);
538 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
540 For nvptx, we know how to handle functions declared as stdarg: by
541 passing an extra pointer to the unnamed arguments. However, the
542 Fortran frontend can produce a different situation, where a
543 function pointer is declared with no arguments, but the actual
544 function and calls to it take more arguments. In that case, we
545 want to ensure the call matches the definition of the function. */
547 static bool
548 nvptx_strict_argument_naming (cumulative_args_t cum_v)
550 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
552 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
555 /* Implement TARGET_LIBCALL_VALUE. */
557 static rtx
558 nvptx_libcall_value (machine_mode mode, const_rtx)
560 if (!cfun || !cfun->machine->doing_call)
561 /* Pretend to return in a hard reg for early uses before pseudos can be
562 generated. */
563 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
565 return gen_reg_rtx (mode);
568 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
569 where function FUNC returns or receives a value of data type TYPE. */
571 static rtx
572 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
573 bool outgoing)
575 machine_mode mode = promote_return (TYPE_MODE (type));
577 if (outgoing)
579 gcc_assert (cfun);
580 cfun->machine->return_mode = mode;
581 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
584 return nvptx_libcall_value (mode, NULL_RTX);
587 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
589 static bool
590 nvptx_function_value_regno_p (const unsigned int regno)
592 return regno == NVPTX_RETURN_REGNUM;
595 /* Types with a mode other than those supported by the machine are passed by
596 reference in memory. */
598 static bool
599 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
600 machine_mode mode, const_tree type,
601 bool ARG_UNUSED (named))
603 return pass_in_memory (mode, type, false);
606 /* Implement TARGET_RETURN_IN_MEMORY. */
608 static bool
609 nvptx_return_in_memory (const_tree type, const_tree)
611 return pass_in_memory (TYPE_MODE (type), type, true);
614 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
616 static machine_mode
617 nvptx_promote_function_mode (const_tree type, machine_mode mode,
618 int *ARG_UNUSED (punsignedp),
619 const_tree funtype, int for_return)
621 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
624 /* Helper for write_arg. Emit a single PTX argument of MODE, either
625 in a prototype, or as copy in a function prologue. ARGNO is the
626 index of this argument in the PTX function. FOR_REG is negative,
627 if we're emitting the PTX prototype. It is zero if we're copying
628 to an argument register and it is greater than zero if we're
629 copying to a specific hard register. */
631 static int
632 write_arg_mode (std::stringstream &s, int for_reg, int argno,
633 machine_mode mode)
635 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
637 if (for_reg < 0)
639 /* Writing PTX prototype. */
640 s << (argno ? ", " : " (");
641 s << ".param" << ptx_type << " %in_ar" << argno;
643 else
645 s << "\t.reg" << ptx_type << " ";
646 if (for_reg)
647 s << reg_names[for_reg];
648 else
649 s << "%ar" << argno;
650 s << ";\n";
651 if (argno >= 0)
653 s << "\tld.param" << ptx_type << " ";
654 if (for_reg)
655 s << reg_names[for_reg];
656 else
657 s << "%ar" << argno;
658 s << ", [%in_ar" << argno << "];\n";
661 return argno + 1;
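/* Example (illustrative) for a first SImode argument: in a prototype
   (FOR_REG < 0) this appends " (.param.u32 %in_ar0", while copying
   to an argument register in a prologue (FOR_REG == 0) emits

     .reg.u32 %ar0;
     ld.param.u32 %ar0, [%in_ar0];
*/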
664 /* Process function parameter TYPE to emit one or more PTX
665 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
666 is true, if this is a prototyped function, rather than an old-style
667 C declaration. Returns the next argument number to use.
669 The promotion behavior here must match the regular GCC function
670 parameter marshalling machinery. */
672 static int
673 write_arg_type (std::stringstream &s, int for_reg, int argno,
674 tree type, bool prototyped)
676 machine_mode mode = TYPE_MODE (type);
678 if (mode == VOIDmode)
679 return argno;
681 if (pass_in_memory (mode, type, false))
682 mode = Pmode;
683 else
685 bool split = TREE_CODE (type) == COMPLEX_TYPE;
687 if (split)
689 /* Complex types are sent as two separate args. */
690 type = TREE_TYPE (type);
691 mode = TYPE_MODE (type);
692 prototyped = true;
695 mode = promote_arg (mode, prototyped);
696 if (split)
697 argno = write_arg_mode (s, for_reg, argno, mode);
700 return write_arg_mode (s, for_reg, argno, mode);
703 /* Emit a PTX return as a prototype or function prologue declaration
704 for MODE. */
706 static void
707 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
709 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
710 const char *pfx = "\t.reg";
711 const char *sfx = ";\n";
713 if (for_proto)
714 pfx = "(.param", sfx = "_out) ";
716 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
719 /* Process a function return TYPE to emit a PTX return as a prototype
720 or function prologue declaration. Returns true if return is via an
721 additional pointer parameter. The promotion behavior here must
722 match the regular GCC function return marshalling. */
724 static bool
725 write_return_type (std::stringstream &s, bool for_proto, tree type)
727 machine_mode mode = TYPE_MODE (type);
729 if (mode == VOIDmode)
730 return false;
732 bool return_in_mem = pass_in_memory (mode, type, true);
734 if (return_in_mem)
736 if (for_proto)
737 return return_in_mem;
739 /* Named return values can cause us to return a pointer as well
740 as expect an argument for the return location. This is
741 optimization-level specific, so no caller can make use of
742 this data, but more importantly for us, we must ensure it
743 doesn't change the PTX prototype. */
744 mode = (machine_mode) cfun->machine->return_mode;
746 if (mode == VOIDmode)
747 return return_in_mem;
749 /* Clear return_mode to inhibit copy of retval to non-existent
750 retval parameter. */
751 cfun->machine->return_mode = VOIDmode;
753 else
754 mode = promote_return (mode);
756 write_return_mode (s, for_proto, mode);
758 return return_in_mem;
761 /* Look for attributes in ATTRS that would indicate we must write a function
762 as a .entry kernel rather than a .func. Return true if one is found. */
764 static bool
765 write_as_kernel (tree attrs)
767 return (lookup_attribute ("kernel", attrs) != NULL_TREE
768 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
769 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
770 /* For OpenMP target regions, the corresponding kernel entry is emitted from
771 write_omp_entry as a separate function. */
774 /* Emit a linker marker for a function decl or defn. */
776 static void
777 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
778 const char *name)
780 s << "\n// BEGIN";
781 if (globalize)
782 s << " GLOBAL";
783 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
784 s << name << "\n";
787 /* Emit a linker marker for a variable decl or defn. */
789 static void
790 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
792 fprintf (file, "\n// BEGIN%s VAR %s: ",
793 globalize ? " GLOBAL" : "",
794 is_defn ? "DEF" : "DECL");
795 assemble_name_raw (file, name);
796 fputs ("\n", file);
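/* E.g. the definition of a public variable "foo" is preceded by
   "// BEGIN GLOBAL VAR DEF: foo" (name illustrative); such markers
   are what the ld helper uses to find object boundaries in the PTX
   stream.  */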
799 /* Write a .func or .kernel declaration or definition along with
800 a helper comment for use by ld. S is the stream to write to, DECL
801 the decl for the function with name NAME. For definitions, emit
802 a declaration too. */
804 static const char *
805 write_fn_proto (std::stringstream &s, bool is_defn,
806 const char *name, const_tree decl)
808 if (is_defn)
809 /* Emit a declaration. The PTX assembler gets upset without it. */
810 name = write_fn_proto (s, false, name, decl);
811 else
813 /* Avoid repeating the name replacement. */
814 name = nvptx_name_replacement (name);
815 if (name[0] == '*')
816 name++;
819 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
821 /* PTX declaration. */
822 if (DECL_EXTERNAL (decl))
823 s << ".extern ";
824 else if (TREE_PUBLIC (decl))
825 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
826 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
828 tree fntype = TREE_TYPE (decl);
829 tree result_type = TREE_TYPE (fntype);
831 /* atomic_compare_exchange_$n builtins have an exceptional calling
832 convention. */
833 int not_atomic_weak_arg = -1;
834 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
835 switch (DECL_FUNCTION_CODE (decl))
837 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
838 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
839 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
840 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
841 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
842 /* These atomics skip the 'weak' parm in an actual library
843 call. We must skip it in the prototype too. */
844 not_atomic_weak_arg = 3;
845 break;
847 default:
848 break;
851 /* Declare the result. */
852 bool return_in_mem = write_return_type (s, true, result_type);
854 s << name;
856 int argno = 0;
858 /* Emit argument list. */
859 if (return_in_mem)
860 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
862 /* We get:
863 NULL in TYPE_ARG_TYPES, for old-style functions
864 NULL in DECL_ARGUMENTS, for builtin functions without another
865 declaration.
866 So we have to pick the best one we have. */
867 tree args = TYPE_ARG_TYPES (fntype);
868 bool prototyped = true;
869 if (!args)
871 args = DECL_ARGUMENTS (decl);
872 prototyped = false;
875 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
877 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
879 if (not_atomic_weak_arg)
880 argno = write_arg_type (s, -1, argno, type, prototyped);
881 else
882 gcc_assert (type == boolean_type_node);
885 if (stdarg_p (fntype))
886 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
888 if (DECL_STATIC_CHAIN (decl))
889 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
891 if (!argno && strcmp (name, "main") == 0)
893 argno = write_arg_type (s, -1, argno, integer_type_node, true);
894 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
897 if (argno)
898 s << ")";
900 s << (is_defn ? "\n" : ";\n");
902 return name;
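/* Putting the pieces together, a public "int foo (int)" would get a
   declaration along these lines (illustrative, assuming the usual
   "%value" name for NVPTX_RETURN_REGNUM):

     // BEGIN GLOBAL FUNCTION DECL: foo
     .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);
*/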
905 /* Construct a function declaration from a call insn. This can be
906 necessary for two reasons - either we have an indirect call which
907 requires a .callprototype declaration, or we have a libcall
908 generated by emit_library_call for which no decl exists. */
910 static void
911 write_fn_proto_from_insn (std::stringstream &s, const char *name,
912 rtx result, rtx pat)
914 if (!name)
916 s << "\t.callprototype ";
917 name = "_";
919 else
921 name = nvptx_name_replacement (name);
922 write_fn_marker (s, false, true, name);
923 s << "\t.extern .func ";
926 if (result != NULL_RTX)
927 write_return_mode (s, true, GET_MODE (result));
929 s << name;
931 int arg_end = XVECLEN (pat, 0);
932 for (int i = 1; i < arg_end; i++)
934 /* We don't have to deal with mode splitting & promotion here,
935 as that was already done when generating the call
936 sequence. */
937 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
939 write_arg_mode (s, -1, i - 1, mode);
941 if (arg_end != 1)
942 s << ")";
943 s << ";\n";
946 /* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
947 table and write a ptx prototype. These are emitted at end of
948 compilation. */
950 static void
951 nvptx_record_fndecl (tree decl)
953 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
954 if (*slot == NULL)
956 *slot = decl;
957 const char *name = get_fnname_from_decl (decl);
958 write_fn_proto (func_decls, false, name, decl);
962 /* Record a libcall or unprototyped external function. CALLEE is the
963 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
964 declaration for it. */
966 static void
967 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
969 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
970 if (*slot == NULL)
972 *slot = callee;
974 const char *name = XSTR (callee, 0);
975 write_fn_proto_from_insn (func_decls, name, retval, pat);
979 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
980 is prototyped, record it now. Otherwise record it as needed at end
981 of compilation, when we might have more information about it. */
983 void
984 nvptx_record_needed_fndecl (tree decl)
986 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
988 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
989 if (*slot == NULL)
990 *slot = decl;
992 else
993 nvptx_record_fndecl (decl);
996 /* SYM is a SYMBOL_REF. If it refers to an external function, record
997 it as needed. */
999 static void
1000 nvptx_maybe_record_fnsym (rtx sym)
1002 tree decl = SYMBOL_REF_DECL (sym);
1004 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1005 nvptx_record_needed_fndecl (decl);
1008 /* Emit a local array to hold some part of a conventional stack frame
1009 and initialize REGNO to point to it. If the size is zero, it'll
1010 never be valid to dereference, so we can simply initialize to
1011 zero. */
1013 static void
1014 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1016 if (size)
1017 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1018 align, reg_names[regno], size);
1019 fprintf (file, "\t.reg.u%d %s;\n",
1020 POINTER_SIZE, reg_names[regno]);
1021 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1022 : "\tmov.u%d %s, 0;\n"),
1023 POINTER_SIZE, reg_names[regno], reg_names[regno]);
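/* E.g. a 16-byte frame on a 64-bit target (illustrative, with the
   conventional "%frame" register name) produces

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   whereas a zero-size frame gets only "mov.u64 %frame, 0;".  */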
1026 /* Emit soft stack frame setup sequence. */
1028 static void
1029 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1031 /* Maintain 64-bit stack alignment. */
1032 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1033 size = ROUND_UP (size, keep_align);
1034 int bits = POINTER_SIZE;
1035 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1036 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1037 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1038 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1039 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1040 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1041 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1042 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1043 fprintf (file, "\t{\n");
1044 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1045 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1046 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1047 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1048 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1049 bits == 64 ? ".wide" : ".lo", bits / 8);
1050 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1052 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1053 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1055 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1056 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1057 bits, reg_sspprev, reg_sspslot);
1059 /* Initialize %frame = %sspprev - size. */
1060 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1061 bits, reg_frame, reg_sspprev, size);
1063 /* Apply alignment, if larger than 64. */
1064 if (alignment > keep_align)
1065 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1066 bits, reg_frame, reg_frame, -alignment);
1068 size = crtl->outgoing_args_size;
1069 gcc_assert (size % keep_align == 0);
1071 /* Initialize %stack. */
1072 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1073 bits, reg_stack, reg_frame, size);
1075 if (!crtl->is_leaf)
1076 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1077 bits, reg_sspslot, reg_stack);
1078 fprintf (file, "\t}\n");
1079 cfun->machine->has_softstack = true;
1080 need_softstack_decl = true;
1083 /* Emit code to initialize the REGNO predicate register to indicate
1084 whether we are not lane zero on the NAME axis. */
1086 static void
1087 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1089 fprintf (file, "\t{\n");
1090 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1091 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1092 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1093 fprintf (file, "\t}\n");
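/* For the "x" axis and a hypothetical predicate register 42 this
   emits (illustrative):

     {
       .reg.u32 %x;
       mov.u32 %x, %tid.x;
       setp.ne.u32 %r42, %x, 0;
     }

   leaving %r42 true in every thread that is not lane zero on x.  */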
1096 /* Emit code to initialize predicate and master lane index registers for
1097 -muniform-simt code generation variant. */
1099 static void
1100 nvptx_init_unisimt_predicate (FILE *file)
1102 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1103 int loc = REGNO (cfun->machine->unisimt_location);
1104 int bits = POINTER_SIZE;
1105 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1106 fprintf (file, "\t{\n");
1107 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1108 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1109 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1110 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1111 bits == 64 ? ".wide" : ".lo");
1112 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1113 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1114 if (cfun->machine->unisimt_predicate)
1116 int master = REGNO (cfun->machine->unisimt_master);
1117 int pred = REGNO (cfun->machine->unisimt_predicate);
1118 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1119 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1120 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1121 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1122 /* Compute predicate as 'tid.x == master'. */
1123 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1125 fprintf (file, "\t}\n");
1126 need_unisimt_decl = true;
1129 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1131 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1132 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1134 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1135 __nvptx_uni[tid.y] = 0;
1136 gomp_nvptx_main (ORIG, arg);
1138 ORIG itself should not be emitted as a PTX .entry function. */
1140 static void
1141 write_omp_entry (FILE *file, const char *name, const char *orig)
1143 static bool gomp_nvptx_main_declared;
1144 if (!gomp_nvptx_main_declared)
1146 gomp_nvptx_main_declared = true;
1147 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1148 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1149 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1151 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1152 #define NTID_Y "%ntid.y"
1153 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1154 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1155 {\n\
1156 .reg.u32 %r<3>;\n\
1157 .reg.u" PS " %R<4>;\n\
1158 mov.u32 %r0, %tid.y;\n\
1159 mov.u32 %r1, " NTID_Y ";\n\
1160 mov.u32 %r2, %ctaid.x;\n\
1161 cvt.u" PS ".u32 %R1, %r0;\n\
1162 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1163 mov.u" PS " %R0, __nvptx_stacks;\n\
1164 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1165 ld.param.u" PS " %R2, [%stack];\n\
1166 ld.param.u" PS " %R3, [%sz];\n\
1167 add.u" PS " %R2, %R2, %R3;\n\
1168 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1169 st.shared.u" PS " [%R0], %R2;\n\
1170 mov.u" PS " %R0, __nvptx_uni;\n\
1171 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1172 mov.u32 %r0, 0;\n\
1173 st.shared.u32 [%R0], %r0;\n\
1174 mov.u" PS " %R0, \0;\n\
1175 ld.param.u" PS " %R1, [%arg];\n\
1176 {\n\
1177 .param.u" PS " %P<2>;\n\
1178 st.param.u" PS " [%P0], %R0;\n\
1179 st.param.u" PS " [%P1], %R1;\n\
1180 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1181 }\n\
1182 ret.uni;\n\
1183 }\n"
1184 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1185 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1186 #undef ENTRY_TEMPLATE
1187 #undef NTID_Y
1188 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1189 /* Position ENTRY_2 after the embedded nul using strlen of the prefix;
the trailing space in entry32's "mad.lo.u32 " makes both prefixes the
same length, so strlen (entry64) is correct for either template. */
1190 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1191 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1192 need_softstack_decl = need_unisimt_decl = true;
1195 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1196 function, including local var decls and copies from the arguments to
1197 local regs. */
1199 void
1200 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1202 tree fntype = TREE_TYPE (decl);
1203 tree result_type = TREE_TYPE (fntype);
1204 int argno = 0;
1206 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1207 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1209 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1210 sprintf (buf, "%s$impl", name);
1211 write_omp_entry (file, name, buf);
1212 name = buf;
1214 /* We construct the initial part of the function into a string
1215 stream, in order to share the prototype writing code. */
1216 std::stringstream s;
1217 write_fn_proto (s, true, name, decl);
1218 s << "{\n";
1220 bool return_in_mem = write_return_type (s, false, result_type);
1221 if (return_in_mem)
1222 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1224 /* Declare and initialize incoming arguments. */
1225 tree args = TYPE_ARG_TYPES (fntype);
1226 bool prototyped = true;
1227 if (!args)
1229 args = DECL_ARGUMENTS (decl);
1230 prototyped = false;
1233 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1235 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1237 argno = write_arg_type (s, 0, argno, type, prototyped);
1240 if (stdarg_p (fntype))
1241 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1242 true);
1244 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1245 write_arg_type (s, STATIC_CHAIN_REGNUM,
1246 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1247 true);
1249 fprintf (file, "%s", s.str().c_str());
1251 /* Usually 'crtl->is_leaf' is computed during register allocator
1252 initialization (which is not done on NVPTX) or for pressure-sensitive
1253 optimizations. Initialize it here, except if already set. */
1254 if (!crtl->is_leaf)
1255 crtl->is_leaf = leaf_function_p ();
1257 HOST_WIDE_INT sz = get_frame_size ();
1258 bool need_frameptr = sz || cfun->machine->has_chain;
1259 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1260 if (!TARGET_SOFT_STACK)
1262 /* Declare a local var for outgoing varargs. */
1263 if (cfun->machine->has_varadic)
1264 init_frame (file, STACK_POINTER_REGNUM,
1265 UNITS_PER_WORD, crtl->outgoing_args_size);
1267 /* Declare a local variable for the frame. Force its size to be
1268 DImode-compatible. */
1269 if (need_frameptr)
1270 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1271 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1273 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1274 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1275 init_softstack_frame (file, alignment, sz);
1277 if (cfun->machine->has_simtreg)
1279 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1280 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1281 align = MAX (align, GET_MODE_SIZE (DImode));
1282 if (!crtl->is_leaf || cfun->calls_alloca)
1283 simtsz = HOST_WIDE_INT_M1U;
1284 if (simtsz == HOST_WIDE_INT_M1U)
1285 simtsz = nvptx_softstack_size;
1286 if (cfun->machine->has_softstack)
1287 simtsz += POINTER_SIZE / 8;
1288 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1289 if (align > GET_MODE_SIZE (DImode))
1290 simtsz += align - GET_MODE_SIZE (DImode);
1291 if (simtsz)
1292 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1293 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1295 /* Declare the pseudos we have as ptx registers. */
1296 int maxregs = max_reg_num ();
1297 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1299 if (regno_reg_rtx[i] != const0_rtx)
1301 machine_mode mode = PSEUDO_REGNO_MODE (i);
1302 machine_mode split = maybe_split_mode (mode);
1304 if (split_mode_p (mode))
1305 mode = split;
1306 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1307 output_reg (file, i, split, -2);
1308 fprintf (file, ";\n");
1312 /* Emit axis predicates. */
1313 if (cfun->machine->axis_predicate[0])
1314 nvptx_init_axis_predicate (file,
1315 REGNO (cfun->machine->axis_predicate[0]), "y");
1316 if (cfun->machine->axis_predicate[1])
1317 nvptx_init_axis_predicate (file,
1318 REGNO (cfun->machine->axis_predicate[1]), "x");
1319 if (cfun->machine->unisimt_predicate
1320 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1321 nvptx_init_unisimt_predicate (file);
1324 /* Output code for switching uniform-simt state. ENTERING indicates whether
1325 we are entering or leaving non-uniform execution region. */
1327 static void
1328 nvptx_output_unisimt_switch (FILE *file, bool entering)
1330 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1331 return;
1332 fprintf (file, "\t{\n");
1333 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1334 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1335 if (!crtl->is_leaf)
1337 int loc = REGNO (cfun->machine->unisimt_location);
1338 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1340 if (cfun->machine->unisimt_predicate)
1342 int master = REGNO (cfun->machine->unisimt_master);
1343 int pred = REGNO (cfun->machine->unisimt_predicate);
1344 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1345 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1346 master, entering ? "%ustmp2" : "0");
1347 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1349 fprintf (file, "\t}\n");
1352 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1353 ENTERING indicates whether we are entering or leaving non-uniform execution.
1354 PTR is the register pointing to allocated storage, it is assigned to on
1355 entering and used to restore state on leaving. SIZE and ALIGN are used only
1356 on entering. */
1358 static void
1359 nvptx_output_softstack_switch (FILE *file, bool entering,
1360 rtx ptr, rtx size, rtx align)
1362 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1363 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1364 return;
1365 int bits = POINTER_SIZE, regno = REGNO (ptr);
1366 fprintf (file, "\t{\n");
1367 if (entering)
1369 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1370 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1371 cfun->machine->simt_stack_size);
1372 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1373 if (CONST_INT_P (size))
1374 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1375 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1376 else
1377 output_reg (file, REGNO (size), VOIDmode);
1378 fputs (";\n", file);
1379 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1380 fprintf (file,
1381 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1382 bits, regno, regno, UINTVAL (align));
1384 if (cfun->machine->has_softstack)
1386 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1387 if (entering)
1389 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1390 bits, regno, bits / 8, reg_stack);
1391 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1392 bits, reg_stack, regno, bits / 8);
1394 else
1396 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1397 bits, reg_stack, regno, bits / 8);
1399 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1401 fprintf (file, "\t}\n");
1404 /* Output code to enter non-uniform execution region. DEST is a register
1405 to hold a per-lane allocation given by SIZE and ALIGN. */
1407 const char *
1408 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1410 nvptx_output_unisimt_switch (asm_out_file, true);
1411 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1412 return "";
1415 /* Output code to leave non-uniform execution region. SRC is the register
1416 holding per-lane storage previously allocated by omp_simt_enter insn. */
1418 const char *
1419 nvptx_output_simt_exit (rtx src)
1421 nvptx_output_unisimt_switch (asm_out_file, false);
1422 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1423 return "";
1426 /* Output instruction that sets soft stack pointer in shared memory to the
1427 value in register given by SRC_REGNO. */
1429 const char *
1430 nvptx_output_set_softstack (unsigned src_regno)
1432 if (cfun->machine->has_softstack && !crtl->is_leaf)
1434 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1435 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1436 output_reg (asm_out_file, src_regno, VOIDmode);
1437 fprintf (asm_out_file, ";\n");
1439 return "";
1441 /* Output a return instruction. Also copy the return value to its outgoing
1442 location. */
1444 const char *
1445 nvptx_output_return (void)
1447 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1449 if (mode != VOIDmode)
1450 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1451 nvptx_ptx_type_from_mode (mode, false),
1452 reg_names[NVPTX_RETURN_REGNUM],
1453 reg_names[NVPTX_RETURN_REGNUM]);
1455 return "ret;";
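/* E.g. for an SImode return value this emits (illustrative, again
   assuming the "%value" register name):

     st.param.u32 [%value_out], %value;
     ret;
*/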
1458 /* Terminate a function by writing a closing brace to FILE. */
1460 void
1461 nvptx_function_end (FILE *file)
1463 fprintf (file, "}\n");
1466 /* Decide whether we can make a sibling call to a function. For ptx, we
1467 can't. */
1469 static bool
1470 nvptx_function_ok_for_sibcall (tree, tree)
1472 return false;
1475 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1477 static rtx
1478 nvptx_get_drap_rtx (void)
1480 if (TARGET_SOFT_STACK && stack_realign_drap)
1481 return arg_pointer_rtx;
1482 return NULL_RTX;
1485 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1486 argument to the next call. */
1488 static void
1489 nvptx_call_args (rtx arg, tree fntype)
1491 if (!cfun->machine->doing_call)
1493 cfun->machine->doing_call = true;
1494 cfun->machine->is_varadic = false;
1495 cfun->machine->num_args = 0;
1497 if (fntype && stdarg_p (fntype))
1499 cfun->machine->is_varadic = true;
1500 cfun->machine->has_varadic = true;
1501 cfun->machine->num_args++;
1505 if (REG_P (arg) && arg != pc_rtx)
1507 cfun->machine->num_args++;
1508 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1509 cfun->machine->call_args);
1513 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1514 information we recorded. */
1516 static void
1517 nvptx_end_call_args (void)
1519 cfun->machine->doing_call = false;
1520 free_EXPR_LIST_list (&cfun->machine->call_args);
1523 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1524 track of whether calls involving static chains or varargs were seen
1525 in the current function.
1526 For libcalls, maintain a hash table of decls we have seen, and
1527 record a function decl for later when encountering a new one. */
1529 void
1530 nvptx_expand_call (rtx retval, rtx address)
1532 rtx callee = XEXP (address, 0);
1533 rtx varargs = NULL_RTX;
1534 unsigned parallel = 0;
1536 if (!call_insn_operand (callee, Pmode))
1538 callee = force_reg (Pmode, callee);
1539 address = change_address (address, QImode, callee);
1542 if (GET_CODE (callee) == SYMBOL_REF)
1544 tree decl = SYMBOL_REF_DECL (callee);
1545 if (decl != NULL_TREE)
1547 if (DECL_STATIC_CHAIN (decl))
1548 cfun->machine->has_chain = true;
1550 tree attr = oacc_get_fn_attrib (decl);
1551 if (attr)
1553 tree dims = TREE_VALUE (attr);
1555 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1556 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1558 if (TREE_PURPOSE (dims)
1559 && !integer_zerop (TREE_PURPOSE (dims)))
1560 break;
1561 /* Not on this axis. */
1562 parallel ^= GOMP_DIM_MASK (ix);
1563 dims = TREE_CHAIN (dims);
1569 unsigned nargs = cfun->machine->num_args;
1570 if (cfun->machine->is_varadic)
1572 varargs = gen_reg_rtx (Pmode);
1573 emit_move_insn (varargs, stack_pointer_rtx);
1576 rtvec vec = rtvec_alloc (nargs + 1);
1577 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1578 int vec_pos = 0;
1580 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1581 rtx tmp_retval = retval;
1582 if (retval)
1584 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1585 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1586 call = gen_rtx_SET (tmp_retval, call);
1588 XVECEXP (pat, 0, vec_pos++) = call;
1590 /* Construct the call insn, including a USE for each argument pseudo
1591 register. These will be used when printing the insn. */
1592 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1593 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1595 if (varargs)
1596 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1598 gcc_assert (vec_pos == XVECLEN (pat, 0));
1600 nvptx_emit_forking (parallel, true);
1601 emit_call_insn (pat);
1602 nvptx_emit_joining (parallel, true);
1604 if (tmp_retval != retval)
1605 emit_move_insn (retval, tmp_retval);
1608 /* Emit a comparison COMPARE, and return the new test to be used in the
1609 jump. */
1611 rtx
1612 nvptx_expand_compare (rtx compare)
1614 rtx pred = gen_reg_rtx (BImode);
1615 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1616 XEXP (compare, 0), XEXP (compare, 1));
1617 emit_insn (gen_rtx_SET (pred, cmp));
1618 return gen_rtx_NE (BImode, pred, const0_rtx);
1621 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1623 void
1624 nvptx_expand_oacc_fork (unsigned mode)
1626 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1629 void
1630 nvptx_expand_oacc_join (unsigned mode)
1632 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1635 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1636 objects. */
1638 static rtx
1639 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1641 rtx res;
1643 switch (GET_MODE (src))
1645 case E_DImode:
1646 res = gen_unpackdisi2 (dst0, dst1, src);
1647 break;
1648 case E_DFmode:
1649 res = gen_unpackdfsi2 (dst0, dst1, src);
1650 break;
1651 default: gcc_unreachable ();
1653 return res;
1656 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1657 object. */
1659 static rtx
1660 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1662 rtx res;
1664 switch (GET_MODE (dst))
1666 case E_DImode:
1667 res = gen_packsidi2 (dst, src0, src1);
1668 break;
1669 case E_DFmode:
1670 res = gen_packsidf2 (dst, src0, src1);
1671 break;
1672 default: gcc_unreachable ();
1674 return res;
1677 /* Generate an instruction or sequence to shuffle SRC into DST
1678 across the lanes of a single warp, according to IDX and KIND. */
1680 rtx
1681 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1683 rtx res;
1685 switch (GET_MODE (dst))
1687 case E_SImode:
1688 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1689 break;
1690 case E_SFmode:
1691 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1692 break;
1693 case E_DImode:
1694 case E_DFmode:
1696 rtx tmp0 = gen_reg_rtx (SImode);
1697 rtx tmp1 = gen_reg_rtx (SImode);
1699 start_sequence ();
1700 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1701 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1702 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1703 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1704 res = get_insns ();
1705 end_sequence ();
1707 break;
1708 case E_BImode:
1710 rtx tmp = gen_reg_rtx (SImode);
1712 start_sequence ();
1713 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1714 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1715 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1716 res = get_insns ();
1717 end_sequence ();
1719 break;
1720 case E_QImode:
1721 case E_HImode:
1723 rtx tmp = gen_reg_rtx (SImode);
1725 start_sequence ();
1726 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1727 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1728 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1729 tmp)));
1730 res = get_insns ();
1731 end_sequence ();
1733 break;
1735 default:
1736 gcc_unreachable ();
1738 return res;
1741 /* Generate an instruction or sequence to broadcast register REG
1742 across the vectors of a single warp. */
1744 static rtx
1745 nvptx_gen_vcast (rtx reg)
1747 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1750 /* Structure used when generating a worker-level spill or fill. */
1752 struct wcast_data_t
1754 rtx base; /* Register holding base addr of buffer. */
1755 rtx ptr; /* Iteration var, if needed. */
1756 unsigned offset; /* Offset into worker buffer. */
1759 /* Direction of the spill/fill and looping setup/teardown indicator. */
1761 enum propagate_mask
1763 PM_read = 1 << 0,
1764 PM_write = 1 << 1,
1765 PM_loop_begin = 1 << 2,
1766 PM_loop_end = 1 << 3,
1768 PM_read_write = PM_read | PM_write
1771 /* Generate instruction(s) to spill or fill register REG to/from the
1772 worker broadcast array. PM indicates what is to be done, REP
1773 how many loop iterations will be executed (0 for not a loop). */
1775 static rtx
1776 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1778 rtx res;
1779 machine_mode mode = GET_MODE (reg);
1781 switch (mode)
1783 case E_BImode:
1785 rtx tmp = gen_reg_rtx (SImode);
1787 start_sequence ();
1788 if (pm & PM_read)
1789 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1790 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1791 if (pm & PM_write)
1792 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1793 res = get_insns ();
1794 end_sequence ();
1796 break;
1798 default:
1800 rtx addr = data->ptr;
1802 if (!addr)
1804 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1806 if (align > worker_bcast_align)
1807 worker_bcast_align = align;
1808 data->offset = (data->offset + align - 1) & ~(align - 1);
1809 addr = data->base;
1810 if (data->offset)
1811 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1814 addr = gen_rtx_MEM (mode, addr);
1815 if (pm == PM_read)
1816 res = gen_rtx_SET (addr, reg);
1817 else if (pm == PM_write)
1818 res = gen_rtx_SET (reg, addr);
1819 else
1820 gcc_unreachable ();
1822 if (data->ptr)
1824 /* We're using a ptr, increment it. */
1825 start_sequence ();
1827 emit_insn (res);
1828 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1829 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1830 res = get_insns ();
1831 end_sequence ();
1833 else
1834 rep = 1;
1835 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1837 break;
1839 return res;
1842 /* Returns true if X is a valid address for use in a memory reference. */
1844 static bool
1845 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1847 enum rtx_code code = GET_CODE (x);
1849 switch (code)
1851 case REG:
1852 return true;
1854 case PLUS:
1855 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1856 return true;
1857 return false;
1859 case CONST:
1860 case SYMBOL_REF:
1861 case LABEL_REF:
1862 return true;
1864 default:
1865 return false;
1869 /* Machinery to output constant initializers. When beginning an
1870 initializer, we decide on a fragment size (which is visible in ptx
1871 in the type used), and then all initializer data is buffered until
1872 a fragment is filled and ready to be written out. */
1874 static struct
1876 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1877 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1878 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1879 out. */
1880 unsigned size; /* Fragment size to accumulate. */
1881 unsigned offset; /* Offset within current fragment. */
1882 bool started; /* Whether we've output any initializer. */
1883 } init_frag;
1885 /* The current fragment is full, write it out. SYM may provide a
1886 symbolic reference we should output, in which case the fragment
1887 value is the addend. */
1889 static void
1890 output_init_frag (rtx sym)
1892 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1893 unsigned HOST_WIDE_INT val = init_frag.val;
1895 init_frag.started = true;
1896 init_frag.val = 0;
1897 init_frag.offset = 0;
1898 init_frag.remaining--;
1900 if (sym)
1902 bool function = (SYMBOL_REF_DECL (sym)
1903 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1904 if (!function)
1905 fprintf (asm_out_file, "generic(");
1906 output_address (VOIDmode, sym);
1907 if (!function)
1908 fprintf (asm_out_file, ")");
1909 if (val)
1910 fprintf (asm_out_file, " + ");
1913 if (!sym || val)
1914 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1917 /* Add value VAL of size SIZE to the data we're emitting, and keep
1918 writing out chunks as they fill up. */
1920 static void
1921 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1923 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1925 for (unsigned part = 0; size; size -= part)
1927 val >>= part * BITS_PER_UNIT;
1928 part = init_frag.size - init_frag.offset;
1929 if (part > size)
1930 part = size;
1932 unsigned HOST_WIDE_INT partial
1933 = val << (init_frag.offset * BITS_PER_UNIT);
1934 init_frag.val |= partial & init_frag.mask;
1935 init_frag.offset += part;
1937 if (init_frag.offset == init_frag.size)
1938 output_init_frag (NULL);
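/* Worked example (illustrative): with init_frag.size == 4, feeding
   the bytes 'a' 'b' 'c' 'd' one at a time through
   nvptx_assemble_value accumulates the little-endian value
   0x64636261 in init_frag.val and, once the fragment fills, flushes
   it as a single element of the .u32 initializer list.  */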
1942 /* Target hook for assembling integer object X of size SIZE. */
1944 static bool
1945 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1947 HOST_WIDE_INT val = 0;
1949 switch (GET_CODE (x))
1951 default:
1952 /* Let the generic machinery figure it out, usually for a
1953 CONST_WIDE_INT. */
1954 return false;
1956 case CONST_INT:
1957 nvptx_assemble_value (INTVAL (x), size);
1958 break;
1960 case CONST:
1961 x = XEXP (x, 0);
1962 gcc_assert (GET_CODE (x) == PLUS);
1963 val = INTVAL (XEXP (x, 1));
1964 x = XEXP (x, 0);
1965 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1966 /* FALLTHROUGH */
1968 case SYMBOL_REF:
1969 gcc_assert (size == init_frag.size);
1970 if (init_frag.offset)
1971 sorry ("cannot emit unaligned pointers in ptx assembly");
1973 nvptx_maybe_record_fnsym (x);
1974 init_frag.val = val;
1975 output_init_frag (x);
1976 break;
1979 return true;
1982 /* Output SIZE zero bytes. We ignore the FILE argument since the
1983 functions we're calling to perform the output just use
1984 asm_out_file. */
1986 void
1987 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1989 /* Finish the current fragment, if it's started. */
1990 if (init_frag.offset)
1992 unsigned part = init_frag.size - init_frag.offset;
1993 if (part > size)
1994 part = (unsigned) size;
1995 size -= part;
1996 nvptx_assemble_value (0, part);
1999 /* If this skip doesn't terminate the initializer, write as many
2000 remaining pieces as possible directly. */
2001 if (size < init_frag.remaining * init_frag.size)
2003 while (size >= init_frag.size)
2005 size -= init_frag.size;
2006 output_init_frag (NULL_RTX);
2008 if (size)
2009 nvptx_assemble_value (0, size);
2013 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2014 ignore the FILE arg. */
2016 void
2017 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2019 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2020 nvptx_assemble_value (str[i], 1);
2023 /* Emit a PTX variable decl and prepare for emission of its
2024 initializer. NAME is the symbol name and SECTION the PTX data
2025 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2026 The caller has already emitted any indentation and linkage
2027 specifier. It is responsible for any initializer, terminating ;
2028 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2029 this is the opposite way round from what PTX wants! */
2031 static void
2032 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2033 const_tree type, HOST_WIDE_INT size, unsigned align)
2035 while (TREE_CODE (type) == ARRAY_TYPE)
2036 type = TREE_TYPE (type);
2038 if (TREE_CODE (type) == VECTOR_TYPE
2039 || TREE_CODE (type) == COMPLEX_TYPE)
2040 /* Neither vector nor complex types can contain the other. */
2041 type = TREE_TYPE (type);
2043 unsigned elt_size = int_size_in_bytes (type);
2045 /* Largest mode we're prepared to accept. For BLKmode types we
2046 don't know if it'll contain pointer constants, so have to choose
2047 pointer size, otherwise we can choose DImode. */
2048 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2050 elt_size |= GET_MODE_SIZE (elt_mode);
2051 elt_size &= -elt_size; /* Extract LSB set. */
2053 init_frag.size = elt_size;
2054 /* Avoid undefined shift behavior by using '2'. */
2055 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2056 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2057 init_frag.val = 0;
2058 init_frag.offset = 0;
2059 init_frag.started = false;
2060 /* Size might not be a multiple of elt size, if there's an
2061 initialized trailing struct array with smaller type than
2062 elt_size. */
2063 init_frag.remaining = (size + elt_size - 1) / elt_size;
2065 fprintf (file, "%s .align %d .u%d ",
2066 section, align / BITS_PER_UNIT,
2067 elt_size * BITS_PER_UNIT);
2068 assemble_name (file, name);
2070 if (size)
2071 /* We make everything an array, to simplify any initialization
2072 emission. */
2073 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
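/* The fragment size chosen above uses the lowest-set-bit trick:
   x & -x isolates the least significant 1 bit of x.  For example
   (values invented), elt_size == 12 with an 8-byte elt_mode gives
   (12 | 8) == 12 and 12 & -12 == 4, so 4-byte fragments are used --
   the largest power of two dividing both sizes.  */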
2076 /* Called when the initializer for a decl has been completely output through
2077 combinations of the three functions above. */
2079 static void
2080 nvptx_assemble_decl_end (void)
2082 if (init_frag.offset)
2083 /* This can happen with a packed struct with trailing array member. */
2084 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2085 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2088 /* Output an uninitialized common or file-scope variable. */
2090 void
2091 nvptx_output_aligned_decl (FILE *file, const char *name,
2092 const_tree decl, HOST_WIDE_INT size, unsigned align)
2094 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2096 /* If this is public, it is common. The nearest thing we have to
2097 common is weak. */
2098 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2100 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2101 TREE_TYPE (decl), size, align);
2102 nvptx_assemble_decl_end ();
2105 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2106 writing a constant variable EXP with NAME and SIZE and its
2107 initializer to FILE. */
2109 static void
2110 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2111 const_tree exp, HOST_WIDE_INT obj_size)
2113 write_var_marker (file, true, false, name);
2115 fprintf (file, "\t");
2117 tree type = TREE_TYPE (exp);
2118 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2119 TYPE_ALIGN (type));
2122 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2123 a variable DECL with NAME to FILE. */
2125 void
2126 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2128 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2130 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2131 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2133 tree type = TREE_TYPE (decl);
2134 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2135 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2136 type, obj_size, DECL_ALIGN (decl));
2139 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2141 static void
2142 nvptx_globalize_label (FILE *, const char *)
2146 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2147 declaration only for variable DECL with NAME to FILE. */
2149 static void
2150 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2152 /* The middle end can place constant pool decls into the varpool as
2153 undefined. Until that is fixed, catch the problem here. */
2154 if (DECL_IN_CONSTANT_POOL (decl))
2155 return;
2157 /* We support weak definitions, and hence have the right
2158 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2159 if (DECL_WEAK (decl))
2160 error_at (DECL_SOURCE_LOCATION (decl),
2161 "PTX does not support weak declarations"
2162 " (only weak definitions)");
2163 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2165 fprintf (file, "\t.extern ");
2166 tree size = DECL_SIZE_UNIT (decl);
2167 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2168 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2169 DECL_ALIGN (decl));
2170 nvptx_assemble_decl_end ();
2173 /* Output a pattern for a move instruction. */
2175 const char *
2176 nvptx_output_mov_insn (rtx dst, rtx src)
2178 machine_mode dst_mode = GET_MODE (dst);
2179 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2180 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2181 machine_mode src_inner = (GET_CODE (src) == SUBREG
2182 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2184 rtx sym = src;
2185 if (GET_CODE (sym) == CONST)
2186 sym = XEXP (XEXP (sym, 0), 0);
2187 if (SYMBOL_REF_P (sym))
2189 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2190 return "%.\tcvta%D1%t0\t%0, %1;";
2191 nvptx_maybe_record_fnsym (sym);
2194 if (src_inner == dst_inner)
2195 return "%.\tmov%t0\t%0, %1;";
2197 if (CONSTANT_P (src))
2198 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2199 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2200 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2202 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2204 if (GET_MODE_BITSIZE (dst_mode) == 128
2205 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2207 /* mov.b128 is not supported. */
2208 if (dst_inner == V2DImode && src_inner == TImode)
2209 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2210 else if (dst_inner == TImode && src_inner == V2DImode)
2211 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2213 gcc_unreachable ();
2215 return "%.\tmov.b%T0\t%0, %1;";
2218 return "%.\tcvt%t0%t1\t%0, %1;";
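/* Example of the 128-bit special case (register numbers invented):
   moving a V2DImode value into a TImode register cannot use the
   unsupported mov.b128, so two 64-bit moves are emitted instead,
   roughly:

     mov.u64 %r10$0, %r20.x;
     mov.u64 %r10$1, %r20.y;  */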
2221 static void nvptx_print_operand (FILE *, rtx, int);
2223 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2224 involves writing .param declarations and in/out copies into them. For
2225 indirect calls, also write the .callprototype. */
2227 const char *
2228 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2230 char buf[16];
2231 static int labelno;
2232 bool needs_tgt = register_operand (callee, Pmode);
2233 rtx pat = PATTERN (insn);
2234 if (GET_CODE (pat) == COND_EXEC)
2235 pat = COND_EXEC_CODE (pat);
2236 int arg_end = XVECLEN (pat, 0);
2237 tree decl = NULL_TREE;
2239 fprintf (asm_out_file, "\t{\n");
2240 if (result != NULL)
2241 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2242 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2243 reg_names[NVPTX_RETURN_REGNUM]);
2245 /* Ensure we have a ptx declaration in the output if necessary. */
2246 if (GET_CODE (callee) == SYMBOL_REF)
2248 decl = SYMBOL_REF_DECL (callee);
2249 if (!decl
2250 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2251 nvptx_record_libfunc (callee, result, pat);
2252 else if (DECL_EXTERNAL (decl))
2253 nvptx_record_fndecl (decl);
2256 if (needs_tgt)
2258 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2259 labelno++;
2260 ASM_OUTPUT_LABEL (asm_out_file, buf);
2261 std::stringstream s;
2262 write_fn_proto_from_insn (s, NULL, result, pat);
2263 fputs (s.str().c_str(), asm_out_file);
2266 for (int argno = 1; argno < arg_end; argno++)
2268 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2269 machine_mode mode = GET_MODE (t);
2270 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2272 /* Mode splitting has already been done. */
2273 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2274 "\t\tst.param%s [%%out_arg%d], ",
2275 ptx_type, argno, ptx_type, argno);
2276 output_reg (asm_out_file, REGNO (t), VOIDmode);
2277 fprintf (asm_out_file, ";\n");
2280 /* The '.' stands for the call's predicate, if any. */
2281 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2282 fprintf (asm_out_file, "\t\tcall ");
2283 if (result != NULL_RTX)
2284 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2286 if (decl)
2288 const char *name = get_fnname_from_decl (decl);
2289 name = nvptx_name_replacement (name);
2290 assemble_name (asm_out_file, name);
2292 else
2293 output_address (VOIDmode, callee);
2295 const char *open = "(";
2296 for (int argno = 1; argno < arg_end; argno++)
2298 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2299 open = "";
2301 if (decl && DECL_STATIC_CHAIN (decl))
2303 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2304 open = "";
2306 if (!open[0])
2307 fprintf (asm_out_file, ")");
2309 if (needs_tgt)
2311 fprintf (asm_out_file, ", ");
2312 assemble_name (asm_out_file, buf);
2314 fprintf (asm_out_file, ";\n");
2316 if (find_reg_note (insn, REG_NORETURN, NULL))
2318 /* No return functions confuse the PTX JIT, as it doesn't realize
2319 the flow control barrier they imply. It can seg fault if it
2320 encounters what looks like an unexitable loop. Emit a trailing
2321 trap and exit, which it does grok. */
2322 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2323 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2326 if (result)
2328 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2330 if (!rval[0])
2331 /* We must escape the '%' that starts RETURN_REGNUM. */
2332 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2333 reg_names[NVPTX_RETURN_REGNUM]);
2334 return rval;
2337 return "}";
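/* Sketch of the output for a direct call to 'int f (int)' (names and
   register numbers invented):

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), f, (%out_arg1);
		ld.param.u32 %r24, [%value_in];
	}
*/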
2340 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2342 static bool
2343 nvptx_print_operand_punct_valid_p (unsigned char c)
2345 return c == '.' || c == '#';
2348 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2350 static void
2351 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2353 rtx off;
2354 if (GET_CODE (x) == CONST)
2355 x = XEXP (x, 0);
2356 switch (GET_CODE (x))
2358 case PLUS:
2359 off = XEXP (x, 1);
2360 output_address (VOIDmode, XEXP (x, 0));
2361 fprintf (file, "+");
2362 output_address (VOIDmode, off);
2363 break;
2365 case SYMBOL_REF:
2366 case LABEL_REF:
2367 output_addr_const (file, x);
2368 break;
2370 default:
2371 gcc_assert (GET_CODE (x) != MEM);
2372 nvptx_print_operand (file, x, 0);
2373 break;
2377 /* Write assembly language output for the address ADDR to FILE. */
2379 static void
2380 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2382 nvptx_print_address_operand (file, addr, mode);
2385 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2387 Meaning of CODE:
2388 . -- print the predicate for the instruction or an empty string for an
2389 unconditional one.
2390 # -- print a rounding mode for the instruction
2392 A -- print a data area for a MEM
2393 c -- print an opcode suffix for a comparison operator, including a type code
2394 D -- print a data area for a MEM operand
2395 S -- print a shuffle kind specified by CONST_INT
2396 t -- print a type opcode suffix, promoting QImode to 32 bits
2397 T -- print a type size in bits
2398 u -- print a type opcode suffix without promotions. */
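/* Example (illustrative): in the template "%.\tmov%t0\t%0, %1;" with
   unpredicated SImode register operands, '%.' prints nothing, '%t0'
   prints ".u32", and the result is "mov.u32 %r22, %r23;" (register
   numbers invented).  */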
2400 static void
2401 nvptx_print_operand (FILE *file, rtx x, int code)
2403 if (code == '.')
2405 x = current_insn_predicate;
2406 if (x)
2408 fputs ("@", file);
2409 if (GET_CODE (x) == EQ)
2410 fputs ("!", file);
2411 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2413 return;
2415 else if (code == '#')
2417 fputs (".rn", file);
2418 return;
2421 enum rtx_code x_code = GET_CODE (x);
2422 machine_mode mode = GET_MODE (x);
2424 switch (code)
2426 case 'A':
2427 x = XEXP (x, 0);
2428 /* FALLTHROUGH. */
2430 case 'D':
2431 if (GET_CODE (x) == CONST)
2432 x = XEXP (x, 0);
2433 if (GET_CODE (x) == PLUS)
2434 x = XEXP (x, 0);
2436 if (GET_CODE (x) == SYMBOL_REF)
2437 fputs (section_for_sym (x), file);
2438 break;
2440 case 't':
2441 case 'u':
2442 if (x_code == SUBREG)
2444 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2445 if (VECTOR_MODE_P (inner_mode)
2446 && (GET_MODE_SIZE (mode)
2447 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2448 mode = GET_MODE_INNER (inner_mode);
2449 else if (split_mode_p (inner_mode))
2450 mode = maybe_split_mode (inner_mode);
2451 else
2452 mode = inner_mode;
2454 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2455 break;
2457 case 'H':
2458 case 'L':
2460 rtx inner_x = SUBREG_REG (x);
2461 machine_mode inner_mode = GET_MODE (inner_x);
2462 machine_mode split = maybe_split_mode (inner_mode);
2464 output_reg (file, REGNO (inner_x), split,
2465 (code == 'H'
2466 ? GET_MODE_SIZE (inner_mode) / 2
2467 : 0));
2469 break;
2471 case 'S':
2473 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2474 /* Same order as nvptx_shuffle_kind. */
2475 static const char *const kinds[] =
2476 {".up", ".down", ".bfly", ".idx"};
2477 fputs (kinds[kind], file);
2479 break;
2481 case 'T':
2482 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2483 break;
2485 case 'j':
2486 fprintf (file, "@");
2487 goto common;
2489 case 'J':
2490 fprintf (file, "@!");
2491 goto common;
2493 case 'c':
2494 mode = GET_MODE (XEXP (x, 0));
2495 switch (x_code)
2497 case EQ:
2498 fputs (".eq", file);
2499 break;
2500 case NE:
2501 if (FLOAT_MODE_P (mode))
2502 fputs (".neu", file);
2503 else
2504 fputs (".ne", file);
2505 break;
2506 case LE:
2507 case LEU:
2508 fputs (".le", file);
2509 break;
2510 case GE:
2511 case GEU:
2512 fputs (".ge", file);
2513 break;
2514 case LT:
2515 case LTU:
2516 fputs (".lt", file);
2517 break;
2518 case GT:
2519 case GTU:
2520 fputs (".gt", file);
2521 break;
2522 case LTGT:
2523 fputs (".ne", file);
2524 break;
2525 case UNEQ:
2526 fputs (".equ", file);
2527 break;
2528 case UNLE:
2529 fputs (".leu", file);
2530 break;
2531 case UNGE:
2532 fputs (".geu", file);
2533 break;
2534 case UNLT:
2535 fputs (".ltu", file);
2536 break;
2537 case UNGT:
2538 fputs (".gtu", file);
2539 break;
2540 case UNORDERED:
2541 fputs (".nan", file);
2542 break;
2543 case ORDERED:
2544 fputs (".num", file);
2545 break;
2546 default:
2547 gcc_unreachable ();
2549 if (FLOAT_MODE_P (mode)
2550 || x_code == EQ || x_code == NE
2551 || x_code == GEU || x_code == GTU
2552 || x_code == LEU || x_code == LTU)
2553 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2554 else
2555 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2556 break;
2557 default:
2558 common:
2559 switch (x_code)
2561 case SUBREG:
2563 rtx inner_x = SUBREG_REG (x);
2564 machine_mode inner_mode = GET_MODE (inner_x);
2565 machine_mode split = maybe_split_mode (inner_mode);
2567 if (VECTOR_MODE_P (inner_mode)
2568 && (GET_MODE_SIZE (mode)
2569 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2571 output_reg (file, REGNO (inner_x), VOIDmode);
2572 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2574 else if (split_mode_p (inner_mode)
2575 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2576 output_reg (file, REGNO (inner_x), split);
2577 else
2578 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2580 break;
2582 case REG:
2583 output_reg (file, REGNO (x), maybe_split_mode (mode));
2584 break;
2586 case MEM:
2587 fputc ('[', file);
2588 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2589 fputc (']', file);
2590 break;
2592 case CONST_INT:
2593 output_addr_const (file, x);
2594 break;
2596 case CONST:
2597 case SYMBOL_REF:
2598 case LABEL_REF:
2599 /* We could use output_addr_const, but that can print things like
2600 "x-8", which breaks ptxas. Need to ensure it is output as
2601 "x+-8". */
2602 nvptx_print_address_operand (file, x, VOIDmode);
2603 break;
2605 case CONST_DOUBLE:
2606 long vals[2];
2607 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2608 vals[0] &= 0xffffffff;
2609 vals[1] &= 0xffffffff;
2610 if (mode == SFmode)
2611 fprintf (file, "0f%08lx", vals[0]);
2612 else
2613 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2614 break;
2616 case CONST_VECTOR:
2618 unsigned n = CONST_VECTOR_NUNITS (x);
2619 fprintf (file, "{ ");
2620 for (unsigned i = 0; i < n; ++i)
2622 if (i != 0)
2623 fprintf (file, ", ");
2625 rtx elem = CONST_VECTOR_ELT (x, i);
2626 output_addr_const (file, elem);
2628 fprintf (file, " }");
2630 break;
2632 default:
2633 output_addr_const (file, x);
2638 /* Record replacement regs used to deal with subreg operands. */
2639 struct reg_replace
2641 rtx replacement[MAX_RECOG_OPERANDS];
2642 machine_mode mode;
2643 int n_allocated;
2644 int n_in_use;
2647 /* Allocate or reuse a replacement in R and return the rtx. */
2649 static rtx
2650 get_replacement (struct reg_replace *r)
2652 if (r->n_allocated == r->n_in_use)
2653 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2654 return r->replacement[r->n_in_use++];
2657 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2658 the presence of subregs would break the rules for most instructions.
2659 Replace them with a suitable new register of the right size, plus
2660 conversion copyin/copyout instructions. */
2662 static void
2663 nvptx_reorg_subreg (void)
2665 struct reg_replace qiregs, hiregs, siregs, diregs;
2666 rtx_insn *insn, *next;
2668 qiregs.n_allocated = 0;
2669 hiregs.n_allocated = 0;
2670 siregs.n_allocated = 0;
2671 diregs.n_allocated = 0;
2672 qiregs.mode = QImode;
2673 hiregs.mode = HImode;
2674 siregs.mode = SImode;
2675 diregs.mode = DImode;
2677 for (insn = get_insns (); insn; insn = next)
2679 next = NEXT_INSN (insn);
2680 if (!NONDEBUG_INSN_P (insn)
2681 || asm_noperands (PATTERN (insn)) >= 0
2682 || GET_CODE (PATTERN (insn)) == USE
2683 || GET_CODE (PATTERN (insn)) == CLOBBER)
2684 continue;
2686 qiregs.n_in_use = 0;
2687 hiregs.n_in_use = 0;
2688 siregs.n_in_use = 0;
2689 diregs.n_in_use = 0;
2690 extract_insn (insn);
2691 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2693 for (int i = 0; i < recog_data.n_operands; i++)
2695 rtx op = recog_data.operand[i];
2696 if (GET_CODE (op) != SUBREG)
2697 continue;
2699 rtx inner = SUBREG_REG (op);
2701 machine_mode outer_mode = GET_MODE (op);
2702 machine_mode inner_mode = GET_MODE (inner);
2703 gcc_assert (s_ok);
2704 if (s_ok
2705 && (GET_MODE_PRECISION (inner_mode)
2706 >= GET_MODE_PRECISION (outer_mode)))
2707 continue;
2708 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2709 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2710 : outer_mode == HImode ? &hiregs
2711 : outer_mode == SImode ? &siregs
2712 : &diregs);
2713 rtx new_reg = get_replacement (r);
2715 if (recog_data.operand_type[i] != OP_OUT)
2717 enum rtx_code code;
2718 if (GET_MODE_PRECISION (inner_mode)
2719 < GET_MODE_PRECISION (outer_mode))
2720 code = ZERO_EXTEND;
2721 else
2722 code = TRUNCATE;
2724 rtx pat = gen_rtx_SET (new_reg,
2725 gen_rtx_fmt_e (code, outer_mode, inner));
2726 emit_insn_before (pat, insn);
2729 if (recog_data.operand_type[i] != OP_IN)
2731 enum rtx_code code;
2732 if (GET_MODE_PRECISION (inner_mode)
2733 < GET_MODE_PRECISION (outer_mode))
2734 code = TRUNCATE;
2735 else
2736 code = ZERO_EXTEND;
2738 rtx pat = gen_rtx_SET (inner,
2739 gen_rtx_fmt_e (code, inner_mode, new_reg));
2740 emit_insn_after (pat, insn);
2742 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
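/* Illustrative RTL sketch (register numbers invented): an input
   operand (subreg:DI (reg:SI 29) 0) is replaced by a fresh DImode
   register, with a widening copy-in emitted before the insn:

     (set (reg:DI 40) (zero_extend:DI (reg:SI 29)))

   Output operands get the mirror-image truncating copy-out emitted
   after the insn.  */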
2747 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2748 first use. */
2750 static rtx
2751 nvptx_get_unisimt_master ()
2753 rtx &master = cfun->machine->unisimt_master;
2754 return master ? master : master = gen_reg_rtx (SImode);
2757 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2759 static rtx
2760 nvptx_get_unisimt_predicate ()
2762 rtx &pred = cfun->machine->unisimt_predicate;
2763 return pred ? pred : pred = gen_reg_rtx (BImode);
2766 /* Return true if given call insn references one of the functions provided by
2767 the CUDA runtime: malloc, free, vprintf. */
2769 static bool
2770 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2772 rtx pat = PATTERN (insn);
2773 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2774 pat = XVECEXP (pat, 0, 0);
2775 if (GET_CODE (pat) == SET)
2776 pat = SET_SRC (pat);
2777 gcc_checking_assert (GET_CODE (pat) == CALL
2778 && GET_CODE (XEXP (pat, 0)) == MEM);
2779 rtx addr = XEXP (XEXP (pat, 0), 0);
2780 if (GET_CODE (addr) != SYMBOL_REF)
2781 return false;
2782 const char *name = XSTR (addr, 0);
2783 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2784 references with forced assembler name refer to PTX syscalls. For vprintf,
2785 accept both normal and forced-assembler-name references. */
2786 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2787 || !strcmp (name, "*malloc")
2788 || !strcmp (name, "*free"));
2791 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2792 propagate its value from lane MASTER to current lane. */
2794 static void
2795 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2797 rtx reg;
2798 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2799 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2802 /* Adjust code for uniform-simt code generation variant by making atomics and
2803 "syscalls" conditionally executed, and inserting shuffle-based propagation
2804 for registers being set. */
2806 static void
2807 nvptx_reorg_uniform_simt ()
2809 rtx_insn *insn, *next;
2811 for (insn = get_insns (); insn; insn = next)
2813 next = NEXT_INSN (insn);
2814 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2815 && !(NONJUMP_INSN_P (insn)
2816 && GET_CODE (PATTERN (insn)) == PARALLEL
2817 && get_attr_atomic (insn)))
2818 continue;
2819 rtx pat = PATTERN (insn);
2820 rtx master = nvptx_get_unisimt_master ();
2821 for (int i = 0; i < XVECLEN (pat, 0); i++)
2822 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2823 rtx pred = nvptx_get_unisimt_predicate ();
2824 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2825 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2826 validate_change (insn, &PATTERN (insn), pat, false);
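/* Illustrative effect (PTX names invented): a syscall such as

     call (%retval), vprintf, (%fmt, %args);

   becomes conditional on the uniform-simt predicate, with the result
   shuffled from the master lane to the whole warp:

     @ %pred call (%retval), vprintf, (%fmt, %args);
     shfl.idx.b32 %retval, %retval, %master, 31;  */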
2830 /* Loop structure of the function. The entire function is described as
2831 a NULL loop. */
2833 struct parallel
2835 /* Parent parallel. */
2836 parallel *parent;
2838 /* Next sibling parallel. */
2839 parallel *next;
2841 /* First child parallel. */
2842 parallel *inner;
2844 /* Partitioning mask of the parallel. */
2845 unsigned mask;
2847 /* Partitioning used within inner parallels. */
2848 unsigned inner_mask;
2850 /* Location of parallel forked and join. The forked is the first
2851 block in the parallel and the join is the first block after
2852 the partition. */
2853 basic_block forked_block;
2854 basic_block join_block;
2856 rtx_insn *forked_insn;
2857 rtx_insn *join_insn;
2859 rtx_insn *fork_insn;
2860 rtx_insn *joining_insn;
2862 /* Basic blocks in this parallel, but not in child parallels. The
2863 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2864 blocks are not. */
2865 auto_vec<basic_block> blocks;
2867 public:
2868 parallel (parallel *parent, unsigned mode);
2869 ~parallel ();
2872 /* Constructor links the new parallel into its parent's chain of
2873 children. */
2875 parallel::parallel (parallel *parent_, unsigned mask_)
2876 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2878 forked_block = join_block = 0;
2879 forked_insn = join_insn = 0;
2880 fork_insn = joining_insn = 0;
2882 if (parent)
2884 next = parent->inner;
2885 parent->inner = this;
2889 parallel::~parallel ()
2891 delete inner;
2892 delete next;
2895 /* Map of basic blocks to insns. */
2896 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2898 /* A tuple of an insn of interest and the BB in which it resides. */
2899 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2900 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2902 /* Split basic blocks such that each forked and join unspec is at
2903 the start of its basic block. Thus afterwards each block will
2904 have a single partitioning mode. We also do the same for return
2905 insns, as they are executed by every thread. Populate MAP with
2907 head and tail blocks. We also clear the BB visited flag, which is
2908 used when finding partitions. */
2910 static void
2911 nvptx_split_blocks (bb_insn_map_t *map)
2913 insn_bb_vec_t worklist;
2914 basic_block block;
2915 rtx_insn *insn;
2917 /* Locate all the reorg instructions of interest. */
2918 FOR_ALL_BB_FN (block, cfun)
2920 bool seen_insn = false;
2922 /* Clear visited flag, for use by parallel locator. */
2923 block->flags &= ~BB_VISITED;
2925 FOR_BB_INSNS (block, insn)
2927 if (!INSN_P (insn))
2928 continue;
2929 switch (recog_memoized (insn))
2931 default:
2932 seen_insn = true;
2933 continue;
2934 case CODE_FOR_nvptx_forked:
2935 case CODE_FOR_nvptx_join:
2936 break;
2938 case CODE_FOR_return:
2939 /* We also need to split just before return insns, as
2940 that insn needs executing by all threads, but the
2941 block it is in probably does not. */
2942 break;
2945 if (seen_insn)
2946 /* We've found an instruction that must be at the start of
2947 a block, but isn't. Add it to the worklist. */
2948 worklist.safe_push (insn_bb_t (insn, block));
2949 else
2950 /* It was already the first instruction. Just add it to
2951 the map. */
2952 map->get_or_insert (block) = insn;
2953 seen_insn = true;
2957 /* Split blocks on the worklist. */
2958 unsigned ix;
2959 insn_bb_t *elt;
2960 basic_block remap = 0;
2961 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2963 if (remap != elt->second)
2965 block = elt->second;
2966 remap = block;
2969 /* Split block before insn. The insn is in the new block. */
2970 edge e = split_block (block, PREV_INSN (elt->first));
2972 block = e->dest;
2973 map->get_or_insert (block) = elt->first;
2977 /* BLOCK is a basic block containing a head or tail instruction.
2978 Locate the associated prehead or pretail instruction, which must be
2979 in the single predecessor block. */
2981 static rtx_insn *
2982 nvptx_discover_pre (basic_block block, int expected)
2984 gcc_assert (block->preds->length () == 1);
2985 basic_block pre_block = (*block->preds)[0]->src;
2986 rtx_insn *pre_insn;
2988 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2989 pre_insn = PREV_INSN (pre_insn))
2990 gcc_assert (pre_insn != BB_HEAD (pre_block));
2992 gcc_assert (recog_memoized (pre_insn) == expected);
2993 return pre_insn;
2996 /* Dump this parallel and all its inner parallels. */
2998 static void
2999 nvptx_dump_pars (parallel *par, unsigned depth)
3001 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3002 depth, par->mask,
3003 par->forked_block ? par->forked_block->index : -1,
3004 par->join_block ? par->join_block->index : -1);
3006 fprintf (dump_file, " blocks:");
3008 basic_block block;
3009 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3010 fprintf (dump_file, " %d", block->index);
3011 fprintf (dump_file, "\n");
3012 if (par->inner)
3013 nvptx_dump_pars (par->inner, depth + 1);
3015 if (par->next)
3016 nvptx_dump_pars (par->next, depth);
3019 /* If BLOCK contains a fork/join marker, process it to create or
3020 terminate a loop structure. Add this block to the current loop,
3021 and then walk successor blocks. */
3023 static parallel *
3024 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3026 if (block->flags & BB_VISITED)
3027 return par;
3028 block->flags |= BB_VISITED;
3030 if (rtx_insn **endp = map->get (block))
3032 rtx_insn *end = *endp;
3034 /* This is a block head or tail, or return instruction. */
3035 switch (recog_memoized (end))
3037 case CODE_FOR_return:
3038 /* Return instructions are in their own block, and we
3039 don't need to do anything more. */
3040 return par;
3042 case CODE_FOR_nvptx_forked:
3043 /* Loop head, create a new inner loop and add it into
3044 our parent's child list. */
3046 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3048 gcc_assert (mask);
3049 par = new parallel (par, mask);
3050 par->forked_block = block;
3051 par->forked_insn = end;
3052 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3053 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3054 par->fork_insn
3055 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3057 break;
3059 case CODE_FOR_nvptx_join:
3060 /* A loop tail. Finish the current loop and return to
3061 parent. */
3063 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3065 gcc_assert (par->mask == mask);
3066 par->join_block = block;
3067 par->join_insn = end;
3068 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3069 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3070 par->joining_insn
3071 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3072 par = par->parent;
3074 break;
3076 default:
3077 gcc_unreachable ();
3081 if (par)
3082 /* Add this block onto the current loop's list of blocks. */
3083 par->blocks.safe_push (block);
3084 else
3085 /* This must be the entry block. Create a NULL parallel. */
3086 par = new parallel (0, 0);
3088 /* Walk successor blocks. */
3089 edge e;
3090 edge_iterator ei;
3092 FOR_EACH_EDGE (e, ei, block->succs)
3093 nvptx_find_par (map, par, e->dest);
3095 return par;
3098 /* DFS walk the CFG looking for fork & join markers. Construct
3099 loop structures as we go. MAP is a mapping of basic blocks
3100 to head & tail markers, discovered when splitting blocks. This
3101 speeds up the discovery. We rely on the BB visited flag having
3102 been cleared when splitting blocks. */
3104 static parallel *
3105 nvptx_discover_pars (bb_insn_map_t *map)
3107 basic_block block;
3109 /* Mark exit blocks as visited. */
3110 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3111 block->flags |= BB_VISITED;
3113 /* And entry block as not. */
3114 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3115 block->flags &= ~BB_VISITED;
3117 parallel *par = nvptx_find_par (map, 0, block);
3119 if (dump_file)
3121 fprintf (dump_file, "\nLoops\n");
3122 nvptx_dump_pars (par, 0);
3123 fprintf (dump_file, "\n");
3126 return par;
3129 /* Analyse a group of BBs within a partitioned region and create N
3130 Single-Entry-Single-Exit regions. Some of those regions will be
3131 trivial ones consisting of a single BB. The blocks of a
3132 partitioned region might form a set of disjoint graphs -- because
3133 the region encloses a differently partitioned sub-region.
3135 We use the linear time algorithm described in 'Finding Regions Fast:
3136 Single Entry Single Exit and Control Regions in Linear Time'
3137 Johnson, Pearson & Pingali. That algorithm deals with complete
3138 CFGs, where a back edge is inserted from END to START, and thus the
3139 problem becomes one of finding equivalent loops.
3141 In this case we have a partial CFG. We complete it by redirecting
3142 any incoming edge to the graph to be from an arbitrary external BB,
3143 and similarly redirecting any outgoing edge to be to that BB.
3144 Thus we end up with a closed graph.
3146 The algorithm works by building a spanning tree of an undirected
3147 graph and keeping track of back edges from nodes further from the
3148 root in the tree to nodes nearer to the root in the tree. In the
3149 description below, the root is up and the tree grows downwards.
3151 We avoid having to deal with degenerate back-edges to the same
3152 block, by splitting each BB into 3 -- one for input edges, one for
3153 the node itself and one for the output edges. Such back edges are
3154 referred to as 'Brackets'. Cycle equivalent nodes will have the
3155 same set of brackets.
3157 Determining bracket equivalency is done by maintaining a list of
3158 brackets in such a manner that the list length and final bracket
3159 uniquely identify the set.
3161 We use coloring to mark all BBs with cycle equivalency with the
3162 same color. This is the output of the 'Finding Regions Fast'
3163 algorithm. Notice it doesn't actually find the set of nodes within
3164 a particular region, just unordered sets of nodes that are the
3165 entries and exits of SESE regions.
3167 After determining cycle equivalency, we need to find the minimal
3168 set of SESE regions. Do this with a DFS coloring walk of the
3169 complete graph. We're either 'looking' or 'coloring'. When
3170 looking, and we're in the subgraph, we start coloring the color of
3171 the current node, and remember that node as the start of the
3172 current color's SESE region. Every time we go to a new node, we
3173 decrement the count of nodes with that color. If it reaches zero,
3174 we remember that node as the end of the current color's SESE region
3175 and return to 'looking'. Otherwise we color the node the current
3176 color.
3178 This way we end up with coloring the inside of non-trivial SESE
3179 regions with the color of that region. */
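/* Tiny worked example (illustrative): for a diamond

	A
       / \
      B   C
       \ /
	D

   completed with a back edge D->A, the pseudo-nodes around A and D
   carry the same single bracket (that back edge), so A and D receive
   the same color: they are the entry and exit of one SESE region,
   with B and C as its interior.  */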
3181 /* A pair of BBs. We use this to represent SESE regions. */
3182 typedef std::pair<basic_block, basic_block> bb_pair_t;
3183 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3185 /* A node in the undirected CFG. The discriminator SECOND indicates just
3186 above or just below the BB indicated by FIRST. */
3187 typedef std::pair<basic_block, int> pseudo_node_t;
3189 /* A bracket indicates an edge towards the root of the spanning tree of the
3190 undirected graph. Each bracket has a color, determined
3191 from the current set of brackets. */
3192 struct bracket
3194 pseudo_node_t back; /* Back target */
3196 /* Current color and size of set. */
3197 unsigned color;
3198 unsigned size;
3200 bracket (pseudo_node_t back_)
3201 : back (back_), color (~0u), size (~0u)
3205 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3207 if (length != size)
3209 size = length;
3210 color = color_counts.length ();
3211 color_counts.quick_push (0);
3213 color_counts[color]++;
3214 return color;
3218 typedef auto_vec<bracket> bracket_vec_t;
3220 /* Basic block info for finding SESE regions. */
3222 struct bb_sese
3224 int node; /* Node number in spanning tree. */
3225 int parent; /* Parent node number. */
3227 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3228 edges arrive at pseudo-node Ai and the outgoing edges leave at
3229 pseudo-node Ao. We have to remember which way we arrived at a
3230 particular node when generating the spanning tree. dir > 0 means
3231 we arrived at Ai, dir < 0 means we arrived at Ao. */
3232 int dir;
3234 /* Lowest numbered pseudo-node reached via a backedge from this
3235 node, or any descendant. */
3236 pseudo_node_t high;
3238 int color; /* Cycle-equivalence color */
3240 /* Stack of brackets for this node. */
3241 bracket_vec_t brackets;
3243 bb_sese (unsigned node_, unsigned p, int dir_)
3244 :node (node_), parent (p), dir (dir_)
3247 ~bb_sese ();
3249 /* Push a bracket ending at BACK. */
3250 void push (const pseudo_node_t &back)
3252 if (dump_file)
3253 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3254 back.first ? back.first->index : 0, back.second);
3255 brackets.safe_push (bracket (back));
3258 void append (bb_sese *child);
3259 void remove (const pseudo_node_t &);
3261 /* Set node's color. */
3262 void set_color (auto_vec<unsigned> &color_counts)
3264 color = brackets.last ().get_color (color_counts, brackets.length ());
3268 bb_sese::~bb_sese ()
3272 /* Destructively append CHILD's brackets. */
3274 void
3275 bb_sese::append (bb_sese *child)
3277 if (int len = child->brackets.length ())
3279 int ix;
3281 if (dump_file)
3283 for (ix = 0; ix < len; ix++)
3285 const pseudo_node_t &pseudo = child->brackets[ix].back;
3286 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3287 child->node, pseudo.first ? pseudo.first->index : 0,
3288 pseudo.second);
3291 if (!brackets.length ())
3292 std::swap (brackets, child->brackets);
3293 else
3295 brackets.reserve (len);
3296 for (ix = 0; ix < len; ix++)
3297 brackets.quick_push (child->brackets[ix]);
3302 /* Remove brackets that terminate at PSEUDO. */
3304 void
3305 bb_sese::remove (const pseudo_node_t &pseudo)
3307 unsigned removed = 0;
3308 int len = brackets.length ();
3310 for (int ix = 0; ix < len; ix++)
3312 if (brackets[ix].back == pseudo)
3314 if (dump_file)
3315 fprintf (dump_file, "Removing backedge %d:%+d\n",
3316 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3317 removed++;
3319 else if (removed)
3320 brackets[ix-removed] = brackets[ix];
3322 while (removed--)
3323 brackets.pop ();
3326 /* Accessors for BB's aux pointer. */
3327 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3328 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3330 /* DFS walk creating SESE data structures. Only cover nodes with
3331 BB_VISITED set. Append discovered blocks to LIST. We number in
3332 increments of 3 so that the above and below pseudo nodes can be
3333 implicitly numbered too. */
3335 static int
3336 nvptx_sese_number (int n, int p, int dir, basic_block b,
3337 auto_vec<basic_block> *list)
3339 if (BB_GET_SESE (b))
3340 return n;
3342 if (dump_file)
3343 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3344 b->index, n, p, dir);
3346 BB_SET_SESE (b, new bb_sese (n, p, dir));
3347 p = n;
3349 n += 3;
3350 list->quick_push (b);
3352 /* First walk the nodes on the 'other side' of this node, then walk
3353 the nodes on the same side. */
3354 for (unsigned ix = 2; ix; ix--)
3356 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3357 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3358 : offsetof (edge_def, src));
3359 edge e;
3360 edge_iterator (ei);
3362 FOR_EACH_EDGE (e, ei, edges)
3364 basic_block target = *(basic_block *)((char *)e + offset);
3366 if (target->flags & BB_VISITED)
3367 n = nvptx_sese_number (n, p, dir, target, list);
3369 dir = -dir;
3371 return n;
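/* Note on the numbering: stepping N by 3 leaves the values N-1 and
   N+1 free, so the 'above' and 'below' pseudo-nodes of a block can
   be addressed as node +/- dir without allocating numbers for them
   explicitly.  */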
3374 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3375 EDGES are the outgoing edges and OFFSET is the offset to the src
3376 or dst block on the edges. */
3378 static void
3379 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3380 vec<edge, va_gc> *edges, size_t offset)
3382 edge e;
3383 edge_iterator (ei);
3384 int hi_back = depth;
3385 pseudo_node_t node_back (0, depth);
3386 int hi_child = depth;
3387 pseudo_node_t node_child (0, depth);
3388 basic_block child = NULL;
3389 unsigned num_children = 0;
3390 int usd = -dir * sese->dir;
3392 if (dump_file)
3393 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3394 me->index, sese->node, dir);
3396 if (dir < 0)
3398 /* This is the above pseudo-child. It has the BB itself as an
3399 additional child node. */
3400 node_child = sese->high;
3401 hi_child = node_child.second;
3402 if (node_child.first)
3403 hi_child += BB_GET_SESE (node_child.first)->node;
3404 num_children++;
3407 /* Examine each edge.
3408 - if it is a child (a) append its bracket list and (b) record
3409 whether it is the child with the highest reaching bracket.
3410 - if it is an edge to ancestor, record whether it's the highest
3411 reaching backlink. */
3412 FOR_EACH_EDGE (e, ei, edges)
3414 basic_block target = *(basic_block *)((char *)e + offset);
3416 if (bb_sese *t_sese = BB_GET_SESE (target))
3418 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3420 /* Child node. Append its bracket list. */
3421 num_children++;
3422 sese->append (t_sese);
3424 /* Compare its hi value. */
3425 int t_hi = t_sese->high.second;
3427 if (basic_block child_hi_block = t_sese->high.first)
3428 t_hi += BB_GET_SESE (child_hi_block)->node;
3430 if (hi_child > t_hi)
3432 hi_child = t_hi;
3433 node_child = t_sese->high;
3434 child = target;
3437 else if (t_sese->node < sese->node + dir
3438 && !(dir < 0 && sese->parent == t_sese->node))
3440 /* Non-parental ancestor node -- a backlink. */
3441 int d = usd * t_sese->dir;
3442 int back = t_sese->node + d;
3444 if (hi_back > back)
3446 hi_back = back;
3447 node_back = pseudo_node_t (target, d);
3451 else
3452 { /* Fallen off graph, backlink to entry node. */
3453 hi_back = 0;
3454 node_back = pseudo_node_t (0, 0);
3458 /* Remove any brackets that terminate at this pseudo node. */
3459 sese->remove (pseudo_node_t (me, dir));
3461 /* Now push any backlinks from this pseudo node. */
3462 FOR_EACH_EDGE (e, ei, edges)
3464 basic_block target = *(basic_block *)((char *)e + offset);
3465 if (bb_sese *t_sese = BB_GET_SESE (target))
3467 if (t_sese->node < sese->node + dir
3468 && !(dir < 0 && sese->parent == t_sese->node))
3469 /* Non-parental ancestor node - backedge from me. */
3470 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3472 else
3474 /* back edge to entry node */
3475 sese->push (pseudo_node_t (0, 0));
3479 /* If this node leads directly or indirectly to a no-return region of
3480 the graph, then fake a backedge to entry node. */
3481 if (!sese->brackets.length () || !edges || !edges->length ())
3483 hi_back = 0;
3484 node_back = pseudo_node_t (0, 0);
3485 sese->push (node_back);
3488 /* Record the highest reaching backedge from us or a descendant. */
3489 sese->high = hi_back < hi_child ? node_back : node_child;
3491 if (num_children > 1)
3493 /* There is more than one child -- this is a Y shaped piece of
3494 spanning tree. We have to insert a fake backedge from this
3495 node to the highest ancestor reached by not-the-highest
3496 reaching child. Note that there may be multiple children
3497 with backedges to the same highest node. That's ok and we
3498 insert the edge to that highest node. */
3499 hi_child = depth;
3500 if (dir < 0 && child)
3502 node_child = sese->high;
3503 hi_child = node_child.second;
3504 if (node_child.first)
3505 hi_child += BB_GET_SESE (node_child.first)->node;
3508 FOR_EACH_EDGE (e, ei, edges)
3510 basic_block target = *(basic_block *)((char *)e + offset);
3512 if (target == child)
3513 /* Ignore the highest child. */
3514 continue;
3516 bb_sese *t_sese = BB_GET_SESE (target);
3517 if (!t_sese)
3518 continue;
3519 if (t_sese->parent != sese->node)
3520 /* Not a child. */
3521 continue;
3523 /* Compare its hi value. */
3524 int t_hi = t_sese->high.second;
3526 if (basic_block child_hi_block = t_sese->high.first)
3527 t_hi += BB_GET_SESE (child_hi_block)->node;
3529 if (hi_child > t_hi)
3531 hi_child = t_hi;
3532 node_child = t_sese->high;
3536 sese->push (node_child);
3541 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3542 proceed to successors. Set SESE entry and exit nodes of
3543 REGIONS. */
3545 static void
3546 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3547 basic_block block, int coloring)
3549 bb_sese *sese = BB_GET_SESE (block);
3551 if (block->flags & BB_VISITED)
3553 /* If we've already encountered this block, either we must not
3554 be coloring, or it must have been colored the current color. */
3555 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3556 return;
3559 block->flags |= BB_VISITED;
3561 if (sese)
3563 if (coloring < 0)
3565 /* Start coloring a region. */
3566 regions[sese->color].first = block;
3567 coloring = sese->color;
3570 if (!--color_counts[sese->color] && sese->color == coloring)
3572 /* Found final block of SESE region. */
3573 regions[sese->color].second = block;
3574 coloring = -1;
3576 else
3577 /* Color the node, so we can assert on revisiting the node
3578 that the graph is indeed SESE. */
3579 sese->color = coloring;
3581 else
3582 /* Fallen off the subgraph, we cannot be coloring. */
3583 gcc_assert (coloring < 0);
3585 /* Walk each successor block. */
3586 if (block->succs && block->succs->length ())
3588 edge e;
3589 edge_iterator ei;
3591 FOR_EACH_EDGE (e, ei, block->succs)
3592 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3594 else
3595 gcc_assert (coloring < 0);
3598 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3599 end up with NULL entries in it. */
3601 static void
3602 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3604 basic_block block;
3605 int ix;
3607 /* First clear each BB of the whole function. */
3608 FOR_ALL_BB_FN (block, cfun)
3610 block->flags &= ~BB_VISITED;
3611 BB_SET_SESE (block, 0);
3614 /* Mark blocks in the function that are in this graph. */
3615 for (ix = 0; blocks.iterate (ix, &block); ix++)
3616 block->flags |= BB_VISITED;
3618 /* Counts of nodes assigned to each color. There cannot be more
3619 colors than blocks (and hopefully there will be fewer). */
3620 auto_vec<unsigned> color_counts;
3621 color_counts.reserve (blocks.length ());
3623 /* Worklist of nodes in the spanning tree. Again, there cannot be
3624 more nodes in the tree than blocks (there will be fewer if the
3625 CFG of blocks is disjoint). */
3626 auto_vec<basic_block> spanlist;
3627 spanlist.reserve (blocks.length ());
3629 /* Make sure every block has its cycle class determined. */
3630 for (ix = 0; blocks.iterate (ix, &block); ix++)
3632 if (BB_GET_SESE (block))
3633 /* We already met this block in an earlier graph solve. */
3634 continue;
3636 if (dump_file)
3637 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3639 /* Number the nodes reachable from block in initial DFS order. */
3640 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3642 /* Now walk in reverse DFS order to find cycle equivalents. */
3643 while (spanlist.length ())
3645 block = spanlist.pop ();
3646 bb_sese *sese = BB_GET_SESE (block);
3648 /* Do the pseudo node below. */
3649 nvptx_sese_pseudo (block, sese, depth, +1,
3650 sese->dir > 0 ? block->succs : block->preds,
3651 (sese->dir > 0 ? offsetof (edge_def, dest)
3652 : offsetof (edge_def, src)));
3653 sese->set_color (color_counts);
3654 /* Do the pseudo node above. */
3655 nvptx_sese_pseudo (block, sese, depth, -1,
3656 sese->dir < 0 ? block->succs : block->preds,
3657 (sese->dir < 0 ? offsetof (edge_def, dest)
3658 : offsetof (edge_def, src)));
3660 if (dump_file)
3661 fprintf (dump_file, "\n");
3664 if (dump_file)
3666 unsigned count;
3667 const char *comma = "";
3669 fprintf (dump_file, "Found %d cycle equivalents\n",
3670 color_counts.length ());
3671 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3673 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3675 comma = "";
3676 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3677 if (BB_GET_SESE (block)->color == ix)
3679 block->flags |= BB_VISITED;
3680 fprintf (dump_file, "%s%d", comma, block->index);
3681 comma = ",";
3683 fprintf (dump_file, "}");
3684 comma = ", ";
3686 fprintf (dump_file, "\n");
3689 /* Now we've colored every block in the subgraph. We now need to
3690 determine the minimal set of SESE regions that cover that
3691 subgraph. Do this with a DFS walk of the complete function.
3692 During the walk we're either 'looking' or 'coloring'. When we
3693 reach the last node of a particular color, we stop coloring and
3694 return to looking. */
3696 /* There cannot be more SESE regions than colors. */
3697 regions.reserve (color_counts.length ());
3698 for (ix = color_counts.length (); ix--;)
3699 regions.quick_push (bb_pair_t (0, 0));
3701 for (ix = 0; blocks.iterate (ix, &block); ix++)
3702 block->flags &= ~BB_VISITED;
3704 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3706 if (dump_file)
3708 const char *comma = "";
3709 int len = regions.length ();
3711 fprintf (dump_file, "SESE regions:");
3712 for (ix = 0; ix != len; ix++)
3714 basic_block from = regions[ix].first;
3715 basic_block to = regions[ix].second;
3717 if (from)
3719 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3720 if (to != from)
3721 fprintf (dump_file, "->%d", to->index);
3723 int color = BB_GET_SESE (from)->color;
3725 /* Print the blocks within the region (excluding ends). */
3726 FOR_EACH_BB_FN (block, cfun)
3728 bb_sese *sese = BB_GET_SESE (block);
3730 if (sese && sese->color == color
3731 && block != from && block != to)
3732 fprintf (dump_file, ".%d", block->index);
3734 fprintf (dump_file, "}");
3736 comma = ",";
3738 fprintf (dump_file, "\n\n");
3741 for (ix = 0; blocks.iterate (ix, &block); ix++)
3742 delete BB_GET_SESE (block);
3745 #undef BB_SET_SESE
3746 #undef BB_GET_SESE
3748 /* Propagate live state at the start of a partitioned region. BLOCK
3749 provides the live register information, and might not contain
3750 INSN. Propagation is inserted just after INSN. RW indicates whether
3751 we are reading and/or writing state. This
3752 separation is needed for worker-level propagation where we
3753 essentially do a spill & fill. FN is the underlying worker
3754 function to generate the propagation instructions for a single
3755 register. DATA is user data.
3757 We propagate the live register set and the entire frame. We could
3758 do better by (a) propagating just the live set that is used within
3759 the partitioned regions and (b) only propagating stack entries that
3760 are used. The latter might be quite hard to determine. */
3762 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3764 static void
3765 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3766 propagator_fn fn, void *data)
3768 bitmap live = DF_LIVE_IN (block);
3769 bitmap_iterator iterator;
3770 unsigned ix;
3772 /* Copy the frame array. */
3773 HOST_WIDE_INT fs = get_frame_size ();
3774 if (fs)
3776 rtx tmp = gen_reg_rtx (DImode);
3777 rtx idx = NULL_RTX;
3778 rtx ptr = gen_reg_rtx (Pmode);
3779 rtx pred = NULL_RTX;
3780 rtx_code_label *label = NULL;
3782 /* The frame size might not be DImode compatible, but the frame
3783 array's declaration will be. So it's ok to round up here. */
3784 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3785 /* Detect single iteration loop. */
3786 if (fs == 1)
3787 fs = 0;
3789 start_sequence ();
3790 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3791 if (fs)
3793 idx = gen_reg_rtx (SImode);
3794 pred = gen_reg_rtx (BImode);
3795 label = gen_label_rtx ();
3797 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3798 /* Allow worker function to initialize anything needed. */
3799 rtx init = fn (tmp, PM_loop_begin, fs, data);
3800 if (init)
3801 emit_insn (init);
3802 emit_label (label);
3803 LABEL_NUSES (label)++;
3804 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3806 if (rw & PM_read)
3807 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3808 emit_insn (fn (tmp, rw, fs, data));
3809 if (rw & PM_write)
3810 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3811 if (fs)
3813 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3814 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3815 emit_insn (gen_br_true_uni (pred, label));
3816 rtx fini = fn (tmp, PM_loop_end, fs, data);
3817 if (fini)
3818 emit_insn (fini);
3819 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3821 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3822 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3823 rtx cpy = get_insns ();
3824 end_sequence ();
3825 insn = emit_insn_after (cpy, insn);
3828 /* Copy live registers. */
3829 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3831 rtx reg = regno_reg_rtx[ix];
3833 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3835 rtx bcast = fn (reg, rw, 0, data);
3837 insn = emit_insn_after (bcast, insn);
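/* Sketch of the sequence built above for a 32-byte frame (fs == 4
   after rounding; pseudo-PTX, names invented):

     %ptr = %frame;  %idx = 4;
   loop:
     %idx -= 1;
     %tmp = [%ptr];		// if RW includes PM_read
     ... fn-generated transfer of %tmp ...
     [%ptr] = %tmp;		// if RW includes PM_write
     %ptr += 8;
     @ %idx != 0 bra loop;  */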
3842 /* Worker for nvptx_vpropagate. */
3844 static rtx
3845 vprop_gen (rtx reg, propagate_mask pm,
3846 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3848 if (!(pm & PM_read_write))
3849 return 0;
3851 return nvptx_gen_vcast (reg);
3854 /* Propagate state that is live at start of BLOCK across the vectors
3855 of a single warp. Propagation is inserted just after INSN. */
3857 static void
3858 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3860 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3863 /* Worker for nvptx_wpropagate. */
3865 static rtx
3866 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3868 wcast_data_t *data = (wcast_data_t *)data_;
3870 if (pm & PM_loop_begin)
3872 /* Starting a loop, initialize pointer. */
3873 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3875 if (align > worker_bcast_align)
3876 worker_bcast_align = align;
3877 data->offset = (data->offset + align - 1) & ~(align - 1);
3879 data->ptr = gen_reg_rtx (Pmode);
3881 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3883 else if (pm & PM_loop_end)
3885 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3886 data->ptr = NULL_RTX;
3887 return clobber;
3889 else
3890 return nvptx_gen_wcast (reg, pm, rep, data);
3893 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3894 indicates if this is just before partitioned mode (do spill), or
3895 just after it starts (do fill). Sequence is inserted just after
3896 INSN. */
3898 static void
3899 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3901 wcast_data_t data;
3903 data.base = gen_reg_rtx (Pmode);
3904 data.offset = 0;
3905 data.ptr = NULL_RTX;
3907 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3908 if (data.offset)
3910 /* Stuff was emitted, initialize the base pointer now. */
3911 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3912 emit_insn_after (init, insn);
3914 if (worker_bcast_size < data.offset)
3915 worker_bcast_size = data.offset;
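/* Usage note (sketch): nvptx_wpropagate (true, ...) is emitted just
   before partitioned mode and reads live registers into the shared
   broadcast buffer (a spill); nvptx_wpropagate (false, ...) just
   after the mode switch fills them back.  worker_bcast_sym and
   worker_bcast_size describe the buffer that DATA.base points at.  */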
3919 /* Emit a worker-level synchronization barrier. We use different
3920 markers for before and after synchronizations. */
3922 static rtx
3923 nvptx_wsync (bool after)
3925 return gen_nvptx_barsync (GEN_INT (after));
3928 #if WORKAROUND_PTXJIT_BUG
3929 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3930 real insns. */
3932 static rtx_insn *
3933 bb_first_real_insn (basic_block bb)
3935 rtx_insn *insn;
3937 /* Find first insn of from block. */
3938 FOR_BB_INSNS (bb, insn)
3939 if (INSN_P (insn))
3940 return insn;
3942 return 0;
3944 #endif
3946 /* Single neutering according to MASK. FROM is the incoming block and
3947 TO is the outgoing block. These may be the same block. Insert at
3948 start of FROM:
3950 if (tid.<axis>) goto end.
3952 and insert before ending branch of TO (if there is such an insn):
3954 end:
3955 <possibly-broadcast-cond>
3956 <branch>
3958 We currently only use different FROM and TO when skipping an entire
3959 loop. We could do more if we detected superblocks. */
3961 static void
3962 nvptx_single (unsigned mask, basic_block from, basic_block to)
3964 rtx_insn *head = BB_HEAD (from);
3965 rtx_insn *tail = BB_END (to);
3966 unsigned skip_mask = mask;
3968 while (true)
3970 /* Find first insn of from block. */
3971 while (head != BB_END (from) && !INSN_P (head))
3972 head = NEXT_INSN (head);
3974 if (from == to)
3975 break;
3977 if (!(JUMP_P (head) && single_succ_p (from)))
3978 break;
3980 basic_block jump_target = single_succ (from);
3981 if (!single_pred_p (jump_target))
3982 break;
3984 from = jump_target;
3985 head = BB_HEAD (from);
3988   /* Find last insn of to block.  */
3989 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3990 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3991 tail = PREV_INSN (tail);
3993 /* Detect if tail is a branch. */
3994 rtx tail_branch = NULL_RTX;
3995 rtx cond_branch = NULL_RTX;
3996 if (tail && INSN_P (tail))
3998 tail_branch = PATTERN (tail);
3999 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4000 tail_branch = NULL_RTX;
4001 else
4003 cond_branch = SET_SRC (tail_branch);
4004 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4005 cond_branch = NULL_RTX;
4009 if (tail == head)
4011 /* If this is empty, do nothing. */
4012 if (!head || !INSN_P (head))
4013 return;
4015 /* If this is a dummy insn, do nothing. */
4016 switch (recog_memoized (head))
4018 default:
4019 break;
4020 case CODE_FOR_nvptx_fork:
4021 case CODE_FOR_nvptx_forked:
4022 case CODE_FOR_nvptx_joining:
4023 case CODE_FOR_nvptx_join:
4024 return;
4027 if (cond_branch)
4029 /* If we're only doing vector single, there's no need to
4030 emit skip code because we'll not insert anything. */
4031 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4032 skip_mask = 0;
4034 else if (tail_branch)
4035 /* Block with only unconditional branch. Nothing to do. */
4036 return;
4039 /* Insert the vector test inside the worker test. */
4040 unsigned mode;
4041 rtx_insn *before = tail;
4042 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4043 if (GOMP_DIM_MASK (mode) & skip_mask)
4045 rtx_code_label *label = gen_label_rtx ();
4046 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4048 if (!pred)
4050 pred = gen_reg_rtx (BImode);
4051 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4054 rtx br;
4055 if (mode == GOMP_DIM_VECTOR)
4056 br = gen_br_true (pred, label);
4057 else
4058 br = gen_br_true_uni (pred, label);
4059 emit_insn_before (br, head);
4061 LABEL_NUSES (label)++;
4062 if (tail_branch)
4063 before = emit_label_before (label, before);
4064 else
4065 emit_label_after (label, tail);
4068 /* Now deal with propagating the branch condition. */
4069 if (cond_branch)
4071 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4073 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4075 /* Vector mode only, do a shuffle. */
4076 #if WORKAROUND_PTXJIT_BUG
4077 /* The branch condition %rcond is propagated like this:
4080 .reg .u32 %x;
4081 mov.u32 %x,%tid.x;
4082 setp.ne.u32 %rnotvzero,%x,0;
4085 @%rnotvzero bra Lskip;
4086 setp.<op>.<type> %rcond,op1,op2;
4087 Lskip:
4088 selp.u32 %rcondu32,1,0,%rcond;
4089 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4090 setp.ne.u32 %rcond,%rcondu32,0;
4092 There seems to be a bug in the ptx JIT compiler (observed at driver
4093 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4094 unless %rcond is initialized to something before 'bra Lskip'. The
4095 bug is not observed with ptxas from cuda 8.0.61.
4097 It is true that the code is non-trivial: at Lskip, %rcond is
4098 uninitialized in threads 1-31, and after the selp the same holds
4099 for %rcondu32. But shfl propagates the defined value in thread 0
4100 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4101 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4103 There is nothing in the PTX spec to suggest that this is wrong, or
4104 to explain why the extra initialization is needed. So, we classify
4105 it as a JIT bug, and the extra initialization as workaround:
4108 .reg .u32 %x;
4109 mov.u32 %x,%tid.x;
4110 setp.ne.u32 %rnotvzero,%x,0;
4113 +.reg .pred %rcond2;
4114 +setp.eq.u32 %rcond2, 1, 0;
4116 @%rnotvzero bra Lskip;
4117 setp.<op>.<type> %rcond,op1,op2;
4118 +mov.pred %rcond2, %rcond;
4119 Lskip:
4120 +mov.pred %rcond, %rcond2;
4121 selp.u32 %rcondu32,1,0,%rcond;
4122 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4123 setp.ne.u32 %rcond,%rcondu32,0;
4125 rtx_insn *label = PREV_INSN (tail);
4126 gcc_assert (label && LABEL_P (label));
4127 rtx tmp = gen_reg_rtx (BImode);
4128 emit_insn_before (gen_movbi (tmp, const0_rtx),
4129 bb_first_real_insn (from));
4130 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4131 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4132 #endif
4133 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4135 else
4137 /* Includes worker mode, do spill & fill. By construction
4138 we should never have worker mode only. */
4139 wcast_data_t data;
4141 data.base = worker_bcast_sym;
4142 data.ptr = 0;
4144 if (worker_bcast_size < GET_MODE_SIZE (SImode))
4145 worker_bcast_size = GET_MODE_SIZE (SImode);
4147 data.offset = 0;
4148 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4149 before);
4150 /* Barrier so other workers can see the write. */
4151 emit_insn_before (nvptx_wsync (false), tail);
4152 data.offset = 0;
4153 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4154 /* This barrier is needed to avoid worker zero clobbering
4155 the broadcast buffer before all the other workers have
4156 had a chance to read this instance of it. */
4157 emit_insn_before (nvptx_wsync (true), tail);
4160 extract_insn (tail);
4161 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4162 UNSPEC_BR_UNIFIED);
4163 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4167 /* PAR is a parallel that is being skipped in its entirety according to
4168 MASK. Treat this as skipping a superblock starting at forked
4169 and ending at joining. */
4171 static void
4172 nvptx_skip_par (unsigned mask, parallel *par)
4174 basic_block tail = par->join_block;
4175 gcc_assert (tail->preds->length () == 1);
4177 basic_block pre_tail = (*tail->preds)[0]->src;
4178 gcc_assert (pre_tail->succs->length () == 1);
4180 nvptx_single (mask, par->forked_block, pre_tail);
4183 /* If PAR has a single inner parallel and PAR itself only contains
4184 empty entry and exit blocks, swallow the inner PAR. */
4186 static void
4187 nvptx_optimize_inner (parallel *par)
4189 parallel *inner = par->inner;
4191 /* We mustn't be the outer dummy par. */
4192 if (!par->mask)
4193 return;
4195 /* We must have a single inner par. */
4196 if (!inner || inner->next)
4197 return;
4199 /* We must only contain 2 blocks ourselves -- the head and tail of
4200 the inner par. */
4201 if (par->blocks.length () != 2)
4202 return;
4204   /* The two partitionings must be disjoint.  As we only have vector and
4205 worker partitioning, this is sufficient to guarantee the pars
4206 have adjacent partitioning. */
4207 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4208 /* This indicates malformed code generation. */
4209 return;
4211 /* The outer forked insn should be immediately followed by the inner
4212 fork insn. */
4213 rtx_insn *forked = par->forked_insn;
4214 rtx_insn *fork = BB_END (par->forked_block);
4216 if (NEXT_INSN (forked) != fork)
4217 return;
4218 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4220 /* The outer joining insn must immediately follow the inner join
4221 insn. */
4222 rtx_insn *joining = par->joining_insn;
4223 rtx_insn *join = inner->join_insn;
4224 if (NEXT_INSN (join) != joining)
4225 return;
4227 /* Preconditions met. Swallow the inner par. */
4228 if (dump_file)
4229 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4230 inner->mask, inner->forked_block->index,
4231 inner->join_block->index,
4232 par->mask, par->forked_block->index, par->join_block->index);
4234 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4236 par->blocks.reserve (inner->blocks.length ());
4237 while (inner->blocks.length ())
4238 par->blocks.quick_push (inner->blocks.pop ());
4240 par->inner = inner->inner;
4241 inner->inner = NULL;
4243 delete inner;
4246 /* Process the parallel PAR and all its contained
4247 parallels. We do everything but the neutering. Return mask of
4248 partitioned modes used within this parallel. */
4250 static unsigned
4251 nvptx_process_pars (parallel *par)
4253 if (nvptx_optimize)
4254 nvptx_optimize_inner (par);
4256 unsigned inner_mask = par->mask;
4258 /* Do the inner parallels first. */
4259 if (par->inner)
4261 par->inner_mask = nvptx_process_pars (par->inner);
4262 inner_mask |= par->inner_mask;
4265 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
4266 /* No propagation needed for a call. */;
4267 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4269 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
4270 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
4271 /* Insert begin and end synchronizations. */
4272 emit_insn_after (nvptx_wsync (false), par->forked_insn);
4273 emit_insn_before (nvptx_wsync (true), par->joining_insn);
4275 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4276 nvptx_vpropagate (par->forked_block, par->forked_insn);
4278 /* Now do siblings. */
4279 if (par->next)
4280 inner_mask |= nvptx_process_pars (par->next);
4281 return inner_mask;
4284 /* Neuter the parallel described by PAR. We recurse in depth-first
4285 order. MODES are the partitioning of the execution and OUTER is
4286 the partitioning of the parallels we are contained in. */
4288 static void
4289 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4291 unsigned me = (par->mask
4292 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4293 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4294 unsigned skip_mask = 0, neuter_mask = 0;
4296 if (par->inner)
4297 nvptx_neuter_pars (par->inner, modes, outer | me);
4299 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4301 if ((outer | me) & GOMP_DIM_MASK (mode))
4302 {} /* Mode is partitioned: no neutering. */
4303 else if (!(modes & GOMP_DIM_MASK (mode)))
4304 {} /* Mode is not used: nothing to do. */
4305 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4306 || !par->forked_insn)
4307  /* Partitioned in inner parallels, or we're not partitioned
4308 at all: neuter individual blocks. */
4309 neuter_mask |= GOMP_DIM_MASK (mode);
4310 else if (!par->parent || !par->parent->forked_insn
4311 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4312  /* Parent isn't a parallel, or it contains this partitioning: skip
4313 parallel at this level. */
4314 skip_mask |= GOMP_DIM_MASK (mode);
4315 else
4316 {} /* Parent will skip this parallel itself. */
4319 if (neuter_mask)
4321 int ix, len;
4323 if (nvptx_optimize)
4325 /* Neuter whole SESE regions. */
4326 bb_pair_vec_t regions;
4328 nvptx_find_sese (par->blocks, regions);
4329 len = regions.length ();
4330 for (ix = 0; ix != len; ix++)
4332 basic_block from = regions[ix].first;
4333 basic_block to = regions[ix].second;
4335 if (from)
4336 nvptx_single (neuter_mask, from, to);
4337 else
4338 gcc_assert (!to);
4341 else
4343 /* Neuter each BB individually. */
4344 len = par->blocks.length ();
4345 for (ix = 0; ix != len; ix++)
4347 basic_block block = par->blocks[ix];
4349 nvptx_single (neuter_mask, block, block);
4354 if (skip_mask)
4355 nvptx_skip_par (skip_mask, par);
4357 if (par->next)
4358 nvptx_neuter_pars (par->next, modes, outer);
4361 /* PTX-specific reorganization
4362 - Split blocks at fork and join instructions
4363 - Compute live registers
4364 - Mark now-unused registers, so function begin doesn't declare
4365 unused registers.
4366 - Insert state propagation when entering partitioned mode
4367 - Insert neutering instructions when in single mode
4368 - Replace subregs with suitable sequences.
4371 static void
4372 nvptx_reorg (void)
4374 /* We are freeing block_for_insn in the toplev to keep compatibility
4375 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4376 compute_bb_for_insn ();
4378 thread_prologue_and_epilogue_insns ();
4380 /* Split blocks and record interesting unspecs. */
4381 bb_insn_map_t bb_insn_map;
4383 nvptx_split_blocks (&bb_insn_map);
4385 /* Compute live regs */
4386 df_clear_flags (DF_LR_RUN_DCE);
4387 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4388 df_live_add_problem ();
4389 df_live_set_all_dirty ();
4390 df_analyze ();
4391 regstat_init_n_sets_and_refs ();
4393 if (dump_file)
4394 df_dump (dump_file);
4396 /* Mark unused regs as unused. */
4397 int max_regs = max_reg_num ();
4398 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4399 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4400 regno_reg_rtx[i] = const0_rtx;
4402 /* Determine launch dimensions of the function. If it is not an
4403 offloaded function (i.e. this is a regular compiler), the
4404 function has no neutering. */
4405 tree attr = oacc_get_fn_attrib (current_function_decl);
4406 if (attr)
4408 /* If we determined this mask before RTL expansion, we could
4409 elide emission of some levels of forks and joins. */
4410 unsigned mask = 0;
4411 tree dims = TREE_VALUE (attr);
4412 unsigned ix;
4414 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4416 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4417 tree allowed = TREE_PURPOSE (dims);
4419 if (size != 1 && !(allowed && integer_zerop (allowed)))
4420 mask |= GOMP_DIM_MASK (ix);
4422 /* If there is worker neutering, there must be vector
4423 neutering. Otherwise the hardware will fail. */
4424 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4425 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4427 /* Discover & process partitioned regions. */
4428 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4429 nvptx_process_pars (pars);
4430 nvptx_neuter_pars (pars, mask, 0);
4431 delete pars;
4434 /* Replace subregs. */
4435 nvptx_reorg_subreg ();
4437 if (TARGET_UNIFORM_SIMT)
4438 nvptx_reorg_uniform_simt ();
4440 regstat_free_n_sets_and_refs ();
4442 df_finish_pass (true);
4445 /* Handle a "kernel" attribute; arguments as in
4446 struct attribute_spec.handler. */
4448 static tree
4449 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4450 int ARG_UNUSED (flags), bool *no_add_attrs)
4452 tree decl = *node;
4454 if (TREE_CODE (decl) != FUNCTION_DECL)
4456 error ("%qE attribute only applies to functions", name);
4457 *no_add_attrs = true;
4459 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4461 error ("%qE attribute requires a void return type", name);
4462 *no_add_attrs = true;
4465 return NULL_TREE;
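/* Illustrative uses of the attribute checked above (hypothetical
   declarations, not compiler code):

     __attribute__((kernel)) void entry (int *);  // accepted
     __attribute__((kernel)) int bad (void);      // error: needs a
                                                  // void return type
     int v __attribute__((kernel));               // error: functions
                                                  // only
*/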
4468 /* Handle a "shared" attribute; arguments as in
4469 struct attribute_spec.handler. */
4471 static tree
4472 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4473 int ARG_UNUSED (flags), bool *no_add_attrs)
4475 tree decl = *node;
4477 if (TREE_CODE (decl) != VAR_DECL)
4479 error ("%qE attribute only applies to variables", name);
4480 *no_add_attrs = true;
4482 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4484 error ("%qE attribute not allowed with auto storage class", name);
4485 *no_add_attrs = true;
4488 return NULL_TREE;
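/* Illustrative uses (hypothetical declarations, not compiler code):

     static int buf[32] __attribute__((shared));  // accepted: static
     void f (void)
     {
       int x __attribute__((shared));             // error: auto
                                                  // storage class
     }
*/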
4491 /* Table of valid machine attributes. */
4492 static const struct attribute_spec nvptx_attribute_table[] =
4494 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4495 affects_type_identity, handler, exclude } */
4496 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
4497 NULL },
4498 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
4499 NULL },
4500 { NULL, 0, 0, false, false, false, false, NULL, NULL }
4503 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4505 static HOST_WIDE_INT
4506 nvptx_vector_alignment (const_tree type)
4508 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4510 return MIN (align, BIGGEST_ALIGNMENT);
4513 /* Indicate that INSN cannot be duplicated. */
4515 static bool
4516 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4518 switch (recog_memoized (insn))
4520 case CODE_FOR_nvptx_shufflesi:
4521 case CODE_FOR_nvptx_shufflesf:
4522 case CODE_FOR_nvptx_barsync:
4523 case CODE_FOR_nvptx_fork:
4524 case CODE_FOR_nvptx_forked:
4525 case CODE_FOR_nvptx_joining:
4526 case CODE_FOR_nvptx_join:
4527 return true;
4528 default:
4529 return false;
4533 /* Section anchors do not work. Initialization for flag_section_anchor
4534 probes the existence of the anchoring target hooks and prevents
4535    anchoring if they don't exist.  However, we may be used together
4536    with a host-side compiler that does support anchoring, and hence see
4537 the anchor flag set (as it's not recalculated). So provide an
4538 implementation denying anchoring. */
4540 static bool
4541 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4543 return false;
4546 /* Record a symbol for mkoffload to enter into the mapping table. */
4548 static void
4549 nvptx_record_offload_symbol (tree decl)
4551 switch (TREE_CODE (decl))
4553 case VAR_DECL:
4554 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4555 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4556 break;
4558 case FUNCTION_DECL:
4560 tree attr = oacc_get_fn_attrib (decl);
4561 /* OpenMP offloading does not set this attribute. */
4562 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4564 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4565 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4567 for (; dims; dims = TREE_CHAIN (dims))
4569 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4571 gcc_assert (!TREE_PURPOSE (dims));
4572 fprintf (asm_out_file, ", %#x", size);
4575 fprintf (asm_out_file, "\n");
4577 break;
4579 default:
4580 gcc_unreachable ();
4584 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4585 at the start of a file. */
4587 static void
4588 nvptx_file_start (void)
4590 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4591 fputs ("\t.version\t3.1\n", asm_out_file);
4592 fputs ("\t.target\tsm_30\n", asm_out_file);
4593 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4594 fputs ("// END PREAMBLE\n", asm_out_file);
4597 /* Emit a declaration for a worker-level buffer in .shared memory. */
4599 static void
4600 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4602 const char *name = XSTR (sym, 0);
4604 write_var_marker (file, true, false, name);
4605 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4606 align, name, size);
4609 /* Write out the function declarations we've collected and declare storage
4610 for the broadcast buffer. */
4612 static void
4613 nvptx_file_end (void)
4615 hash_table<tree_hasher>::iterator iter;
4616 tree decl;
4617 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4618 nvptx_record_fndecl (decl);
4619   fputs (func_decls.str ().c_str (), asm_out_file);
4621 if (worker_bcast_size)
4622 write_worker_buffer (asm_out_file, worker_bcast_sym,
4623 worker_bcast_align, worker_bcast_size);
4625 if (worker_red_size)
4626 write_worker_buffer (asm_out_file, worker_red_sym,
4627 worker_red_align, worker_red_size);
4629 if (need_softstack_decl)
4631 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4632 /* 32 is the maximum number of warps in a block. Even though it's an
4633 external declaration, emit the array size explicitly; otherwise, it
4634 may fail at PTX JIT time if the definition is later in link order. */
4635 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4636 POINTER_SIZE);
4638 if (need_unisimt_decl)
4640 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4641 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4645 /* Expander for the shuffle builtins. */
4647 static rtx
4648 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4650 if (ignore)
4651 return target;
4653 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4654 NULL_RTX, mode, EXPAND_NORMAL);
4655 if (!REG_P (src))
4656 src = copy_to_mode_reg (mode, src);
4658 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4659 NULL_RTX, SImode, EXPAND_NORMAL);
4660 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4661 NULL_RTX, SImode, EXPAND_NORMAL);
4663 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4664 idx = copy_to_mode_reg (SImode, idx);
4666 rtx pat = nvptx_gen_shuffle (target, src, idx,
4667 (nvptx_shuffle_kind) INTVAL (op));
4668 if (pat)
4669 emit_insn (pat);
4671 return target;
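/* Illustrative use of the builtin expanded above (hypothetical
   values; SHUFFLE_DOWN names the nvptx_shuffle_kind value):

     // Each vector lane receives V from the lane 16 above it.
     unsigned w = __builtin_nvptx_shuffle (v, 16, SHUFFLE_DOWN);
*/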
4674 /* Worker reduction address expander. */
4676 static rtx
4677 nvptx_expand_worker_addr (tree exp, rtx target,
4678 machine_mode ARG_UNUSED (mode), int ignore)
4680 if (ignore)
4681 return target;
4683 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4684 if (align > worker_red_align)
4685 worker_red_align = align;
4687 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4688 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4689 if (size + offset > worker_red_size)
4690 worker_red_size = size + offset;
4692 rtx addr = worker_red_sym;
4693 if (offset)
4695 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4696 addr = gen_rtx_CONST (Pmode, addr);
4699 emit_move_insn (target, addr);
4701 return target;
4704 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4705 not require taking the address of any object, other than the memory
4706 cell being operated on. */
4708 static rtx
4709 nvptx_expand_cmp_swap (tree exp, rtx target,
4710 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4712 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4714 if (!target)
4715 target = gen_reg_rtx (mode);
4717 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4718 NULL_RTX, Pmode, EXPAND_NORMAL);
4719 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4720 NULL_RTX, mode, EXPAND_NORMAL);
4721 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4722 NULL_RTX, mode, EXPAND_NORMAL);
4723 rtx pat;
4725 mem = gen_rtx_MEM (mode, mem);
4726 if (!REG_P (cmp))
4727 cmp = copy_to_mode_reg (mode, cmp);
4728 if (!REG_P (src))
4729 src = copy_to_mode_reg (mode, src);
4731 if (mode == SImode)
4732 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4733 else
4734 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4736 emit_insn (pat);
4738 return target;
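/* Illustrative use (hypothetical values): atomically set X to 1 if
   it currently equals 0, returning the prior contents:

     unsigned old = __builtin_nvptx_cmp_swap (&x, 0, 1);
     // old == 0 means the swap happened and x is now 1.
*/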
4742 /* Codes for all the NVPTX builtins. */
4743 enum nvptx_builtins
4745 NVPTX_BUILTIN_SHUFFLE,
4746 NVPTX_BUILTIN_SHUFFLELL,
4747 NVPTX_BUILTIN_WORKER_ADDR,
4748 NVPTX_BUILTIN_CMP_SWAP,
4749 NVPTX_BUILTIN_CMP_SWAPLL,
4750 NVPTX_BUILTIN_MAX
4753 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4755 /* Return the NVPTX builtin for CODE. */
4757 static tree
4758 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4760 if (code >= NVPTX_BUILTIN_MAX)
4761 return error_mark_node;
4763 return nvptx_builtin_decls[code];
4766 /* Set up all builtin functions for this target. */
4768 static void
4769 nvptx_init_builtins (void)
4771 #define DEF(ID, NAME, T) \
4772 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4773 = add_builtin_function ("__builtin_nvptx_" NAME, \
4774 build_function_type_list T, \
4775 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4776 #define ST sizetype
4777 #define UINT unsigned_type_node
4778 #define LLUINT long_long_unsigned_type_node
4779 #define PTRVOID ptr_type_node
4781 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4782 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4783 DEF (WORKER_ADDR, "worker_addr",
4784 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4785 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4786 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4788 #undef DEF
4789 #undef ST
4790 #undef UINT
4791 #undef LLUINT
4792 #undef PTRVOID
4795 /* Expand an expression EXP that calls a built-in function,
4796 with result going to TARGET if that's convenient
4797 (and in mode MODE if that's convenient).
4798 SUBTARGET may be used as the target for computing one of EXP's operands.
4799 IGNORE is nonzero if the value is to be ignored. */
4801 static rtx
4802 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4803 machine_mode mode, int ignore)
4805 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4806 switch (DECL_FUNCTION_CODE (fndecl))
4808 case NVPTX_BUILTIN_SHUFFLE:
4809 case NVPTX_BUILTIN_SHUFFLELL:
4810 return nvptx_expand_shuffle (exp, target, mode, ignore);
4812 case NVPTX_BUILTIN_WORKER_ADDR:
4813 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4815 case NVPTX_BUILTIN_CMP_SWAP:
4816 case NVPTX_BUILTIN_CMP_SWAPLL:
4817 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4819 default: gcc_unreachable ();
4823 /* Define dimension sizes for known hardware. */
4824 #define PTX_VECTOR_LENGTH 32
4825 #define PTX_WORKER_LENGTH 32
4826 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
4828 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4830 static int
4831 nvptx_simt_vf ()
4833 return PTX_VECTOR_LENGTH;
4836 /* Validate compute dimensions of an OpenACC offload or routine, fill
4837 in non-unity defaults. FN_LEVEL indicates the level at which a
4838 routine might spawn a loop. It is negative for non-routines. If
4839 DECL is null, we are validating the default dimensions. */
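/* For example (illustrative): vector_length (64) on an offloaded
   region is forced back to 32 with a warning, and when DECL is null
   unset defaults {-1, -1, -1} for {gang, worker, vector} become
   {PTX_GANG_DEFAULT, PTX_WORKER_LENGTH, PTX_VECTOR_LENGTH}
   = {0, 32, 32}.  */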
4841 static bool
4842 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4844 bool changed = false;
4846 /* The vector size must be 32, unless this is a SEQ routine. */
4847 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4848 && dims[GOMP_DIM_VECTOR] >= 0
4849 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4851 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4852 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4853 dims[GOMP_DIM_VECTOR]
4854 ? G_("using vector_length (%d), ignoring %d")
4855 : G_("using vector_length (%d), ignoring runtime setting"),
4856 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4857 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4858 changed = true;
4861 /* Check the num workers is not too large. */
4862 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4864 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4865 "using num_workers (%d), ignoring %d",
4866 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4867 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4868 changed = true;
4871 if (!decl)
4873 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4874 if (dims[GOMP_DIM_WORKER] < 0)
4875 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4876 if (dims[GOMP_DIM_GANG] < 0)
4877 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4878 changed = true;
4881 return changed;
4884 /* Return maximum dimension size, or zero for unbounded. */
4886 static int
4887 nvptx_dim_limit (int axis)
4889 switch (axis)
4891 case GOMP_DIM_WORKER:
4892 return PTX_WORKER_LENGTH;
4894 case GOMP_DIM_VECTOR:
4895 return PTX_VECTOR_LENGTH;
4897 default:
4898 break;
4900 return 0;
4903 /* Determine whether fork & join markers are needed.  */
4905 static bool
4906 nvptx_goacc_fork_join (gcall *call, const int dims[],
4907 bool ARG_UNUSED (is_fork))
4909 tree arg = gimple_call_arg (call, 2);
4910 unsigned axis = TREE_INT_CST_LOW (arg);
4912 /* We only care about worker and vector partitioning. */
4913 if (axis < GOMP_DIM_WORKER)
4914 return false;
4916 /* If the size is 1, there's no partitioning. */
4917 if (dims[axis] == 1)
4918 return false;
4920 return true;
4923 /* Generate a PTX builtin function call that returns the address in
4924 the worker reduction buffer at OFFSET. TYPE is the type of the
4925 data at that location. */
4927 static tree
4928 nvptx_get_worker_red_addr (tree type, tree offset)
4930 machine_mode mode = TYPE_MODE (type);
4931 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4932 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4933 tree align = build_int_cst (unsigned_type_node,
4934 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4935 tree call = build_call_expr (fndecl, 3, offset, size, align);
4937 return fold_convert (build_pointer_type (type), call);
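/* E.g. (illustrative): for a 4-byte float at OFFSET this builds the
   equivalent of
     (float *) __builtin_nvptx_worker_addr (offset, 4, 4);
   with size and alignment taken from the type's mode.  */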
4940 /* Emit a SHFL.DOWN by SHIFT lanes of VAR into DEST_VAR.  This function
4941 will cast the variable if necessary. */
4943 static void
4944 nvptx_generate_vector_shuffle (location_t loc,
4945 tree dest_var, tree var, unsigned shift,
4946 gimple_seq *seq)
4948 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4949 tree_code code = NOP_EXPR;
4950 tree arg_type = unsigned_type_node;
4951 tree var_type = TREE_TYPE (var);
4952 tree dest_type = var_type;
4954 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4955 var_type = TREE_TYPE (var_type);
4957 if (TREE_CODE (var_type) == REAL_TYPE)
4958 code = VIEW_CONVERT_EXPR;
4960 if (TYPE_SIZE (var_type)
4961 == TYPE_SIZE (long_long_unsigned_type_node))
4963 fn = NVPTX_BUILTIN_SHUFFLELL;
4964 arg_type = long_long_unsigned_type_node;
4967 tree call = nvptx_builtin_decl (fn, true);
4968 tree bits = build_int_cst (unsigned_type_node, shift);
4969 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4970 tree expr;
4972 if (var_type != dest_type)
4974 /* Do real and imaginary parts separately. */
4975 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4976 real = fold_build1 (code, arg_type, real);
4977 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4978 real = fold_build1 (code, var_type, real);
4980 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4981 imag = fold_build1 (code, arg_type, imag);
4982 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4983 imag = fold_build1 (code, var_type, imag);
4985 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4987 else
4989 expr = fold_build1 (code, arg_type, var);
4990 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4991 expr = fold_build1 (code, dest_type, expr);
4994 gimplify_assign (dest_var, expr, seq);
4997 /* Lazily generate the global lock var decl and return its address. */
4999 static tree
5000 nvptx_global_lock_addr ()
5002 tree v = global_lock_var;
5004 if (!v)
5006 tree name = get_identifier ("__reduction_lock");
5007 tree type = build_qualified_type (unsigned_type_node,
5008 TYPE_QUAL_VOLATILE);
5009 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5010 global_lock_var = v;
5011 DECL_ARTIFICIAL (v) = 1;
5012 DECL_EXTERNAL (v) = 1;
5013 TREE_STATIC (v) = 1;
5014 TREE_PUBLIC (v) = 1;
5015 TREE_USED (v) = 1;
5016 mark_addressable (v);
5017 mark_decl_referenced (v);
5020 return build_fold_addr_expr (v);
5023 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5024    GSI.  We use a lockless scheme for nearly all cases, which looks
5025 like:
5026 actual = initval(OP);
5027 do {
5028 guess = actual;
5029 write = guess OP myval;
5030 actual = cmp&swap (ptr, guess, write)
5031 } while (actual bit-different-to guess);
5032 return write;
5034 This relies on a cmp&swap instruction, which is available for 32-
5035 and 64-bit types. Larger types must use a locking scheme. */
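/* A C sketch of that loop for a 32-bit float "+" reduction, assuming
   hypothetical helpers float_as_bits/bits_as_float in place of the
   VIEW_CONVERT_EXPRs used below (illustrative, not compiler code):

     unsigned guess, write;
     unsigned actual = float_as_bits (0.0f);    // initval (+)
     do
       {
         guess = actual;
         write = float_as_bits (bits_as_float (guess) + myval);
         actual = __builtin_nvptx_cmp_swap (ptr, guess, write);
       }
     while (actual != guess);                   // bit-different-to
     return bits_as_float (write);
*/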
5037 static tree
5038 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5039 tree ptr, tree var, tree_code op)
5041 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5042 tree_code code = NOP_EXPR;
5043 tree arg_type = unsigned_type_node;
5044 tree var_type = TREE_TYPE (var);
5046 if (TREE_CODE (var_type) == COMPLEX_TYPE
5047 || TREE_CODE (var_type) == REAL_TYPE)
5048 code = VIEW_CONVERT_EXPR;
5050 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5052 arg_type = long_long_unsigned_type_node;
5053 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5056 tree swap_fn = nvptx_builtin_decl (fn, true);
5058 gimple_seq init_seq = NULL;
5059 tree init_var = make_ssa_name (arg_type);
5060 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5061 init_expr = fold_build1 (code, arg_type, init_expr);
5062 gimplify_assign (init_var, init_expr, &init_seq);
5063 gimple *init_end = gimple_seq_last (init_seq);
5065 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5067 /* Split the block just after the init stmts. */
5068 basic_block pre_bb = gsi_bb (*gsi);
5069 edge pre_edge = split_block (pre_bb, init_end);
5070 basic_block loop_bb = pre_edge->dest;
5071 pre_bb = pre_edge->src;
5072 /* Reset the iterator. */
5073 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5075 tree expect_var = make_ssa_name (arg_type);
5076 tree actual_var = make_ssa_name (arg_type);
5077 tree write_var = make_ssa_name (arg_type);
5079 /* Build and insert the reduction calculation. */
5080 gimple_seq red_seq = NULL;
5081 tree write_expr = fold_build1 (code, var_type, expect_var);
5082 write_expr = fold_build2 (op, var_type, write_expr, var);
5083 write_expr = fold_build1 (code, arg_type, write_expr);
5084 gimplify_assign (write_var, write_expr, &red_seq);
5086 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5088 /* Build & insert the cmp&swap sequence. */
5089 gimple_seq latch_seq = NULL;
5090 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5091 ptr, expect_var, write_var);
5092 gimplify_assign (actual_var, swap_expr, &latch_seq);
5094 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5095 NULL_TREE, NULL_TREE);
5096 gimple_seq_add_stmt (&latch_seq, cond);
5098 gimple *latch_end = gimple_seq_last (latch_seq);
5099 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5101 /* Split the block just after the latch stmts. */
5102 edge post_edge = split_block (loop_bb, latch_end);
5103 basic_block post_bb = post_edge->dest;
5104 loop_bb = post_edge->src;
5105 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5107 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5108 post_edge->probability = profile_probability::even ();
5109 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5110 loop_edge->probability = profile_probability::even ();
5111 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5112 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5114 gphi *phi = create_phi_node (expect_var, loop_bb);
5115 add_phi_arg (phi, init_var, pre_edge, loc);
5116 add_phi_arg (phi, actual_var, loop_edge, loc);
5118 loop *loop = alloc_loop ();
5119 loop->header = loop_bb;
5120 loop->latch = loop_bb;
5121 add_loop (loop, loop_bb->loop_father);
5123 return fold_build1 (code, var_type, write_var);
5126 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5127 GSI. This is necessary for types larger than 64 bits, where there
5128 is no cmp&swap instruction to implement a lockless scheme. We use
5129 a lock variable in global memory.
5131 while (cmp&swap (&lock_var, 0, 1))
5132 continue;
5133 T accum = *ptr;
5134 accum = accum OP var;
5135 *ptr = accum;
5136 cmp&swap (&lock_var, 1, 0);
5137 return accum;
5139 A lock in global memory is necessary to force execution engine
5140 descheduling and avoid resource starvation that can occur if the
5141 lock is in .shared memory. */
5143 static tree
5144 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5145 tree ptr, tree var, tree_code op)
5147 tree var_type = TREE_TYPE (var);
5148 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5149 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5150 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5152 /* Split the block just before the gsi. Insert a gimple nop to make
5153 this easier. */
5154 gimple *nop = gimple_build_nop ();
5155 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5156 basic_block entry_bb = gsi_bb (*gsi);
5157 edge entry_edge = split_block (entry_bb, nop);
5158 basic_block lock_bb = entry_edge->dest;
5159 /* Reset the iterator. */
5160 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5162 /* Build and insert the locking sequence. */
5163 gimple_seq lock_seq = NULL;
5164 tree lock_var = make_ssa_name (unsigned_type_node);
5165 tree lock_expr = nvptx_global_lock_addr ();
5166 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5167 uns_unlocked, uns_locked);
5168 gimplify_assign (lock_var, lock_expr, &lock_seq);
5169 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5170 NULL_TREE, NULL_TREE);
5171 gimple_seq_add_stmt (&lock_seq, cond);
5172 gimple *lock_end = gimple_seq_last (lock_seq);
5173 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5175 /* Split the block just after the lock sequence. */
5176 edge locked_edge = split_block (lock_bb, lock_end);
5177 basic_block update_bb = locked_edge->dest;
5178 lock_bb = locked_edge->src;
5179 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5181 /* Create the lock loop ... */
5182 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5183 locked_edge->probability = profile_probability::even ();
5184 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5185 loop_edge->probability = profile_probability::even ();
5186 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5187 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5189 /* ... and the loop structure. */
5190 loop *lock_loop = alloc_loop ();
5191 lock_loop->header = lock_bb;
5192 lock_loop->latch = lock_bb;
5193 lock_loop->nb_iterations_estimate = 1;
5194 lock_loop->any_estimate = true;
5195 add_loop (lock_loop, entry_bb->loop_father);
5197 /* Build and insert the reduction calculation. */
5198 gimple_seq red_seq = NULL;
5199 tree acc_in = make_ssa_name (var_type);
5200 tree ref_in = build_simple_mem_ref (ptr);
5201 TREE_THIS_VOLATILE (ref_in) = 1;
5202 gimplify_assign (acc_in, ref_in, &red_seq);
5204 tree acc_out = make_ssa_name (var_type);
5205 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5206 gimplify_assign (acc_out, update_expr, &red_seq);
5208 tree ref_out = build_simple_mem_ref (ptr);
5209 TREE_THIS_VOLATILE (ref_out) = 1;
5210 gimplify_assign (ref_out, acc_out, &red_seq);
5212 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5214 /* Build & insert the unlock sequence. */
5215 gimple_seq unlock_seq = NULL;
5216 tree unlock_expr = nvptx_global_lock_addr ();
5217 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5218 uns_locked, uns_unlocked);
5219 gimplify_and_add (unlock_expr, &unlock_seq);
5220 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5222 return acc_out;
5225 /* Emit a sequence to update a reduction accumulator at *PTR with the
5226 value held in VAR using operator OP. Return the updated value.
5228    TODO: optimize for atomic ops and independent complex ops.  */
5230 static tree
5231 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5232 tree ptr, tree var, tree_code op)
5234 tree type = TREE_TYPE (var);
5235 tree size = TYPE_SIZE (type);
5237 if (size == TYPE_SIZE (unsigned_type_node)
5238 || size == TYPE_SIZE (long_long_unsigned_type_node))
5239 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5240 else
5241 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
5244 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5246 static void
5247 nvptx_goacc_reduction_setup (gcall *call)
5249 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5250 tree lhs = gimple_call_lhs (call);
5251 tree var = gimple_call_arg (call, 2);
5252 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5253 gimple_seq seq = NULL;
5255 push_gimplify_context (true);
5257 if (level != GOMP_DIM_GANG)
5259 /* Copy the receiver object. */
5260 tree ref_to_res = gimple_call_arg (call, 1);
5262 if (!integer_zerop (ref_to_res))
5263 var = build_simple_mem_ref (ref_to_res);
5266 if (level == GOMP_DIM_WORKER)
5268 /* Store incoming value to worker reduction buffer. */
5269 tree offset = gimple_call_arg (call, 5);
5270 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5271 tree ptr = make_ssa_name (TREE_TYPE (call));
5273 gimplify_assign (ptr, call, &seq);
5274 tree ref = build_simple_mem_ref (ptr);
5275 TREE_THIS_VOLATILE (ref) = 1;
5276 gimplify_assign (ref, var, &seq);
5279 if (lhs)
5280 gimplify_assign (lhs, var, &seq);
5282 pop_gimplify_context (NULL);
5283 gsi_replace_with_seq (&gsi, seq, true);
5286 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5288 static void
5289 nvptx_goacc_reduction_init (gcall *call)
5291 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5292 tree lhs = gimple_call_lhs (call);
5293 tree var = gimple_call_arg (call, 2);
5294 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5295 enum tree_code rcode
5296 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5297 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5298 TREE_TYPE (var));
5299 gimple_seq seq = NULL;
5301 push_gimplify_context (true);
5303 if (level == GOMP_DIM_VECTOR)
5305       /* Initialize the non-zero vector lanes to INIT_VAL (OP).  */
5306 tree tid = make_ssa_name (integer_type_node);
5307 tree dim_vector = gimple_call_arg (call, 3);
5308 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5309 dim_vector);
5310 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5311 NULL_TREE, NULL_TREE);
5313 gimple_call_set_lhs (tid_call, tid);
5314 gimple_seq_add_stmt (&seq, tid_call);
5315 gimple_seq_add_stmt (&seq, cond_stmt);
5317 /* Split the block just after the call. */
5318 edge init_edge = split_block (gsi_bb (gsi), call);
5319 basic_block init_bb = init_edge->dest;
5320 basic_block call_bb = init_edge->src;
5322 /* Fixup flags from call_bb to init_bb. */
5323 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5324 init_edge->probability = profile_probability::even ();
5326 /* Set the initialization stmts. */
5327 gimple_seq init_seq = NULL;
5328 tree init_var = make_ssa_name (TREE_TYPE (var));
5329 gimplify_assign (init_var, init, &init_seq);
5330 gsi = gsi_start_bb (init_bb);
5331 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5333 /* Split block just after the init stmt. */
5334 gsi_prev (&gsi);
5335 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5336 basic_block dst_bb = inited_edge->dest;
5338 /* Create false edge from call_bb to dst_bb. */
5339 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5340 nop_edge->probability = profile_probability::even ();
5342 /* Create phi node in dst block. */
5343 gphi *phi = create_phi_node (lhs, dst_bb);
5344 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5345 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5347 /* Reset dominator of dst bb. */
5348 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5350 /* Reset the gsi. */
5351 gsi = gsi_for_stmt (call);
5353 else
5355 if (level == GOMP_DIM_GANG)
5357 /* If there's no receiver object, propagate the incoming VAR. */
5358 tree ref_to_res = gimple_call_arg (call, 1);
5359 if (integer_zerop (ref_to_res))
5360 init = var;
5363 gimplify_assign (lhs, init, &seq);
5366 pop_gimplify_context (NULL);
5367 gsi_replace_with_seq (&gsi, seq, true);
5370 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5372 static void
5373 nvptx_goacc_reduction_fini (gcall *call)
5375 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5376 tree lhs = gimple_call_lhs (call);
5377 tree ref_to_res = gimple_call_arg (call, 1);
5378 tree var = gimple_call_arg (call, 2);
5379 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5380 enum tree_code op
5381 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5382 gimple_seq seq = NULL;
5383   tree r = NULL_TREE;
5385 push_gimplify_context (true);
5387 if (level == GOMP_DIM_VECTOR)
5389       /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
5390 but that requires a method of emitting a unified jump at the
5391 gimple level. */
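      /* Illustratively, for the 32-lane warp this unrolls to five
         shuffle/op steps (hypothetical lane-level view, where
         shuffle_down (v, n) fetches V from the lane N above):

           other = shuffle_down (var, 16);  var = var OP other;
           other = shuffle_down (var, 8);   var = var OP other;
           other = shuffle_down (var, 4);   var = var OP other;
           other = shuffle_down (var, 2);   var = var OP other;
           other = shuffle_down (var, 1);   var = var OP other;

         after which lane 0 holds the reduction over all lanes.  */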
5392 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5394 tree other_var = make_ssa_name (TREE_TYPE (var));
5395 nvptx_generate_vector_shuffle (gimple_location (call),
5396 other_var, var, shfl, &seq);
5398 r = make_ssa_name (TREE_TYPE (var));
5399 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5400 var, other_var), &seq);
5401 var = r;
5404 else
5406 tree accum = NULL_TREE;
5408 if (level == GOMP_DIM_WORKER)
5410 /* Get reduction buffer address. */
5411 tree offset = gimple_call_arg (call, 5);
5412 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5413 tree ptr = make_ssa_name (TREE_TYPE (call));
5415 gimplify_assign (ptr, call, &seq);
5416 accum = ptr;
5418 else if (integer_zerop (ref_to_res))
5419 r = var;
5420 else
5421 accum = ref_to_res;
5423 if (accum)
5425 /* UPDATE the accumulator. */
5426 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5427 seq = NULL;
5428 r = nvptx_reduction_update (gimple_location (call), &gsi,
5429 accum, var, op);
5433 if (lhs)
5434 gimplify_assign (lhs, r, &seq);
5435 pop_gimplify_context (NULL);
5437 gsi_replace_with_seq (&gsi, seq, true);
5440 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5442 static void
5443 nvptx_goacc_reduction_teardown (gcall *call)
5445 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5446 tree lhs = gimple_call_lhs (call);
5447 tree var = gimple_call_arg (call, 2);
5448 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5449 gimple_seq seq = NULL;
5451 push_gimplify_context (true);
5452 if (level == GOMP_DIM_WORKER)
5454 /* Read the worker reduction buffer. */
5455 tree offset = gimple_call_arg (call, 5);
5456       tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5457 tree ptr = make_ssa_name (TREE_TYPE (call));
5459 gimplify_assign (ptr, call, &seq);
5460 var = build_simple_mem_ref (ptr);
5461 TREE_THIS_VOLATILE (var) = 1;
5464 if (level != GOMP_DIM_GANG)
5466 /* Write to the receiver object. */
5467 tree ref_to_res = gimple_call_arg (call, 1);
5469 if (!integer_zerop (ref_to_res))
5470 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5473 if (lhs)
5474 gimplify_assign (lhs, var, &seq);
5476 pop_gimplify_context (NULL);
5478 gsi_replace_with_seq (&gsi, seq, true);
5481 /* NVPTX reduction expander. */
5483 static void
5484 nvptx_goacc_reduction (gcall *call)
5486 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5488 switch (code)
5490 case IFN_GOACC_REDUCTION_SETUP:
5491 nvptx_goacc_reduction_setup (call);
5492 break;
5494 case IFN_GOACC_REDUCTION_INIT:
5495 nvptx_goacc_reduction_init (call);
5496 break;
5498 case IFN_GOACC_REDUCTION_FINI:
5499 nvptx_goacc_reduction_fini (call);
5500 break;
5502 case IFN_GOACC_REDUCTION_TEARDOWN:
5503 nvptx_goacc_reduction_teardown (call);
5504 break;
5506 default:
5507 gcc_unreachable ();
5511 static bool
5512 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
5513 rtx x ATTRIBUTE_UNUSED)
5515 return true;
5518 static bool
5519 nvptx_vector_mode_supported (machine_mode mode)
5521 return (mode == V2SImode
5522 || mode == V2DImode);
5525 /* Return the preferred mode for vectorizing scalar MODE. */
5527 static machine_mode
5528 nvptx_preferred_simd_mode (scalar_mode mode)
5530 switch (mode)
5532 case E_DImode:
5533 return V2DImode;
5534 case E_SImode:
5535 return V2SImode;
5537 default:
5538 return default_preferred_simd_mode (mode);
5542 unsigned int
5543 nvptx_data_alignment (const_tree type, unsigned int basic_align)
5545 if (TREE_CODE (type) == INTEGER_TYPE)
5547 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5548 if (size == GET_MODE_SIZE (TImode))
5549 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
5552 return basic_align;
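/* E.g. (illustrative): a 16-byte __int128 object is given the
   alignment of the mode TImode splits into (64 bits) rather than its
   natural 128-bit alignment, matching how TImode values are handled
   as split halves.  */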
5555 /* Implement TARGET_MODES_TIEABLE_P. */
5557 static bool
5558 nvptx_modes_tieable_p (machine_mode, machine_mode)
5560 return false;
5563 /* Implement TARGET_HARD_REGNO_NREGS. */
5565 static unsigned int
5566 nvptx_hard_regno_nregs (unsigned int, machine_mode)
5568 return 1;
5571 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
5573 static bool
5574 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
5576 return false;
5579 #undef TARGET_OPTION_OVERRIDE
5580 #define TARGET_OPTION_OVERRIDE nvptx_option_override
5582 #undef TARGET_ATTRIBUTE_TABLE
5583 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5585 #undef TARGET_LRA_P
5586 #define TARGET_LRA_P hook_bool_void_false
5588 #undef TARGET_LEGITIMATE_ADDRESS_P
5589 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
5591 #undef TARGET_PROMOTE_FUNCTION_MODE
5592 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
5594 #undef TARGET_FUNCTION_ARG
5595 #define TARGET_FUNCTION_ARG nvptx_function_arg
5596 #undef TARGET_FUNCTION_INCOMING_ARG
5597 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
5598 #undef TARGET_FUNCTION_ARG_ADVANCE
5599 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
5600 #undef TARGET_FUNCTION_ARG_BOUNDARY
5601 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
5602 #undef TARGET_PASS_BY_REFERENCE
5603 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
5604 #undef TARGET_FUNCTION_VALUE_REGNO_P
5605 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
5606 #undef TARGET_FUNCTION_VALUE
5607 #define TARGET_FUNCTION_VALUE nvptx_function_value
5608 #undef TARGET_LIBCALL_VALUE
5609 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
5610 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
5611 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
5612 #undef TARGET_GET_DRAP_RTX
5613 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
5614 #undef TARGET_SPLIT_COMPLEX_ARG
5615 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
5616 #undef TARGET_RETURN_IN_MEMORY
5617 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
5618 #undef TARGET_OMIT_STRUCT_RETURN_REG
5619 #define TARGET_OMIT_STRUCT_RETURN_REG true
5620 #undef TARGET_STRICT_ARGUMENT_NAMING
5621 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
5622 #undef TARGET_CALL_ARGS
5623 #define TARGET_CALL_ARGS nvptx_call_args
5624 #undef TARGET_END_CALL_ARGS
5625 #define TARGET_END_CALL_ARGS nvptx_end_call_args
5627 #undef TARGET_ASM_FILE_START
5628 #define TARGET_ASM_FILE_START nvptx_file_start
5629 #undef TARGET_ASM_FILE_END
5630 #define TARGET_ASM_FILE_END nvptx_file_end
5631 #undef TARGET_ASM_GLOBALIZE_LABEL
5632 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
5633 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
5634 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
5635 #undef TARGET_PRINT_OPERAND
5636 #define TARGET_PRINT_OPERAND nvptx_print_operand
5637 #undef TARGET_PRINT_OPERAND_ADDRESS
5638 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
5639 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
5640 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
5641 #undef TARGET_ASM_INTEGER
5642 #define TARGET_ASM_INTEGER nvptx_assemble_integer
5643 #undef TARGET_ASM_DECL_END
5644 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
5645 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
5646 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
5647 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
5648 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
5649 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
5650 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
5652 #undef TARGET_MACHINE_DEPENDENT_REORG
5653 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
5654 #undef TARGET_NO_REGISTER_ALLOCATION
5655 #define TARGET_NO_REGISTER_ALLOCATION true
5657 #undef TARGET_ENCODE_SECTION_INFO
5658 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
5659 #undef TARGET_RECORD_OFFLOAD_SYMBOL
5660 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
5662 #undef TARGET_VECTOR_ALIGNMENT
5663 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
5665 #undef TARGET_CANNOT_COPY_INSN_P
5666 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
5668 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
5669 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
5671 #undef TARGET_INIT_BUILTINS
5672 #define TARGET_INIT_BUILTINS nvptx_init_builtins
5673 #undef TARGET_EXPAND_BUILTIN
5674 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
5675 #undef TARGET_BUILTIN_DECL
5676 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
5678 #undef TARGET_SIMT_VF
5679 #define TARGET_SIMT_VF nvptx_simt_vf
5681 #undef TARGET_GOACC_VALIDATE_DIMS
5682 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
5684 #undef TARGET_GOACC_DIM_LIMIT
5685 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
5687 #undef TARGET_GOACC_FORK_JOIN
5688 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
5690 #undef TARGET_GOACC_REDUCTION
5691 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
5693 #undef TARGET_CANNOT_FORCE_CONST_MEM
5694 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
5696 #undef TARGET_VECTOR_MODE_SUPPORTED_P
5697 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
5699 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
5700 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
5701 nvptx_preferred_simd_mode
5703 #undef TARGET_MODES_TIEABLE_P
5704 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
5706 #undef TARGET_HARD_REGNO_NREGS
5707 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
5709 #undef TARGET_CAN_CHANGE_MODE_CLASS
5710 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
5712 struct gcc_target targetm = TARGET_INITIALIZER;
5714 #include "gt-nvptx.h"