gcc/config/nvptx/nvptx.cc

/* Target code for NVPTX.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "omp-offload.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"
#include "opts.h"
#include "tree-pretty-print.h"
#include "rtl-iter.h"
#include "cgraph.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

/* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept
   thread block, which has had a maximum number of threads of 1024 since
   CUDA version 2.x.  */
#define PTX_CTA_SIZE 1024

#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32

#define PTX_PER_CTA_BARRIER 0
#define PTX_NUM_PER_CTA_BARRIERS 1

#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
#define PTX_NUM_PER_WORKER_BARRIERS \
  (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)

#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area) ((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		     & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
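/* Illustrative round trip (editor's sketch, not in the original file):

     rtx sym = gen_rtx_SYMBOL_REF (Pmode, "x");
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);

   The area lives in three machine-dependent bits starting at
   SYMBOL_FLAG_MACH_DEP_SHIFT, hence the mask of 7 above.  */
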
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Buffer needed for vector reductions, when vector_length >
   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
   array, as both may be live concurrently.  */
static unsigned vector_red_size;
static unsigned vector_red_align;
static unsigned vector_red_partition;
static GTY(()) rtx vector_red_sym;

/* Shared memory block for gang-private variables.  */
static unsigned gang_private_shared_size;
static unsigned gang_private_shared_align;
static GTY(()) rtx gang_private_shared_sym;
static hash_map<tree_decl_hash, unsigned int> gang_private_shared_hmap;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

static int nvptx_mach_max_workers ();

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with %<-fopenacc%>", optname);
}

static enum ptx_version
first_ptx_version_supporting_sm (enum ptx_isa sm)
{
  switch (sm)
    {
    case PTX_ISA_SM30:
      return PTX_VERSION_3_0;
    case PTX_ISA_SM35:
      return PTX_VERSION_3_1;
    case PTX_ISA_SM53:
      return PTX_VERSION_4_2;
    case PTX_ISA_SM70:
      return PTX_VERSION_6_0;
    case PTX_ISA_SM75:
      return PTX_VERSION_6_3;
    case PTX_ISA_SM80:
      return PTX_VERSION_7_0;
    default:
      gcc_unreachable ();
    }
}

static enum ptx_version
default_ptx_version_option (void)
{
  enum ptx_version first
    = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);

  /* Pick a version that supports the sm.  */
  enum ptx_version res = first;

  /* Pick at least 3.1.  This has been the smallest version historically.  */
  res = MAX (res, PTX_VERSION_3_1);

  /* Pick at least 6.0, to enable using bar.warp.sync to have a way to force
     warp convergence.  */
  res = MAX (res, PTX_VERSION_6_0);

  /* Verify that we pick a version that supports the sm.  */
  gcc_assert (first <= res);
  return res;
}

static const char *
ptx_version_to_string (enum ptx_version v)
{
  switch (v)
    {
    case PTX_VERSION_3_0:
      return "3.0";
    case PTX_VERSION_3_1:
      return "3.1";
    case PTX_VERSION_4_2:
      return "4.2";
    case PTX_VERSION_6_0:
      return "6.0";
    case PTX_VERSION_6_3:
      return "6.3";
    case PTX_VERSION_7_0:
      return "7.0";
    default:
      gcc_unreachable ();
    }
}

unsigned int
ptx_version_to_number (enum ptx_version v, bool major_p)
{
  switch (v)
    {
    case PTX_VERSION_3_0:
      return major_p ? 3 : 0;
    case PTX_VERSION_3_1:
      return major_p ? 3 : 1;
    case PTX_VERSION_4_2:
      return major_p ? 4 : 2;
    case PTX_VERSION_6_0:
      return major_p ? 6 : 0;
    case PTX_VERSION_6_3:
      return major_p ? 6 : 3;
    case PTX_VERSION_7_0:
      return major_p ? 7 : 0;
    default:
      gcc_unreachable ();
    }
}

static const char *
sm_version_to_string (enum ptx_isa sm)
{
  switch (sm)
    {
#define NVPTX_SM(XX, SEP) \
    case PTX_ISA_SM ## XX: \
      return #XX;
#include "nvptx-sm.def"
#undef NVPTX_SM
    default:
      gcc_unreachable ();
    }
}

static void
handle_ptx_version_option (void)
{
  if (!OPTION_SET_P (ptx_version_option)
      || ptx_version_option == PTX_VERSION_default)
    {
      ptx_version_option = default_ptx_version_option ();
      return;
    }

  enum ptx_version first
    = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);

  if (ptx_version_option < first)
    error ("PTX version (%<-mptx%>) needs to be at least %s to support selected"
	   " %<-misa%> (sm_%s)", ptx_version_to_string (first),
	   sm_version_to_string ((enum ptx_isa) ptx_isa_option));
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Via nvptx 'OPTION_DEFAULT_SPECS', '-misa' always appears on the command
     line; but handle the case that the compiler is not run via the driver.  */
  if (!OPTION_SET_P (ptx_isa_option))
    fatal_error (UNKNOWN_LOCATION, "%<-march=%> must be specified");

  handle_ptx_version_option ();

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!OPTION_SET_P (flag_toplevel_reorder))
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!OPTION_SET_P (flag_no_common))
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  HOST_WIDE_INT patch_area_size, patch_area_entry;
  parse_and_check_patch_area (flag_patchable_function_entry, false,
			      &patch_area_size, &patch_area_entry);
  if (patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared");
  SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED);
  gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_HFmode:
      return ".f16";
    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}

/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (VAR_P (decl))
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* Return NULL if NAME contains no dot.  Otherwise return a copy of NAME
   with the dots replaced with dollar signs.  */

static char *
nvptx_replace_dot (const char *name)
{
  if (strchr (name, '.') == NULL)
    return NULL;

  char *p = xstrdup (name);
  for (size_t i = 0; i < strlen (p); ++i)
    if (p[i] == '.')
      p[i] = '$';
  return p;
}
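/* Example (editor's illustration): a name such as "foo.bar" comes back as
   a freshly xstrdup'ed "foo$bar", which the caller must XDELETE; a name
   without dots returns NULL and is used unchanged.  */
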
/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}

/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}
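/* Printed forms, derived from the cases above (editor's note): pseudo 35
   with no inner mode prints as "%r35"; with subreg_offset 4 as "%r35$4";
   and a TImode pseudo split into DImode halves prints as the pair
   "{%r35$8,%r35$0}".  */
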
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}

/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (VECTOR_TYPE_P (type))
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
{
  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  return gen_reg_rtx (arg.mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v,
			     const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (arg.mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the
   place where function FUNC returns or receives a value of data type
   TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  return pass_in_memory (arg.mode, arg.type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}
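/* Worked example (editor's illustration): for ARGNO 0 and SImode, the
   prototype branch (FOR_REG < 0) appends " (.param.u32 %in_ar0", while
   the prologue branch (FOR_REG == 0) emits ".reg.u32 %ar0;" followed by
   "ld.param.u32 %ar0, [%in_ar0];".  */
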
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}
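/* So, assuming reg_names[NVPTX_RETURN_REGNUM] prints as "%value"
   (editor's illustration): a prototype yields "(.param.u32 %value_out) "
   and a prologue yields ".reg.u32 %value;".  */
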
/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Helper function for write_fn_proto.  */

static void
write_fn_proto_1 (std::stringstream &s, bool is_defn,
		  const char *name, const_tree decl, bool force_public)
{
  if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL)
    write_fn_marker (s, is_defn, TREE_PUBLIC (decl) || force_public, name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl) || force_public)
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (argno < 2 && strcmp (name, "main") == 0)
    {
      if (argno == 0)
	argno = write_arg_type (s, -1, argno, integer_type_node, true);

      if (argno == 1)
	argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");
}

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static void
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl, bool force_public = false)
{
  const char *replacement = nvptx_name_replacement (name);
  char *replaced_dots = NULL;
  if (replacement != name)
    name = replacement;
  else
    {
      replaced_dots = nvptx_replace_dot (name);
      if (replaced_dots)
	name = replaced_dots;
    }
  if (name[0] == '*')
    name++;

  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    write_fn_proto_1 (s, false, name, decl, force_public);

  write_fn_proto_1 (s, is_defn, name, decl, force_public);

  if (replaced_dots)
    XDELETE (replaced_dots);
}

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  char *replaced_dots = NULL;

  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;
  if (replaced_dots)
    XDELETE (replaced_dots);

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  : "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
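/* E.g. (editor's illustration, assuming a 64-bit target where REGNO
   prints as "%frame"): a 16-byte frame aligned to 8 yields

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   while a zero-sized frame only declares the register and zeroes it.  */
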
/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
	       "// vector reduction buffer\n",
	       REGNO (cfun->machine->red_partition),
	       vector_red_partition);
    }
  /* Verify vector_red_size.  */
  gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
	      <= vector_red_size);
  fprintf (file, "\t}\n");
}

/* Emit code to initialize OpenACC worker broadcast and synchronization
   registers.  */

static void
nvptx_init_oacc_workers (FILE *file)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
	       "// vector broadcast offset\n",
	       REGNO (cfun->machine->bcast_partition),
	       oacc_bcast_partition);
    }
  /* Verify oacc_bcast_size.  */
  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
	      <= oacc_bcast_size);
  if (cfun->machine->sync_bar)
    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
	     "// vector synchronization barrier\n",
	     REGNO (cfun->machine->sync_bar));
  fprintf (file, "\t}\n");
}

/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      if (cfun->machine->unisimt_outside_simt_predicate)
	{
	  int pred_outside_simt
	    = REGNO (cfun->machine->unisimt_outside_simt_predicate);
	  fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, 0;\n",
		   pred_outside_simt, master);
	}
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}

/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn) (void *), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
		 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
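/* Editor's note on the template trick above: the embedded '\0' splits each
   entry string in two, ENTRY_1 ending just after "mov.uPS %R0, " and
   ENTRY_2 holding the rest, so the single fprintf splices the original
   function name ORIG in between, producing "mov.uPS %R0, ORIG;".  */
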
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;
  bool force_public = false;

  /* For reverse-offload 'nohost' functions: In order to be collectable in
     '$offload_func_table', cf. mkoffload.cc, the function has to be visible.  */
  if (lookup_attribute ("omp target device_ancestor_nohost",
			DECL_ATTRIBUTES (decl)))
    force_public = true;
  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl, force_public);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str ().c_str ());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_variadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_variadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Restore the vector reduction partition register, if necessary.
     FIXME: Find out when and why this is necessary, and fix it.  */
  if (cfun->machine->red_partition)
    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
      = cfun->machine->red_partition;

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
    nvptx_init_oacc_workers (file);
}

/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (cfun->machine->unisimt_outside_simt_predicate)
    {
      int pred_outside_simt
	= REGNO (cfun->machine->unisimt_outside_simt_predicate);
      fprintf (file, "\t\tmov.pred %%r%d, %d;\n", pred_outside_simt,
	       entering ? 0 : 1);
    }
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}

/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}

/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  */

const char *
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
  return "";
}

/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.  */

const char *
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
  return "";
}

/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode) cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
	     nvptx_ptx_type_from_mode (mode, false),
	     reg_names[NVPTX_RETURN_REGNUM],
	     reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  if (TARGET_SOFT_STACK && stack_realign_drap)
    return arg_pointer_rtx;
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (cumulative_args_t, rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_variadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
	{
	  cfun->machine->is_variadic = true;
	  cfun->machine->has_variadic = true;
	  cfun->machine->num_args++;
	}
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (cumulative_args_t)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}

1828 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1829 track of whether calls involving static chains or varargs were seen
1830 in the current function.
1831 For libcalls, maintain a hash table of decls we have seen, and
1832 record a function decl for later when encountering a new one. */
1834 void
1835 nvptx_expand_call (rtx retval, rtx address)
1837 rtx callee = XEXP (address, 0);
1838 rtx varargs = NULL_RTX;
1839 unsigned parallel = 0;
1841 if (!call_insn_operand (callee, Pmode))
1843 callee = force_reg (Pmode, callee);
1844 address = change_address (address, QImode, callee);
1847 if (GET_CODE (callee) == SYMBOL_REF)
1849 tree decl = SYMBOL_REF_DECL (callee);
1850 if (decl != NULL_TREE)
1852 if (DECL_STATIC_CHAIN (decl))
1853 cfun->machine->has_chain = true;
1855 tree attr = oacc_get_fn_attrib (decl);
1856 if (attr)
1858 tree dims = TREE_VALUE (attr);
1860 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1861 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1863 if (TREE_PURPOSE (dims)
1864 && !integer_zerop (TREE_PURPOSE (dims)))
1865 break;
1866 /* Not on this axis. */
1867 parallel ^= GOMP_DIM_MASK (ix);
1868 dims = TREE_CHAIN (dims);
1874 unsigned nargs = cfun->machine->num_args;
1875 if (cfun->machine->is_variadic)
1877 varargs = gen_reg_rtx (Pmode);
1878 emit_move_insn (varargs, stack_pointer_rtx);
1881 rtvec vec = rtvec_alloc (nargs + 1);
1882 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1883 int vec_pos = 0;
1885 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1886 rtx tmp_retval = retval;
1887 if (retval)
1889 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1890 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1891 call = gen_rtx_SET (tmp_retval, call);
1893 XVECEXP (pat, 0, vec_pos++) = call;
1895 /* Construct the call insn, including a USE for each argument pseudo
1896 register. These will be used when printing the insn. */
1897 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1898 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1900 if (varargs)
1901 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1903 gcc_assert (vec_pos == XVECLEN (pat, 0));
1905 nvptx_emit_forking (parallel, true);
1906 emit_call_insn (pat);
1907 nvptx_emit_joining (parallel, true);
1909 if (tmp_retval != retval)
1910 emit_move_insn (retval, tmp_retval);
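/* A sketch of the pattern built above, for a two-argument call with a
   return value (argument pseudos A1 and A2 are illustrative):

     (parallel [(set (reg retval)
		     (call (mem:QI (symbol_ref "foo")) (const_int 0)))
		(use (reg A1))
		(use (reg A2))])

   with one more (use ...) of the varargs pointer for variadic calls.  */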
1913 /* Emit a comparison COMPARE, and return the new test to be used in the
1914 jump. */
1916 rtx
1917 nvptx_expand_compare (rtx compare)
1919 rtx pred = gen_reg_rtx (BImode);
1920 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1921 XEXP (compare, 0), XEXP (compare, 1));
1922 emit_insn (gen_rtx_SET (pred, cmp));
1923 return gen_rtx_NE (BImode, pred, const0_rtx);
1926 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1928 void
1929 nvptx_expand_oacc_fork (unsigned mode)
1931 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1934 void
1935 nvptx_expand_oacc_join (unsigned mode)
1937 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1940 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1941 objects. */
1943 static rtx
1944 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1946 rtx res;
1948 switch (GET_MODE (src))
1950 case E_DImode:
1951 res = gen_unpackdisi2 (dst0, dst1, src);
1952 break;
1953 case E_DFmode:
1954 res = gen_unpackdfsi2 (dst0, dst1, src);
1955 break;
1956 default: gcc_unreachable ();
1958 return res;
1961 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1962 object. */
1964 static rtx
1965 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1967 rtx res;
1969 switch (GET_MODE (dst))
1971 case E_DImode:
1972 res = gen_packsidi2 (dst, src0, src1);
1973 break;
1974 case E_DFmode:
1975 res = gen_packsidf2 (dst, src0, src1);
1976 break;
1977 default: gcc_unreachable ();
1979 return res;
1982 /* Generate an instruction or sequence to shuffle register SRC into
1983 register DST across the lanes of a single warp, as per IDX and KIND. */
1985 static rtx
1986 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1988 rtx res;
1990 switch (GET_MODE (dst))
1992 case E_DCmode:
1993 case E_CDImode:
1995 gcc_assert (GET_CODE (dst) == CONCAT);
1996 gcc_assert (GET_CODE (src) == CONCAT);
1997 rtx dst_real = XEXP (dst, 0);
1998 rtx dst_imag = XEXP (dst, 1);
1999 rtx src_real = XEXP (src, 0);
2000 rtx src_imag = XEXP (src, 1);
2002 start_sequence ();
2003 emit_insn (nvptx_gen_shuffle (dst_real, src_real, idx, kind));
2004 emit_insn (nvptx_gen_shuffle (dst_imag, src_imag, idx, kind));
2005 res = get_insns ();
2006 end_sequence ();
2008 break;
2009 case E_SImode:
2010 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
2011 break;
2012 case E_SFmode:
2013 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
2014 break;
2015 case E_DImode:
2016 case E_DFmode:
2018 rtx tmp0 = gen_reg_rtx (SImode);
2019 rtx tmp1 = gen_reg_rtx (SImode);
2021 start_sequence ();
2022 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
2023 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2024 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2025 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
2026 res = get_insns ();
2027 end_sequence ();
2029 break;
2030 case E_V2SImode:
2032 rtx src0 = gen_rtx_SUBREG (SImode, src, 0);
2033 rtx src1 = gen_rtx_SUBREG (SImode, src, 4);
2034 rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0);
2035 rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4);
2036 rtx tmp0 = gen_reg_rtx (SImode);
2037 rtx tmp1 = gen_reg_rtx (SImode);
2038 start_sequence ();
2039 emit_insn (gen_movsi (tmp0, src0));
2040 emit_insn (gen_movsi (tmp1, src1));
2041 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2042 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2043 emit_insn (gen_movsi (dst0, tmp0));
2044 emit_insn (gen_movsi (dst1, tmp1));
2045 res = get_insns ();
2046 end_sequence ();
2048 break;
2049 case E_V2DImode:
2051 rtx src0 = gen_rtx_SUBREG (DImode, src, 0);
2052 rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
2053 rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
2054 rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
2055 rtx tmp0 = gen_reg_rtx (DImode);
2056 rtx tmp1 = gen_reg_rtx (DImode);
2057 start_sequence ();
2058 emit_insn (gen_movdi (tmp0, src0));
2059 emit_insn (gen_movdi (tmp1, src1));
2060 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2061 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2062 emit_insn (gen_movdi (dst0, tmp0));
2063 emit_insn (gen_movdi (dst1, tmp1));
2064 res = get_insns ();
2065 end_sequence ();
2067 break;
2068 case E_BImode:
2070 rtx tmp = gen_reg_rtx (SImode);
2072 start_sequence ();
2073 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
2074 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
2075 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
2076 res = get_insns ();
2077 end_sequence ();
2079 break;
2080 case E_QImode:
2081 case E_HImode:
2083 rtx tmp = gen_reg_rtx (SImode);
2085 start_sequence ();
2086 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
2087 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
2088 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
2089 tmp)));
2090 res = get_insns ();
2091 end_sequence ();
2093 break;
2095 default:
2096 gcc_unreachable ();
2098 return res;
2101 /* Generate an instruction or sequence to broadcast register REG
2102 across the vectors of a single warp. */
2104 static rtx
2105 nvptx_gen_warp_bcast (rtx reg)
2107 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
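/* With SHUFFLE_IDX and index 0, every lane reads lane 0's value; in PTX
   this comes out as something like (modulo the exact .sync variant used):

	shfl.idx.b32	%r, %r, 0, 31;  */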
2110 /* Structure used when generating a worker-level spill or fill. */
2112 struct broadcast_data_t
2114 rtx base; /* Register holding base addr of buffer. */
2115 rtx ptr; /* Iteration var, if needed. */
2116 unsigned offset; /* Offset into worker buffer. */
2119 /* Direction of the spill/fill and looping setup/teardown indicator. */
2121 enum propagate_mask
2123 PM_read = 1 << 0,
2124 PM_write = 1 << 1,
2125 PM_loop_begin = 1 << 2,
2126 PM_loop_end = 1 << 3,
2128 PM_read_write = PM_read | PM_write
2131 /* Generate instruction(s) to spill or fill register REG to/from the
2132 worker broadcast array. PM indicates what is to be done, REP
2133 how many loop iterations will be executed (0 for not a loop). */
2135 static rtx
2136 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
2137 broadcast_data_t *data, bool vector)
2139 rtx res;
2140 machine_mode mode = GET_MODE (reg);
2142 switch (mode)
2144 case E_BImode:
2146 rtx tmp = gen_reg_rtx (SImode);
2148 start_sequence ();
2149 if (pm & PM_read)
2150 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
2151 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
2152 if (pm & PM_write)
2153 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
2154 res = get_insns ();
2155 end_sequence ();
2157 break;
2159 default:
2161 rtx addr = data->ptr;
2163 if (!addr)
2165 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
2167 oacc_bcast_align = MAX (oacc_bcast_align, align);
2168 data->offset = ROUND_UP (data->offset, align);
2169 addr = data->base;
2170 gcc_assert (data->base != NULL);
2171 if (data->offset)
2172 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
2175 addr = gen_rtx_MEM (mode, addr);
2176 if (pm == PM_read)
2177 res = gen_rtx_SET (addr, reg);
2178 else if (pm == PM_write)
2179 res = gen_rtx_SET (reg, addr);
2180 else
2181 gcc_unreachable ();
2183 if (data->ptr)
2185 /* We're using a ptr, increment it. */
2186 start_sequence ();
2188 emit_insn (res);
2189 emit_insn (gen_adddi3 (data->ptr, data->ptr,
2190 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
2191 res = get_insns ();
2192 end_sequence ();
2194 else
2195 rep = 1;
2196 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
2198 break;
2200 return res;
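/* Note the naming above: PM_read reads the *register* (storing it into
   the broadcast buffer), while PM_write writes the register (loading it
   back from the buffer).  */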
2203 /* Returns true if X is a valid address for use in a memory reference. */
2205 static bool
2206 nvptx_legitimate_address_p (machine_mode, rtx x, bool, code_helper)
2208 enum rtx_code code = GET_CODE (x);
2210 switch (code)
2212 case REG:
2213 return true;
2215 case PLUS:
2216 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
2217 return true;
2218 return false;
2220 case CONST:
2221 case SYMBOL_REF:
2222 case LABEL_REF:
2223 return true;
2225 default:
2226 return false;
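/* The forms accepted above correspond to what a PTX memory operand can
   express directly, e.g. [%r1], [%r1+16] and [symbol].  */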
2230 /* Machinery to output constant initializers. When beginning an
2231 initializer, we decide on a fragment size (which is visible in ptx
2232 in the type used), and then all initializer data is buffered until
2233 a fragment is filled and ready to be written out. */
2235 static struct
2237 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
2238 unsigned HOST_WIDE_INT val; /* Current fragment value. */
2239 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
2240 out. */
2241 unsigned size; /* Fragment size to accumulate. */
2242 unsigned offset; /* Offset within current fragment. */
2243 bool started; /* Whether we've output any initializer. */
2244 } init_frag;
2246 /* The current fragment is full, write it out. SYM may provide a
2247 symbolic reference we should output, in which case the fragment
2248 value is the addend. */
2250 static void
2251 output_init_frag (rtx sym)
2253 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
2254 unsigned HOST_WIDE_INT val = init_frag.val;
2256 init_frag.started = true;
2257 init_frag.val = 0;
2258 init_frag.offset = 0;
2259 init_frag.remaining--;
2261 if (sym)
2263 bool function = (SYMBOL_REF_DECL (sym)
2264 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
2265 if (!function)
2266 fprintf (asm_out_file, "generic(");
2267 output_address (VOIDmode, sym);
2268 if (!function)
2269 fprintf (asm_out_file, ")");
2270 if (val)
2271 fprintf (asm_out_file, " + ");
2274 if (!sym || val)
2275 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2278 /* Add value VAL of size SIZE to the data we're emitting, and keep
2279 writing out chunks as they fill up. */
2281 static void
2282 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2284 bool negative_p
2285 = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1));
2287 /* Avoid undefined behaviour. */
2288 if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT)
2289 val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1;
2291 for (unsigned part = 0; size; size -= part)
2293 if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT)
2294 /* Avoid undefined behaviour. */
2295 val = negative_p ? -1 : 0;
2296 else
2297 val >>= (part * BITS_PER_UNIT);
2298 part = init_frag.size - init_frag.offset;
2299 part = MIN (part, size);
2301 unsigned HOST_WIDE_INT partial
2302 = val << (init_frag.offset * BITS_PER_UNIT);
2303 init_frag.val |= partial & init_frag.mask;
2304 init_frag.offset += part;
2306 if (init_frag.offset == init_frag.size)
2307 output_init_frag (NULL);
2311 /* Target hook for assembling integer object X of size SIZE. */
2313 static bool
2314 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2316 HOST_WIDE_INT val = 0;
2318 switch (GET_CODE (x))
2320 default:
2321 /* Let the generic machinery figure it out, usually for a
2322 CONST_WIDE_INT. */
2323 return false;
2325 case CONST_INT:
2326 nvptx_assemble_value (INTVAL (x), size);
2327 break;
2329 case CONST:
2330 x = XEXP (x, 0);
2331 gcc_assert (GET_CODE (x) == PLUS);
2332 val = INTVAL (XEXP (x, 1));
2333 x = XEXP (x, 0);
2334 gcc_assert (GET_CODE (x) == SYMBOL_REF);
2335 gcc_fallthrough (); /* FALLTHROUGH */
2337 case SYMBOL_REF:
2338 gcc_assert (size == init_frag.size);
2339 if (init_frag.offset)
2340 sorry ("cannot emit unaligned pointers in ptx assembly");
2342 nvptx_maybe_record_fnsym (x);
2343 init_frag.val = val;
2344 output_init_frag (x);
2345 break;
2348 return true;
2351 /* Output SIZE zero bytes. We ignore the FILE argument since the
2352 functions we're calling to perform the output just use
2353 asm_out_file. */
2355 void
2356 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2358 /* Finish the current fragment, if it's started. */
2359 if (init_frag.offset)
2361 unsigned part = init_frag.size - init_frag.offset;
2362 part = MIN (part, (unsigned)size);
2363 size -= part;
2364 nvptx_assemble_value (0, part);
2367 /* If this skip doesn't terminate the initializer, write as many
2368 remaining pieces as possible directly. */
2369 if (size < init_frag.remaining * init_frag.size)
2371 while (size >= init_frag.size)
2373 size -= init_frag.size;
2374 output_init_frag (NULL_RTX);
2376 if (size)
2377 nvptx_assemble_value (0, size);
2381 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2382 ignore the FILE arg. */
2384 void
2385 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2387 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2388 nvptx_assemble_value (str[i], 1);
2391 /* Return true if TYPE is a record type whose last field is an array
2392 without a given dimension, i.e. a flexible array member. */
2394 static bool
2395 flexible_array_member_type_p (const_tree type)
2397 if (TREE_CODE (type) != RECORD_TYPE)
2398 return false;
2400 const_tree last_field = NULL_TREE;
2401 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2402 last_field = f;
2404 if (!last_field)
2405 return false;
2407 const_tree last_field_type = TREE_TYPE (last_field);
2408 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2409 return false;
2411 return (! TYPE_DOMAIN (last_field_type)
2412 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
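/* For example, this returns true for the type of

     struct s { int len; char data[]; };

   where the trailing array has no upper bound on its domain.  */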
2415 /* Emit a PTX variable decl and prepare for emission of its
2416 initializer. NAME is the symbol name and SECTION the PTX data
2417 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2418 The caller has already emitted any indentation and linkage
2419 specifier. It is responsible for any initializer, terminating ;
2420 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2421 this is the opposite way round that PTX wants them! */
2423 static void
2424 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2425 const_tree type, HOST_WIDE_INT size, unsigned align,
2426 bool undefined = false)
2428 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2429 && (TYPE_DOMAIN (type) == NULL_TREE);
2431 if (undefined && flexible_array_member_type_p (type))
2433 size = 0;
2434 atype = true;
2437 while (TREE_CODE (type) == ARRAY_TYPE)
2438 type = TREE_TYPE (type);
2440 if (TREE_CODE (type) == VECTOR_TYPE
2441 || TREE_CODE (type) == COMPLEX_TYPE)
2442 /* Neither vector nor complex types can contain the other. */
2443 type = TREE_TYPE (type);
2445 unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type);
2447 /* Largest mode we're prepared to accept. For BLKmode types we
2448 don't know if it'll contain pointer constants, so have to choose
2449 pointer size, otherwise we can choose DImode. */
2450 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2452 elt_size |= GET_MODE_SIZE (elt_mode);
2453 elt_size &= -elt_size; /* Extract LSB set. */
2455 init_frag.size = elt_size;
2456 /* Avoid undefined shift behavior by using '2'. */
2457 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2458 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2459 init_frag.val = 0;
2460 init_frag.offset = 0;
2461 init_frag.started = false;
2462 /* Size might not be a multiple of elt size, if there's an
2463 initialized trailing struct array with smaller type than
2464 elt_size. */
2465 init_frag.remaining = (size + elt_size - 1) / elt_size;
2467 fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ",
2468 section, align / BITS_PER_UNIT,
2469 elt_size * BITS_PER_UNIT);
2470 assemble_name (file, name);
2472 if (size)
2473 /* We make everything an array, to simplify any initialization
2474 emission. */
2475 fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining);
2476 else if (atype)
2477 fprintf (file, "[]");
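/* As an illustration, a 12-byte global array of 'int' aligned to 4 bytes
   starts out as something like

	.global .align 4 .u32 name[3]

   and any initializer is then appended as " = { ... }" fragment by
   fragment via output_init_frag.  */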
2480 /* Called when the initializer for a decl has been completely output through
2481 combinations of the three functions above. */
2483 static void
2484 nvptx_assemble_decl_end (void)
2486 if (init_frag.offset)
2487 /* This can happen with a packed struct with trailing array member. */
2488 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2489 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2492 /* Output an uninitialized common or file-scope variable. */
2494 void
2495 nvptx_output_aligned_decl (FILE *file, const char *name,
2496 const_tree decl, HOST_WIDE_INT size, unsigned align)
2498 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2500 /* If this is public, it is common. The nearest thing we have to
2501 common is weak. */
2502 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2504 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2505 TREE_TYPE (decl), size, align);
2506 nvptx_assemble_decl_end ();
2509 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2510 writing a constant variable EXP with NAME and SIZE and its
2511 initializer to FILE. */
2513 static void
2514 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2515 const_tree exp, HOST_WIDE_INT obj_size)
2517 write_var_marker (file, true, false, name);
2519 fprintf (file, "\t");
2521 tree type = TREE_TYPE (exp);
2522 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2523 TYPE_ALIGN (type));
2526 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2527 a variable DECL with NAME to FILE. */
2529 void
2530 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2532 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2534 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2535 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2537 tree type = TREE_TYPE (decl);
2538 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2539 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2540 type, obj_size, DECL_ALIGN (decl));
2543 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2545 static void
2546 nvptx_globalize_label (FILE *, const char *)
2550 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2551 declaration only for variable DECL with NAME to FILE. */
2553 static void
2554 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2556 /* The middle end can place constant pool decls into the varpool as
2557 undefined. Until that is fixed, catch the problem here. */
2558 if (DECL_IN_CONSTANT_POOL (decl))
2559 return;
2561 /* We support weak definitions, and hence have the right
2562 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2563 if (DECL_WEAK (decl))
2564 error_at (DECL_SOURCE_LOCATION (decl),
2565 "PTX does not support weak declarations"
2566 " (only weak definitions)");
2567 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2569 fprintf (file, "\t.extern ");
2570 tree size = DECL_SIZE_UNIT (decl);
2571 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2572 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2573 DECL_ALIGN (decl), true);
2574 nvptx_assemble_decl_end ();
2577 /* Output a pattern for a move instruction. */
2579 const char *
2580 nvptx_output_mov_insn (rtx dst, rtx src)
2582 machine_mode dst_mode = GET_MODE (dst);
2583 machine_mode src_mode = GET_MODE (src);
2584 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2585 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2586 machine_mode src_inner = (GET_CODE (src) == SUBREG
2587 ? GET_MODE (XEXP (src, 0)) : src_mode);
2589 rtx sym = src;
2590 if (GET_CODE (sym) == CONST)
2591 sym = XEXP (XEXP (sym, 0), 0);
2592 if (SYMBOL_REF_P (sym))
2594 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2595 return "%.\tcvta%D1%t0\t%0, %1;";
2596 nvptx_maybe_record_fnsym (sym);
2599 if (src_inner == dst_inner)
2600 return "%.\tmov%t0\t%0, %1;";
2602 if (CONSTANT_P (src))
2603 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2604 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2605 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2607 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2609 if (GET_MODE_BITSIZE (dst_mode) == 128
2610 && GET_MODE_BITSIZE (src_mode) == 128)
2612 /* mov.b128 is not supported. */
2613 if (dst_inner == V2DImode && src_inner == TImode)
2614 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2615 else if (dst_inner == TImode && src_inner == V2DImode)
2616 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2618 gcc_unreachable ();
2620 return "%.\tmov.b%T0\t%0, %1;";
2623 if (GET_MODE_BITSIZE (src_inner) == 128
2624 && GET_MODE_BITSIZE (src_mode) == 64)
2625 return "%.\tmov.b%T0\t%0, %1;";
2627 return "%.\tcvt%t0%t1\t%0, %1;";
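/* E.g. a plain SImode register copy yields "mov.u32", a same-size
   float/int reinterpretation "mov.b32", and a DImode-to-SImode move
   "cvt.u32.u64" (suffixes per nvptx_ptx_type_from_mode).  */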
2630 /* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL. */
2632 static void
2633 nvptx_output_barrier (rtx *mem_operand, int memmodel, bool pre_p)
2635 bool post_p = !pre_p;
2637 switch (memmodel)
2639 case MEMMODEL_RELAXED:
2640 return;
2641 case MEMMODEL_CONSUME:
2642 case MEMMODEL_ACQUIRE:
2643 case MEMMODEL_SYNC_ACQUIRE:
2644 if (post_p)
2645 break;
2646 return;
2647 case MEMMODEL_RELEASE:
2648 case MEMMODEL_SYNC_RELEASE:
2649 if (pre_p)
2650 break;
2651 return;
2652 case MEMMODEL_ACQ_REL:
2653 case MEMMODEL_SEQ_CST:
2654 case MEMMODEL_SYNC_SEQ_CST:
2655 if (pre_p || post_p)
2656 break;
2657 return;
2658 default:
2659 gcc_unreachable ();
2662 output_asm_insn ("%.\tmembar%B0;", mem_operand);
2665 const char *
2666 nvptx_output_atomic_insn (const char *asm_template, rtx *operands, int mem_pos,
2667 int memmodel_pos)
2669 nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
2670 true);
2671 output_asm_insn (asm_template, operands);
2672 nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
2673 false);
2674 return "";
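/* For a seq-cst atomic the operation is bracketed by barriers on both
   sides, e.g. (illustrative operands):

	membar.sys;
	atom.exch.b32	%r1, [%r2], %r3;
	membar.sys;

   whereas a relaxed atomic gets no barrier at all.  */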
2677 static void nvptx_print_operand (FILE *, rtx, int);
2679 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2680 involves writing .param declarations and in/out copies into them. For
2681 indirect calls, also write the .callprototype. */
2683 const char *
2684 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2686 char buf[16];
2687 static int labelno;
2688 bool needs_tgt = register_operand (callee, Pmode);
2689 rtx pat = PATTERN (insn);
2690 if (GET_CODE (pat) == COND_EXEC)
2691 pat = COND_EXEC_CODE (pat);
2692 int arg_end = XVECLEN (pat, 0);
2693 tree decl = NULL_TREE;
2695 fprintf (asm_out_file, "\t{\n");
2696 if (result != NULL)
2697 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2698 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2699 reg_names[NVPTX_RETURN_REGNUM]);
2701 /* Ensure we have a ptx declaration in the output if necessary. */
2702 if (GET_CODE (callee) == SYMBOL_REF)
2704 decl = SYMBOL_REF_DECL (callee);
2705 if (!decl
2706 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2707 nvptx_record_libfunc (callee, result, pat);
2708 else if (DECL_EXTERNAL (decl))
2709 nvptx_record_fndecl (decl);
2712 if (needs_tgt)
2714 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2715 labelno++;
2716 ASM_OUTPUT_LABEL (asm_out_file, buf);
2717 std::stringstream s;
2718 write_fn_proto_from_insn (s, NULL, result, pat);
2719 fputs (s.str().c_str(), asm_out_file);
2722 for (int argno = 1; argno < arg_end; argno++)
2724 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2725 machine_mode mode = GET_MODE (t);
2726 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2728 /* Mode splitting has already been done. */
2729 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2730 "\t\tst.param%s [%%out_arg%d], ",
2731 ptx_type, argno, ptx_type, argno);
2732 output_reg (asm_out_file, REGNO (t), VOIDmode);
2733 fprintf (asm_out_file, ";\n");
2736 /* The '.' stands for the call's predicate, if any. */
2737 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2738 fprintf (asm_out_file, "\t\tcall ");
2739 if (result != NULL_RTX)
2740 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2742 if (decl)
2744 char *replaced_dots = NULL;
2745 const char *name = get_fnname_from_decl (decl);
2746 const char *replacement = nvptx_name_replacement (name);
2747 if (replacement != name)
2748 name = replacement;
2749 else
2751 replaced_dots = nvptx_replace_dot (name);
2752 if (replaced_dots)
2753 name = replaced_dots;
2755 assemble_name (asm_out_file, name);
2756 if (replaced_dots)
2757 XDELETE (replaced_dots);
2759 else
2760 output_address (VOIDmode, callee);
2762 const char *open = "(";
2763 for (int argno = 1; argno < arg_end; argno++)
2765 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2766 open = "";
2768 if (decl && DECL_STATIC_CHAIN (decl))
2770 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2771 open = "";
2773 if (!open[0])
2774 fprintf (asm_out_file, ")");
2776 if (needs_tgt)
2778 fprintf (asm_out_file, ", ");
2779 assemble_name (asm_out_file, buf);
2781 fprintf (asm_out_file, ";\n");
2783 if (find_reg_note (insn, REG_NORETURN, NULL))
2785 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2786 the flow control barrier they imply. It can seg fault if it
2787 encounters what looks like an unexitable loop. Emit a trailing
2788 trap and exit, which it does grok. */
2789 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2790 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2793 if (result)
2795 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2797 if (!rval[0])
2798 /* We must escape the '%' that starts RETURN_REGNUM. */
2799 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2800 reg_names[NVPTX_RETURN_REGNUM]);
2801 return rval;
2804 return "}";
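/* Typical output for a call taking one argument and returning a value
   (register numbers illustrative):

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32 %r24, [%value_in];
	}  */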
2807 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2809 static bool
2810 nvptx_print_operand_punct_valid_p (unsigned char c)
2812 return c == '.' || c == '#';
2815 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2817 static void
2818 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2820 rtx off;
2821 if (GET_CODE (x) == CONST)
2822 x = XEXP (x, 0);
2823 switch (GET_CODE (x))
2825 case PLUS:
2826 off = XEXP (x, 1);
2827 output_address (VOIDmode, XEXP (x, 0));
2828 fprintf (file, "+");
2829 output_address (VOIDmode, off);
2830 break;
2832 case SYMBOL_REF:
2833 case LABEL_REF:
2834 output_addr_const (file, x);
2835 break;
2837 default:
2838 gcc_assert (GET_CODE (x) != MEM);
2839 nvptx_print_operand (file, x, 0);
2840 break;
2844 /* Write assembly language output for the address ADDR to FILE. */
2846 static void
2847 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2849 nvptx_print_address_operand (file, addr, mode);
2852 static nvptx_data_area
2853 nvptx_mem_data_area (const_rtx x)
2855 gcc_assert (GET_CODE (x) == MEM);
2857 const_rtx addr = XEXP (x, 0);
2858 subrtx_iterator::array_type array;
2859 FOR_EACH_SUBRTX (iter, array, addr, ALL)
2860 if (SYMBOL_REF_P (*iter))
2861 return SYMBOL_DATA_AREA (*iter);
2863 return DATA_AREA_GENERIC;
2866 bool
2867 nvptx_mem_maybe_shared_p (const_rtx x)
2869 nvptx_data_area area = nvptx_mem_data_area (x);
2870 return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC;
2873 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2875 Meaning of CODE:
2876 . -- print the predicate for the instruction or an empty string for an
2877 unconditional one.
2878 # -- print a rounding mode for the instruction
2880 A -- print a data area for a MEM
2881 c -- print an opcode suffix for a comparison operator, including a type code
2882 D -- print a data area for a MEM operand
2883 S -- print a shuffle kind specified by CONST_INT
2884 t -- print a type opcode suffix, promoting QImode to 32 bits
2885 T -- print a type size in bits
2886 u -- print a type opcode suffix without promotions.
2887 p -- print a '!' for constant 0.
2888 x -- print a destination operand that may also be a bit bucket. */
2890 static void
2891 nvptx_print_operand (FILE *file, rtx x, int code)
2893 if (code == '.')
2895 x = current_insn_predicate;
2896 if (x)
2898 fputs ("@", file);
2899 if (GET_CODE (x) == EQ)
2900 fputs ("!", file);
2901 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2903 return;
2905 else if (code == '#')
2907 fputs (".rn", file);
2908 return;
2911 enum rtx_code x_code = GET_CODE (x);
2912 machine_mode mode = GET_MODE (x);
2914 switch (code)
2916 case 'x':
2917 if (current_output_insn != NULL
2918 && find_reg_note (current_output_insn, REG_UNUSED, x) != NULL_RTX)
2920 fputs ("_", file);
2921 return;
2923 goto common;
2924 case 'B':
2925 if (SYMBOL_REF_P (XEXP (x, 0)))
2926 switch (SYMBOL_DATA_AREA (XEXP (x, 0)))
2928 case DATA_AREA_GENERIC:
2929 /* Assume worst-case: global. */
2930 gcc_fallthrough (); /* FALLTHROUGH. */
2931 case DATA_AREA_GLOBAL:
2932 break;
2933 case DATA_AREA_SHARED:
2934 fputs (".cta", file);
2935 return;
2936 case DATA_AREA_LOCAL:
2937 case DATA_AREA_CONST:
2938 case DATA_AREA_PARAM:
2939 default:
2940 gcc_unreachable ();
2943 /* There are 2 cases where membar.sys differs from membar.gl:
2944 - host accesses global memory (f.i. systemwide atomics)
2945 - 2 or more devices are setup in peer-to-peer mode, and one
2946 peer can access global memory of other peer.
2947 Neither is currently supported by OpenMP/OpenACC on nvptx, but
2948 that could change, so we default to membar.sys. We could support
2949 this more optimally by adding DATA_AREA_SYS and then emitting
2950 .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS. */
2951 fputs (".sys", file);
2952 return;
2954 case 'A':
2955 x = XEXP (x, 0);
2956 gcc_fallthrough (); /* FALLTHROUGH. */
2958 case 'D':
2959 if (GET_CODE (x) == CONST)
2960 x = XEXP (x, 0);
2961 if (GET_CODE (x) == PLUS)
2962 x = XEXP (x, 0);
2964 if (GET_CODE (x) == SYMBOL_REF)
2965 fputs (section_for_sym (x), file);
2966 break;
2968 case 't':
2969 case 'u':
2970 if (x_code == SUBREG)
2972 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2973 if (VECTOR_MODE_P (inner_mode)
2974 && (GET_MODE_SIZE (mode)
2975 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2976 mode = GET_MODE_INNER (inner_mode);
2977 else if (split_mode_p (inner_mode))
2978 mode = maybe_split_mode (inner_mode);
2979 else
2980 mode = inner_mode;
2982 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2983 break;
2985 case 'H':
2986 case 'L':
2988 rtx inner_x = SUBREG_REG (x);
2989 machine_mode inner_mode = GET_MODE (inner_x);
2990 machine_mode split = maybe_split_mode (inner_mode);
2992 output_reg (file, REGNO (inner_x), split,
2993 (code == 'H'
2994 ? GET_MODE_SIZE (inner_mode) / 2
2995 : 0));
2997 break;
2999 case 'S':
3001 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
3002 /* Same order as nvptx_shuffle_kind. */
3003 static const char *const kinds[] =
3004 {".up", ".down", ".bfly", ".idx"};
3005 fputs (kinds[kind], file);
3007 break;
3009 case 'T':
3010 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
3011 break;
3013 case 'j':
3014 fprintf (file, "@");
3015 goto common;
3017 case 'J':
3018 fprintf (file, "@!");
3019 goto common;
3021 case 'p':
3022 if (INTVAL (x) == 0)
3023 fprintf (file, "!");
3024 break;
3026 case 'c':
3027 mode = GET_MODE (XEXP (x, 0));
3028 switch (x_code)
3030 case EQ:
3031 fputs (".eq", file);
3032 break;
3033 case NE:
3034 if (FLOAT_MODE_P (mode))
3035 fputs (".neu", file);
3036 else
3037 fputs (".ne", file);
3038 break;
3039 case LE:
3040 case LEU:
3041 fputs (".le", file);
3042 break;
3043 case GE:
3044 case GEU:
3045 fputs (".ge", file);
3046 break;
3047 case LT:
3048 case LTU:
3049 fputs (".lt", file);
3050 break;
3051 case GT:
3052 case GTU:
3053 fputs (".gt", file);
3054 break;
3055 case LTGT:
3056 fputs (".ne", file);
3057 break;
3058 case UNEQ:
3059 fputs (".equ", file);
3060 break;
3061 case UNLE:
3062 fputs (".leu", file);
3063 break;
3064 case UNGE:
3065 fputs (".geu", file);
3066 break;
3067 case UNLT:
3068 fputs (".ltu", file);
3069 break;
3070 case UNGT:
3071 fputs (".gtu", file);
3072 break;
3073 case UNORDERED:
3074 fputs (".nan", file);
3075 break;
3076 case ORDERED:
3077 fputs (".num", file);
3078 break;
3079 default:
3080 gcc_unreachable ();
3082 if (FLOAT_MODE_P (mode)
3083 || x_code == EQ || x_code == NE
3084 || x_code == GEU || x_code == GTU
3085 || x_code == LEU || x_code == LTU)
3086 fputs (nvptx_ptx_type_from_mode (mode, true), file);
3087 else
3088 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
3089 break;
3090 default:
3091 common:
3092 switch (x_code)
3094 case SUBREG:
3096 rtx inner_x = SUBREG_REG (x);
3097 machine_mode inner_mode = GET_MODE (inner_x);
3098 machine_mode split = maybe_split_mode (inner_mode);
3100 if (VECTOR_MODE_P (inner_mode)
3101 && (GET_MODE_SIZE (mode)
3102 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
3104 output_reg (file, REGNO (inner_x), VOIDmode);
3105 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
3107 else if (split_mode_p (inner_mode)
3108 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
3109 output_reg (file, REGNO (inner_x), split);
3110 else
3111 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
3113 break;
3115 case REG:
3116 output_reg (file, REGNO (x), maybe_split_mode (mode));
3117 break;
3119 case MEM:
3120 fputc ('[', file);
3121 nvptx_print_address_operand (file, XEXP (x, 0), mode);
3122 fputc (']', file);
3123 break;
3125 case CONST_INT:
3126 output_addr_const (file, x);
3127 break;
3129 case CONST:
3130 case SYMBOL_REF:
3131 case LABEL_REF:
3132 /* We could use output_addr_const, but that can print things like
3133 "x-8", which breaks ptxas. Need to ensure it is output as
3134 "x+-8". */
3135 nvptx_print_address_operand (file, x, VOIDmode);
3136 break;
3138 case CONST_DOUBLE:
3139 long vals[2];
3140 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
3141 vals[0] &= 0xffffffff;
3142 vals[1] &= 0xffffffff;
3143 if (mode == SFmode)
3144 fprintf (file, "0f%08lx", vals[0]);
3145 else
3146 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
3147 break;
3149 case CONST_VECTOR:
3151 unsigned n = CONST_VECTOR_NUNITS (x);
3152 fprintf (file, "{ ");
3153 for (unsigned i = 0; i < n; ++i)
3155 if (i != 0)
3156 fprintf (file, ", ");
3158 rtx elem = CONST_VECTOR_ELT (x, i);
3159 output_addr_const (file, elem);
3161 fprintf (file, " }");
3163 break;
3165 default:
3166 output_addr_const (file, x);
3171 /* Record replacement regs used to deal with subreg operands. */
3172 struct reg_replace
3174 rtx replacement[MAX_RECOG_OPERANDS];
3175 machine_mode mode;
3176 int n_allocated;
3177 int n_in_use;
3180 /* Allocate or reuse a replacement in R and return the rtx. */
3182 static rtx
3183 get_replacement (struct reg_replace *r)
3185 if (r->n_allocated == r->n_in_use)
3186 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
3187 return r->replacement[r->n_in_use++];
3190 /* Clean up subreg operands. In ptx assembly, everything is typed, and
3191 the presence of subregs would break the rules for most instructions.
3192 Replace them with a suitable new register of the right size, plus
3193 conversion copyin/copyout instructions. */
3195 static void
3196 nvptx_reorg_subreg (void)
3198 struct reg_replace qiregs, hiregs, siregs, diregs;
3199 rtx_insn *insn, *next;
3201 qiregs.n_allocated = 0;
3202 hiregs.n_allocated = 0;
3203 siregs.n_allocated = 0;
3204 diregs.n_allocated = 0;
3205 qiregs.mode = QImode;
3206 hiregs.mode = HImode;
3207 siregs.mode = SImode;
3208 diregs.mode = DImode;
3210 for (insn = get_insns (); insn; insn = next)
3212 next = NEXT_INSN (insn);
3213 if (!NONDEBUG_INSN_P (insn)
3214 || asm_noperands (PATTERN (insn)) >= 0
3215 || GET_CODE (PATTERN (insn)) == USE
3216 || GET_CODE (PATTERN (insn)) == CLOBBER)
3217 continue;
3219 qiregs.n_in_use = 0;
3220 hiregs.n_in_use = 0;
3221 siregs.n_in_use = 0;
3222 diregs.n_in_use = 0;
3223 extract_insn (insn);
3224 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
3226 for (int i = 0; i < recog_data.n_operands; i++)
3228 rtx op = recog_data.operand[i];
3229 if (GET_CODE (op) != SUBREG)
3230 continue;
3232 rtx inner = SUBREG_REG (op);
3234 machine_mode outer_mode = GET_MODE (op);
3235 machine_mode inner_mode = GET_MODE (inner);
3236 gcc_assert (s_ok);
3237 if (s_ok
3238 && (GET_MODE_PRECISION (inner_mode)
3239 >= GET_MODE_PRECISION (outer_mode)))
3240 continue;
3241 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
3242 struct reg_replace *r = (outer_mode == QImode ? &qiregs
3243 : outer_mode == HImode ? &hiregs
3244 : outer_mode == SImode ? &siregs
3245 : &diregs);
3246 rtx new_reg = get_replacement (r);
3248 if (recog_data.operand_type[i] != OP_OUT)
3250 enum rtx_code code;
3251 if (GET_MODE_PRECISION (inner_mode)
3252 < GET_MODE_PRECISION (outer_mode))
3253 code = ZERO_EXTEND;
3254 else
3255 code = TRUNCATE;
3257 rtx pat = gen_rtx_SET (new_reg,
3258 gen_rtx_fmt_e (code, outer_mode, inner));
3259 emit_insn_before (pat, insn);
3262 if (recog_data.operand_type[i] != OP_IN)
3264 enum rtx_code code;
3265 if (GET_MODE_PRECISION (inner_mode)
3266 < GET_MODE_PRECISION (outer_mode))
3267 code = TRUNCATE;
3268 else
3269 code = ZERO_EXTEND;
3271 rtx pat = gen_rtx_SET (inner,
3272 gen_rtx_fmt_e (code, inner_mode, new_reg));
3273 emit_insn_after (pat, insn);
3275 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
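/* For instance, an input operand (subreg:QI (reg:SI R) 0) gets a fresh
   QImode register Q, a Q = (truncate:QI R) copy emitted before the insn,
   and the operand itself replaced by Q.  */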
3280 /* Return a SImode "master lane index" register for uniform-simt, allocating on
3281 first use. */
3283 static rtx
3284 nvptx_get_unisimt_master ()
3286 rtx &master = cfun->machine->unisimt_master;
3287 return master ? master : master = gen_reg_rtx (SImode);
3290 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
3292 static rtx
3293 nvptx_get_unisimt_predicate ()
3295 rtx &pred = cfun->machine->unisimt_predicate;
3296 return pred ? pred : pred = gen_reg_rtx (BImode);
3299 static rtx
3300 nvptx_get_unisimt_outside_simt_predicate ()
3302 rtx &pred = cfun->machine->unisimt_outside_simt_predicate;
3303 return pred ? pred : pred = gen_reg_rtx (BImode);
3306 /* Return true if given call insn references one of the functions provided by
3307 the CUDA runtime: malloc, free, vprintf. */
3309 static bool
3310 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
3312 rtx pat = PATTERN (insn);
3313 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
3314 pat = XVECEXP (pat, 0, 0);
3315 if (GET_CODE (pat) == SET)
3316 pat = SET_SRC (pat);
3317 gcc_checking_assert (GET_CODE (pat) == CALL
3318 && GET_CODE (XEXP (pat, 0)) == MEM);
3319 rtx addr = XEXP (XEXP (pat, 0), 0);
3320 if (GET_CODE (addr) != SYMBOL_REF)
3321 return false;
3322 const char *name = XSTR (addr, 0);
3323 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
3324 references with forced assembler name refer to PTX syscalls. For vprintf,
3325 accept both normal and forced-assembler-name references. */
3326 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
3327 || !strcmp (name, "*malloc")
3328 || !strcmp (name, "*free"));
3331 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
3332 propagate its value from lane MASTER to current lane. */
3334 static bool
3335 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
3337 rtx reg;
3338 if (GET_CODE (set) == SET
3339 && REG_P (reg = SET_DEST (set))
3340 && find_reg_note (insn, REG_UNUSED, reg) == NULL_RTX)
3342 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX),
3343 insn);
3344 return true;
3347 return false;
3350 static void
3351 predicate_insn (rtx_insn *insn, rtx pred)
3353 rtx pat = PATTERN (insn);
3354 pred = gen_rtx_NE (BImode, pred, const0_rtx);
3355 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
3356 bool changed_p = validate_change (insn, &PATTERN (insn), pat, false);
3357 gcc_assert (changed_p);
3360 /* Adjust code for uniform-simt code generation variant by making atomics and
3361 "syscalls" conditionally executed, and inserting shuffle-based propagation
3362 for registers being set. */
3364 static void
3365 nvptx_reorg_uniform_simt ()
3367 rtx_insn *insn, *next;
3369 for (insn = get_insns (); insn; insn = next)
3371 next = NEXT_INSN (insn);
3373 /* Skip NOTE, USE, etc. */
3374 if (!INSN_P (insn) || recog_memoized (insn) == -1)
3375 continue;
3377 if (CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
3379 /* Handle syscall. */
3381 else if (get_attr_atomic (insn))
3383 /* Handle atomic insn. */
3385 else
3386 continue;
3388 rtx pat = PATTERN (insn);
3389 rtx master = nvptx_get_unisimt_master ();
3390 bool shuffle_p = false;
3391 switch (GET_CODE (pat))
3393 case PARALLEL:
3394 for (int i = 0; i < XVECLEN (pat, 0); i++)
3395 shuffle_p
3396 |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
3397 break;
3398 case SET:
3399 shuffle_p |= nvptx_unisimt_handle_set (pat, insn, master);
3400 break;
3401 default:
3402 gcc_unreachable ();
3405 if (shuffle_p && TARGET_PTX_6_0)
3407 /* The shuffle is a sync, so uniformity is guaranteed. */
3409 else
3411 if (TARGET_PTX_6_0)
3413 gcc_assert (!shuffle_p);
3414 /* Emit after the insn, to guarantee uniformity. */
3415 emit_insn_after (gen_nvptx_warpsync (), insn);
3417 else
3419 /* Emit after the insn (and before the shuffle, if there are any)
3420 to check uniformity. */
3421 emit_insn_after (gen_nvptx_uniform_warp_check (), insn);
3425 rtx pred = nvptx_get_unisimt_predicate ();
3426 predicate_insn (insn, pred);
3428 pred = NULL_RTX;
3429 for (rtx_insn *post = NEXT_INSN (insn); post != next;
3430 post = NEXT_INSN (post))
3432 if (pred == NULL_RTX)
3433 pred = nvptx_get_unisimt_outside_simt_predicate ();
3434 predicate_insn (post, pred);
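/* Schematically, an atomic update under uniform-simt becomes

	@%pred	atom.op ...;	// executed by the master lane only
	shfl	...;		// broadcast the result to every lane

   so each lane observes the same value while only one lane performs the
   side effect.  (Illustrative, not literal output.)  */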
3439 /* Offloading function attributes. */
3441 struct offload_attrs
3443 unsigned mask;
3444 int num_gangs;
3445 int num_workers;
3446 int vector_length;
3449 /* Define entries for cfun->machine->axis_dim. */
3451 #define MACH_VECTOR_LENGTH 0
3452 #define MACH_MAX_WORKERS 1
3454 static void populate_offload_attrs (offload_attrs *oa);
3456 static void
3457 init_axis_dim (void)
3459 offload_attrs oa;
3460 int max_workers;
3462 populate_offload_attrs (&oa);
3464 if (oa.num_workers == 0)
3465 max_workers = PTX_CTA_SIZE / oa.vector_length;
3466 else
3467 max_workers = oa.num_workers;
3469 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
3470 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
3471 cfun->machine->axis_dim_init_p = true;
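/* E.g. with vector_length 32 and num_workers left at 0, the worker axis
   defaults to PTX_CTA_SIZE / 32 = 32 workers.  */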
3474 static int ATTRIBUTE_UNUSED
3475 nvptx_mach_max_workers ()
3477 if (!cfun->machine->axis_dim_init_p)
3478 init_axis_dim ();
3479 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
3482 static int ATTRIBUTE_UNUSED
3483 nvptx_mach_vector_length ()
3485 if (!cfun->machine->axis_dim_init_p)
3486 init_axis_dim ();
3487 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3490 /* Loop structure of the function. The entire function is described as
3491 a NULL loop. */
3492 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'. */
3494 struct parallel
3496 /* Parent parallel. */
3497 parallel *parent;
3499 /* Next sibling parallel. */
3500 parallel *next;
3502 /* First child parallel. */
3503 parallel *inner;
3505 /* Partitioning mask of the parallel. */
3506 unsigned mask;
3508 /* Partitioning used within inner parallels. */
3509 unsigned inner_mask;
3511 /* Location of parallel forked and join. The forked is the first
3512 block in the parallel and the join is the first block after of
3513 the partition. */
3514 basic_block forked_block;
3515 basic_block join_block;
3517 rtx_insn *forked_insn;
3518 rtx_insn *join_insn;
3520 rtx_insn *fork_insn;
3521 rtx_insn *joining_insn;
3523 /* Basic blocks in this parallel, but not in child parallels. The
3524 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3525 blocks are not. */
3526 auto_vec<basic_block> blocks;
3528 public:
3529 parallel (parallel *parent, unsigned mode);
3530 ~parallel ();
3533 /* Constructor links the new parallel into its parent's chain of
3534 children. */
3536 parallel::parallel (parallel *parent_, unsigned mask_)
3537 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3539 forked_block = join_block = 0;
3540 forked_insn = join_insn = 0;
3541 fork_insn = joining_insn = 0;
3543 if (parent)
3545 next = parent->inner;
3546 parent->inner = this;
3550 parallel::~parallel ()
3552 delete inner;
3553 delete next;
3556 /* Map of basic blocks to insns */
3557 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3559 /* A tuple of an insn of interest and the BB in which it resides. */
3560 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3561 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3563 /* Split basic blocks such that each forked and join unspecs are at
3564 the start of their basic blocks. Thus afterwards each block will
3565 have a single partitioning mode. We also do the same for return
3566 insns, as they are executed by every thread. Return the
3567 partitioning mode of the function as a whole. Populate MAP with
3568 head and tail blocks. We also clear the BB visited flag, which is
3569 used when finding partitions. */
3570 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'. */
3572 static void
3573 nvptx_split_blocks (bb_insn_map_t *map)
3575 insn_bb_vec_t worklist;
3576 basic_block block;
3577 rtx_insn *insn;
3579 /* Locate all the reorg instructions of interest. */
3580 FOR_ALL_BB_FN (block, cfun)
3582 bool seen_insn = false;
3584 /* Clear visited flag, for use by parallel locator */
3585 block->flags &= ~BB_VISITED;
3587 FOR_BB_INSNS (block, insn)
3589 if (!INSN_P (insn))
3590 continue;
3591 switch (recog_memoized (insn))
3593 default:
3594 seen_insn = true;
3595 continue;
3596 case CODE_FOR_nvptx_forked:
3597 case CODE_FOR_nvptx_join:
3598 break;
3600 case CODE_FOR_return:
3601 /* We also need to split just before return insns, as
3602 that insn needs executing by all threads, but the
3603 block it is in probably does not. */
3604 break;
3607 if (seen_insn)
3608 /* We've found an instruction that must be at the start of
3609 a block, but isn't. Add it to the worklist. */
3610 worklist.safe_push (insn_bb_t (insn, block));
3611 else
3612 /* It was already the first instruction. Just add it to
3613 the map. */
3614 map->get_or_insert (block) = insn;
3615 seen_insn = true;
3619 /* Split blocks on the worklist. */
3620 unsigned ix;
3621 insn_bb_t *elt;
3622 basic_block remap = 0;
3623 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3625 if (remap != elt->second)
3627 block = elt->second;
3628 remap = block;
3631 /* Split block before insn. The insn is in the new block */
3632 edge e = split_block (block, PREV_INSN (elt->first));
3634 block = e->dest;
3635 map->get_or_insert (block) = elt->first;
3639 /* Return true if MASK contains parallelism that requires shared
3640 memory to broadcast. */
3642 static bool
3643 nvptx_needs_shared_bcast (unsigned mask)
3645 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3646 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3647 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3649 return worker || large_vector;
3652 /* BLOCK is a basic block containing a head or tail instruction.
3653 Locate the associated prehead or pretail instruction, which must be
3654 in the single predecessor block. */
3656 static rtx_insn *
3657 nvptx_discover_pre (basic_block block, int expected)
3659 gcc_assert (block->preds->length () == 1);
3660 basic_block pre_block = (*block->preds)[0]->src;
3661 rtx_insn *pre_insn;
3663 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3664 pre_insn = PREV_INSN (pre_insn))
3665 gcc_assert (pre_insn != BB_HEAD (pre_block));
3667 gcc_assert (recog_memoized (pre_insn) == expected);
3668 return pre_insn;
3671 /* Dump this parallel and all its inner parallels. */
3672 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'. */
3674 static void
3675 nvptx_dump_pars (parallel *par, unsigned depth)
3677 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3678 depth, par->mask,
3679 par->forked_block ? par->forked_block->index : -1,
3680 par->join_block ? par->join_block->index : -1);
3682 fprintf (dump_file, " blocks:");
3684 basic_block block;
3685 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3686 fprintf (dump_file, " %d", block->index);
3687 fprintf (dump_file, "\n");
3688 if (par->inner)
3689 nvptx_dump_pars (par->inner, depth + 1);
3691 if (par->next)
3692 nvptx_dump_pars (par->next, depth);
3695 /* If BLOCK contains a fork/join marker, process it to create or
3696 terminate a loop structure. Add this block to the current loop,
3697 and then walk successor blocks. */
3698 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'. */
3700 static parallel *
3701 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3703 if (block->flags & BB_VISITED)
3704 return par;
3705 block->flags |= BB_VISITED;
3707 if (rtx_insn **endp = map->get (block))
3709 rtx_insn *end = *endp;
3711 /* This is a block head or tail, or return instruction. */
3712 switch (recog_memoized (end))
3714 case CODE_FOR_return:
3715 /* Return instructions are in their own block, and we
3716 don't need to do anything more. */
3717 return par;
3719 case CODE_FOR_nvptx_forked:
3720 /* Loop head, create a new inner loop and add it into
3721 our parent's child list. */
3723 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3725 gcc_assert (mask);
3726 par = new parallel (par, mask);
3727 par->forked_block = block;
3728 par->forked_insn = end;
3729 if (nvptx_needs_shared_bcast (mask))
3730 par->fork_insn
3731 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3733 break;
3735 case CODE_FOR_nvptx_join:
3736 /* A loop tail. Finish the current loop and return to
3737 parent. */
3739 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3741 gcc_assert (par->mask == mask);
3742 gcc_assert (par->join_block == NULL);
3743 par->join_block = block;
3744 par->join_insn = end;
3745 if (nvptx_needs_shared_bcast (mask))
3746 par->joining_insn
3747 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3748 par = par->parent;
3750 break;
3752 default:
3753 gcc_unreachable ();
3757 if (par)
3758 /* Add this block onto the current loop's list of blocks. */
3759 par->blocks.safe_push (block);
3760 else
3761 /* This must be the entry block. Create a NULL parallel. */
3762 par = new parallel (0, 0);
3764 /* Walk successor blocks. */
3765 edge e;
3766 edge_iterator ei;
3768 FOR_EACH_EDGE (e, ei, block->succs)
3769 nvptx_find_par (map, par, e->dest);
3771 return par;
3774 /* DFS walk the CFG looking for fork & join markers. Construct
3775 loop structures as we go. MAP is a mapping of basic blocks
3776 to head & tail markers, discovered when splitting blocks. This
3777 speeds up the discovery. We rely on the BB visited flag having
3778 been cleared when splitting blocks. */
3779 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'. */
3781 static parallel *
3782 nvptx_discover_pars (bb_insn_map_t *map)
3784 basic_block block;
3786 /* Mark exit blocks as visited. */
3787 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3788 block->flags |= BB_VISITED;
3790 /* And entry block as not. */
3791 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3792 block->flags &= ~BB_VISITED;
3794 parallel *par = nvptx_find_par (map, 0, block);
3796 if (dump_file)
3798 fprintf (dump_file, "\nLoops\n");
3799 nvptx_dump_pars (par, 0);
3800 fprintf (dump_file, "\n");
3803 return par;
3806 /* Analyse a group of BBs within a partitioned region and create N
3807 Single-Entry-Single-Exit regions. Some of those regions will be
3808 trivial ones consisting of a single BB. The blocks of a
3809 partitioned region might form a set of disjoint graphs -- because
3810 the region encloses a differently partitioned sub-region.
3812 We use the linear time algorithm described in 'Finding Regions Fast:
3813 Single Entry Single Exit and Control Regions in Linear Time'
3814 Johnson, Pearson & Pingali. That algorithm deals with complete
3815 CFGs, where a back edge is inserted from END to START, and thus the
3816 problem becomes one of finding equivalent loops.
3818 In this case we have a partial CFG. We complete it by redirecting
3819 any incoming edge to the graph to be from an arbitrary external BB,
3820 and similarly redirecting any outgoing edge to be to that BB.
3821 Thus we end up with a closed graph.
3823 The algorithm works by building a spanning tree of an undirected
3824 graph and keeping track of back edges from nodes further from the
3825 root in the tree to nodes nearer to the root in the tree. In the
3826 description below, the root is up and the tree grows downwards.
3828 We avoid having to deal with degenerate back-edges to the same
3829 block, by splitting each BB into 3 -- one for input edges, one for
3830 the node itself and one for the output edges. Such back edges are
3831 referred to as 'Brackets'. Cycle equivalent nodes will have the
3832 same set of brackets.
3834 Determining bracket equivalency is done by maintaining a list of
3835 brackets in such a manner that the list length and final bracket
3836 uniquely identify the set.
3838 We use coloring to mark all BBs with cycle equivalency with the
3839 same color. This is the output of the 'Finding Regions Fast'
3840 algorithm. Notice it doesn't actually find the set of nodes within
3841 a particular region, just unordered sets of nodes that are the
3842 entries and exits of SESE regions.
3844 After determining cycle equivalency, we need to find the minimal
3845 set of SESE regions. Do this with a DFS coloring walk of the
3846 complete graph. We're either 'looking' or 'coloring'. When
3847 looking, and we're in the subgraph, we start coloring the color of
3848 the current node, and remember that node as the start of the
3849 current color's SESE region. Every time we go to a new node, we
3850 decrement the count of nodes with that color. If it reaches zero,
3851 we remember that node as the end of the current color's SESE region
3852 and return to 'looking'. Otherwise we color the node the current
3853 color.
3855 This way we end up with coloring the inside of non-trivial SESE
3856 regions with the color of that region. */
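/* (Two nodes are cycle equivalent precisely when every cycle passing
   through one also passes through the other; the entry and exit of an
   SESE region form exactly such a pair.)  */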
3858 /* A pair of BBs. We use this to represent SESE regions. */
3859 typedef std::pair<basic_block, basic_block> bb_pair_t;
3860 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3862 /* A node in the undirected CFG. The discriminator SECOND indicates just
3863 above or just below the BB indicated by FIRST. */
3864 typedef std::pair<basic_block, int> pseudo_node_t;
3866 /* A bracket indicates an edge towards the root of the spanning tree of the
3867 undirected graph. Each bracket has a color, determined
3868 from the current set of brackets. */
3869 struct bracket
3871 pseudo_node_t back; /* Back target */
3873 /* Current color and size of set. */
3874 unsigned color;
3875 unsigned size;
3877 bracket (pseudo_node_t back_)
3878 : back (back_), color (~0u), size (~0u)
3882 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3884 if (length != size)
3886 size = length;
3887 color = color_counts.length ();
3888 color_counts.quick_push (0);
3890 color_counts[color]++;
3891 return color;
3895 typedef auto_vec<bracket> bracket_vec_t;
3897 /* Basic block info for finding SESE regions. */
3899 struct bb_sese
3901 int node; /* Node number in spanning tree. */
3902 int parent; /* Parent node number. */
3904 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3905 edges arrive at pseudo-node Ai and the outgoing edges leave at
3906 pseudo-node Ao. We have to remember which way we arrived at a
3907 particular node when generating the spanning tree. dir > 0 means
3908 we arrived at Ai, dir < 0 means we arrived at Ao. */
3909 int dir;
3911 /* Lowest numbered pseudo-node reached via a backedge from this
3912 node, or any descendant. */
3913 pseudo_node_t high;
3915 int color; /* Cycle-equivalence color */
3917 /* Stack of brackets for this node. */
3918 bracket_vec_t brackets;
3920 bb_sese (unsigned node_, unsigned p, int dir_)
3921 :node (node_), parent (p), dir (dir_)
3924 ~bb_sese ();
3926 /* Push a bracket ending at BACK. */
3927 void push (const pseudo_node_t &back)
3929 if (dump_file)
3930 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3931 back.first ? back.first->index : 0, back.second);
3932 brackets.safe_push (bracket (back));
3935 void append (bb_sese *child);
3936 void remove (const pseudo_node_t &);
3938 /* Set node's color. */
3939 void set_color (auto_vec<unsigned> &color_counts)
3941 color = brackets.last ().get_color (color_counts, brackets.length ());
3945 bb_sese::~bb_sese ()
3949 /* Destructively append CHILD's brackets. */
3951 void
3952 bb_sese::append (bb_sese *child)
3954 if (int len = child->brackets.length ())
3956 int ix;
3958 if (dump_file)
3960 for (ix = 0; ix < len; ix++)
3962 const pseudo_node_t &pseudo = child->brackets[ix].back;
3963 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3964 child->node, pseudo.first ? pseudo.first->index : 0,
3965 pseudo.second);
3968 if (!brackets.length ())
3969 std::swap (brackets, child->brackets);
3970 else
3972 brackets.reserve (len);
3973 for (ix = 0; ix < len; ix++)
3974 brackets.quick_push (child->brackets[ix]);
3979 /* Remove brackets that terminate at PSEUDO. */
3981 void
3982 bb_sese::remove (const pseudo_node_t &pseudo)
3984 unsigned removed = 0;
3985 int len = brackets.length ();
3987 for (int ix = 0; ix < len; ix++)
3989 if (brackets[ix].back == pseudo)
3991 if (dump_file)
3992 fprintf (dump_file, "Removing backedge %d:%+d\n",
3993 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3994 removed++;
3996 else if (removed)
3997 brackets[ix-removed] = brackets[ix];
3999 while (removed--)
4000 brackets.pop ();
4003 /* Accessors for BB's aux pointer. */
4004 #define BB_SET_SESE(B, S) ((B)->aux = (S))
4005 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
4007 /* DFS walk creating SESE data structures. Only cover nodes with
4008 BB_VISITED set. Append discovered blocks to LIST. We number in
4009 increments of 3 so that the above and below pseudo nodes can be
4010 implicitly numbered too. */
4012 static int
4013 nvptx_sese_number (int n, int p, int dir, basic_block b,
4014 auto_vec<basic_block> *list)
4016 if (BB_GET_SESE (b))
4017 return n;
4019 if (dump_file)
4020 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
4021 b->index, n, p, dir);
4023 BB_SET_SESE (b, new bb_sese (n, p, dir));
4024 p = n;
4026 n += 3;
4027 list->quick_push (b);
4029 /* First walk the nodes on the 'other side' of this node, then walk
4030 the nodes on the same side. */
4031 for (unsigned ix = 2; ix; ix--)
4033 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
4034 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
4035 : offsetof (edge_def, src));
4036 edge e;
4037 edge_iterator ei;
4039 FOR_EACH_EDGE (e, ei, edges)
4041 basic_block target = *(basic_block *)((char *)e + offset);
4043 if (target->flags & BB_VISITED)
4044 n = nvptx_sese_number (n, p, dir, target, list);
4046 dir = -dir;
4048 return n;
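/* Illustrative note: numbering starts at 2 and advances by 3, so a
   block numbered N can use N-1 and N+1 for its 'above' and 'below'
   pseudo nodes without ever colliding with another block's number.  */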
4051 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
4052 EDGES are the outgoing edges and OFFSET is the offset to the src
4053 or dst block on the edges. */
4055 static void
4056 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
4057 vec<edge, va_gc> *edges, size_t offset)
4059 edge e;
4060 edge_iterator ei;
4061 int hi_back = depth;
4062 pseudo_node_t node_back (nullptr, depth);
4063 int hi_child = depth;
4064 pseudo_node_t node_child (nullptr, depth);
4065 basic_block child = NULL;
4066 unsigned num_children = 0;
4067 int usd = -dir * sese->dir;
4069 if (dump_file)
4070 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
4071 me->index, sese->node, dir);
4073 if (dir < 0)
4075 /* This is the above pseudo-child. It has the BB itself as an
4076 additional child node. */
4077 node_child = sese->high;
4078 hi_child = node_child.second;
4079 if (node_child.first)
4080 hi_child += BB_GET_SESE (node_child.first)->node;
4081 num_children++;
4084 /* Examine each edge.
4085 - if it is a child (a) append its bracket list and (b) record
4086 whether it is the child with the highest reaching bracket.
4087 - if it is an edge to an ancestor, record whether it's the highest
4088 reaching backlink. */
4089 FOR_EACH_EDGE (e, ei, edges)
4091 basic_block target = *(basic_block *)((char *)e + offset);
4093 if (bb_sese *t_sese = BB_GET_SESE (target))
4095 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
4097 /* Child node. Append its bracket list. */
4098 num_children++;
4099 sese->append (t_sese);
4101 /* Compare its hi value. */
4102 int t_hi = t_sese->high.second;
4104 if (basic_block child_hi_block = t_sese->high.first)
4105 t_hi += BB_GET_SESE (child_hi_block)->node;
4107 if (hi_child > t_hi)
4109 hi_child = t_hi;
4110 node_child = t_sese->high;
4111 child = target;
4114 else if (t_sese->node < sese->node + dir
4115 && !(dir < 0 && sese->parent == t_sese->node))
4117 /* Non-parental ancestor node -- a backlink. */
4118 int d = usd * t_sese->dir;
4119 int back = t_sese->node + d;
4121 if (hi_back > back)
4123 hi_back = back;
4124 node_back = pseudo_node_t (target, d);
4128 else
4129 { /* Fallen off graph, backlink to entry node. */
4130 hi_back = 0;
4131 node_back = pseudo_node_t (nullptr, 0);
4135 /* Remove any brackets that terminate at this pseudo node. */
4136 sese->remove (pseudo_node_t (me, dir));
4138 /* Now push any backlinks from this pseudo node. */
4139 FOR_EACH_EDGE (e, ei, edges)
4141 basic_block target = *(basic_block *)((char *)e + offset);
4142 if (bb_sese *t_sese = BB_GET_SESE (target))
4144 if (t_sese->node < sese->node + dir
4145 && !(dir < 0 && sese->parent == t_sese->node))
4146 /* Non-parental ancestor node - backedge from me. */
4147 sese->push (pseudo_node_t (target, usd * t_sese->dir));
4149 else
4151 /* back edge to entry node */
4152 sese->push (pseudo_node_t (nullptr, 0));
4156 /* If this node leads directly or indirectly to a no-return region of
4157 the graph, then fake a backedge to entry node. */
4158 if (!sese->brackets.length () || !edges || !edges->length ())
4160 hi_back = 0;
4161 node_back = pseudo_node_t (nullptr, 0);
4162 sese->push (node_back);
4165 /* Record the highest reaching backedge from us or a descendant. */
4166 sese->high = hi_back < hi_child ? node_back : node_child;
4168 if (num_children > 1)
4170 /* There is more than one child -- this is a Y shaped piece of
4171 spanning tree. We have to insert a fake backedge from this
4172 node to the highest ancestor reached by not-the-highest
4173 reaching child. Note that there may be multiple children
4174 with backedges to the same highest node. That's ok and we
4175 insert the edge to that highest node. */
4176 hi_child = depth;
4177 if (dir < 0 && child)
4179 node_child = sese->high;
4180 hi_child = node_child.second;
4181 if (node_child.first)
4182 hi_child += BB_GET_SESE (node_child.first)->node;
4185 FOR_EACH_EDGE (e, ei, edges)
4187 basic_block target = *(basic_block *)((char *)e + offset);
4189 if (target == child)
4190 /* Ignore the highest child. */
4191 continue;
4193 bb_sese *t_sese = BB_GET_SESE (target);
4194 if (!t_sese)
4195 continue;
4196 if (t_sese->parent != sese->node)
4197 /* Not a child. */
4198 continue;
4200 /* Compare its hi value. */
4201 int t_hi = t_sese->high.second;
4203 if (basic_block child_hi_block = t_sese->high.first)
4204 t_hi += BB_GET_SESE (child_hi_block)->node;
4206 if (hi_child > t_hi)
4208 hi_child = t_hi;
4209 node_child = t_sese->high;
4213 sese->push (node_child);
4218 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
4219 proceed to successors. Set SESE entry and exit nodes of
4220 REGIONS. */
4222 static void
4223 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
4224 basic_block block, int coloring)
4226 bb_sese *sese = BB_GET_SESE (block);
4228 if (block->flags & BB_VISITED)
4230 /* If we've already encountered this block, either we must not
4231 be coloring, or it must have been colored the current color. */
4232 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
4233 return;
4236 block->flags |= BB_VISITED;
4238 if (sese)
4240 if (coloring < 0)
4242 /* Start coloring a region. */
4243 regions[sese->color].first = block;
4244 coloring = sese->color;
4247 if (!--color_counts[sese->color] && sese->color == coloring)
4249 /* Found final block of SESE region. */
4250 regions[sese->color].second = block;
4251 coloring = -1;
4253 else
4254 /* Color the node, so we can assert on revisiting the node
4255 that the graph is indeed SESE. */
4256 sese->color = coloring;
4258 else
4259 /* Fallen off the subgraph, we cannot be coloring. */
4260 gcc_assert (coloring < 0);
4262 /* Walk each successor block. */
4263 if (block->succs && block->succs->length ())
4265 edge e;
4266 edge_iterator ei;
4268 FOR_EACH_EDGE (e, ei, block->succs)
4269 nvptx_sese_color (color_counts, regions, e->dest, coloring);
4271 else
4272 gcc_assert (coloring < 0);
4275 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
4276 end up with NULL entries in it. */
4278 static void
4279 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
4281 basic_block block;
4282 int ix;
4284 /* First clear each BB of the whole function. */
4285 FOR_ALL_BB_FN (block, cfun)
4287 block->flags &= ~BB_VISITED;
4288 BB_SET_SESE (block, 0);
4291 /* Mark blocks in the function that are in this graph. */
4292 for (ix = 0; blocks.iterate (ix, &block); ix++)
4293 block->flags |= BB_VISITED;
4295 /* Counts of nodes assigned to each color. There cannot be more
4296 colors than blocks (and hopefully there will be fewer). */
4297 auto_vec<unsigned> color_counts;
4298 color_counts.reserve (blocks.length ());
4300 /* Worklist of nodes in the spanning tree. Again, there cannot be
4301 more nodes in the tree than blocks (there will be fewer if the
4302 CFG of blocks is disjoint). */
4303 auto_vec<basic_block> spanlist;
4304 spanlist.reserve (blocks.length ());
4306 /* Make sure every block has its cycle class determined. */
4307 for (ix = 0; blocks.iterate (ix, &block); ix++)
4309 if (BB_GET_SESE (block))
4310 /* We already met this block in an earlier graph solve. */
4311 continue;
4313 if (dump_file)
4314 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
4316 /* Number the nodes reachable from block in initial DFS order. */
4317 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
4319 /* Now walk in reverse DFS order to find cycle equivalents. */
4320 while (spanlist.length ())
4322 block = spanlist.pop ();
4323 bb_sese *sese = BB_GET_SESE (block);
4325 /* Do the pseudo node below. */
4326 nvptx_sese_pseudo (block, sese, depth, +1,
4327 sese->dir > 0 ? block->succs : block->preds,
4328 (sese->dir > 0 ? offsetof (edge_def, dest)
4329 : offsetof (edge_def, src)));
4330 sese->set_color (color_counts);
4331 /* Do the pseudo node above. */
4332 nvptx_sese_pseudo (block, sese, depth, -1,
4333 sese->dir < 0 ? block->succs : block->preds,
4334 (sese->dir < 0 ? offsetof (edge_def, dest)
4335 : offsetof (edge_def, src)));
4337 if (dump_file)
4338 fprintf (dump_file, "\n");
4341 if (dump_file)
4343 unsigned count;
4344 const char *comma = "";
4346 fprintf (dump_file, "Found %d cycle equivalents\n",
4347 color_counts.length ());
4348 for (ix = 0; color_counts.iterate (ix, &count); ix++)
4350 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
4352 comma = "";
4353 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
4354 if (BB_GET_SESE (block)->color == ix)
4356 block->flags |= BB_VISITED;
4357 fprintf (dump_file, "%s%d", comma, block->index);
4358 comma = ",";
4360 fprintf (dump_file, "}");
4361 comma = ", ";
4363 fprintf (dump_file, "\n");
4366 /* Now we've colored every block in the subgraph. We now need to
4367 determine the minimal set of SESE regions that cover that
4368 subgraph. Do this with a DFS walk of the complete function.
4369 During the walk we're either 'looking' or 'coloring'. When we
4370 reach the last node of a particular color, we stop coloring and
4371 return to looking. */
4373 /* There cannot be more SESE regions than colors. */
4374 regions.reserve (color_counts.length ());
4375 for (ix = color_counts.length (); ix--;)
4376 regions.quick_push (bb_pair_t (0, 0));
4378 for (ix = 0; blocks.iterate (ix, &block); ix++)
4379 block->flags &= ~BB_VISITED;
4381 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
4383 if (dump_file)
4385 const char *comma = "";
4386 int len = regions.length ();
4388 fprintf (dump_file, "SESE regions:");
4389 for (ix = 0; ix != len; ix++)
4391 basic_block from = regions[ix].first;
4392 basic_block to = regions[ix].second;
4394 if (from)
4396 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
4397 if (to != from)
4398 fprintf (dump_file, "->%d", to->index);
4400 int color = BB_GET_SESE (from)->color;
4402 /* Print the blocks within the region (excluding ends). */
4403 FOR_EACH_BB_FN (block, cfun)
4405 bb_sese *sese = BB_GET_SESE (block);
4407 if (sese && sese->color == color
4408 && block != from && block != to)
4409 fprintf (dump_file, ".%d", block->index);
4411 fprintf (dump_file, "}");
4413 comma = ",";
4415 fprintf (dump_file, "\n\n");
4418 for (ix = 0; blocks.iterate (ix, &block); ix++)
4419 delete BB_GET_SESE (block);
4422 #undef BB_SET_SESE
4423 #undef BB_GET_SESE
4425 /* Propagate live state at the start of a partitioned region. IS_CALL
4426 indicates whether the propagation is for a (partitioned) call
4427 instruction. BLOCK provides the live register information, and
4428 might not contain INSN. Propagation is inserted just after INSN. RW
4429 indicates whether we are reading and/or writing state. This
4430 separation is needed for worker-level propagation where we
4431 essentially do a spill & fill. FN is the underlying worker
4432 function to generate the propagation instructions for a single
4433 register. DATA is user data.
4435 Returns true if we didn't emit any instructions.
4437 We propagate the live register set for non-calls and the entire
4438 frame for calls and non-calls. We could do better by (a)
4439 propagating just the live set that is used within the partitioned
4440 regions and (b) only propagating stack entries that are used. The
4441 latter might be quite hard to determine. */
4443 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
4445 static bool
4446 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
4447 propagate_mask rw, propagator_fn fn, void *data, bool vector)
4449 bitmap live = DF_LIVE_IN (block);
4450 bitmap_iterator iterator;
4451 unsigned ix;
4452 bool empty = true;
4454 /* Copy the frame array. */
4455 HOST_WIDE_INT fs = get_frame_size ();
4456 if (fs)
4458 rtx tmp = gen_reg_rtx (DImode);
4459 rtx idx = NULL_RTX;
4460 rtx ptr = gen_reg_rtx (Pmode);
4461 rtx pred = NULL_RTX;
4462 rtx_code_label *label = NULL;
4464 empty = false;
4465 /* The frame size might not be DImode compatible, but the frame
4466 array's declaration will be. So it's ok to round up here. */
4467 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
4468 /* Detect single iteration loop. */
4469 if (fs == 1)
4470 fs = 0;
4472 start_sequence ();
4473 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
4474 if (fs)
4476 idx = gen_reg_rtx (SImode);
4477 pred = gen_reg_rtx (BImode);
4478 label = gen_label_rtx ();
4480 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
4481 /* Allow worker function to initialize anything needed. */
4482 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
4483 if (init)
4484 emit_insn (init);
4485 emit_label (label);
4486 LABEL_NUSES (label)++;
4487 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4489 if (rw & PM_read)
4490 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4491 emit_insn (fn (tmp, rw, fs, data, vector));
4492 if (rw & PM_write)
4493 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4494 if (fs)
4496 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4497 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4498 emit_insn (gen_br_true_uni (pred, label));
4499 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4500 if (fini)
4501 emit_insn (fini);
4502 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4504 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4505 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4506 rtx cpy = get_insns ();
4507 end_sequence ();
4508 insn = emit_insn_after (cpy, insn);
4511 if (!is_call)
4512 /* Copy live registers. */
4513 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4515 rtx reg = regno_reg_rtx[ix];
4517 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4519 rtx bcast = fn (reg, rw, 0, data, vector);
4521 insn = emit_insn_after (bcast, insn);
4522 empty = false;
4525 return empty;
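/* A sketch of the frame-copy loop built above (illustrative
   pseudo-code, not literal emitted RTL):

	ptr = frame_pointer;
	idx = fs;			// loop omitted when fs == 1
     loop:
	idx -= 1;
	tmp = *ptr;			// if RW includes PM_read
	fn (tmp, rw, fs, data, vector);	// worker-specific propagation
	*ptr = tmp;			// if RW includes PM_write
	pred = idx != 0;
	ptr += sizeof (DImode);
	if (pred) goto loop;
*/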
4528 /* Worker for nvptx_warp_propagate. */
4530 static rtx
4531 warp_prop_gen (rtx reg, propagate_mask pm,
4532 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4533 bool ARG_UNUSED (vector))
4535 if (!(pm & PM_read_write))
4536 return 0;
4538 return nvptx_gen_warp_bcast (reg);
4541 /* Propagate state that is live at start of BLOCK across the vectors
4542 of a single warp. Propagation is inserted just after INSN.
4543 IS_CALL and return as for nvptx_propagate. */
4545 static bool
4546 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4548 return nvptx_propagate (is_call, block, insn, PM_read_write,
4549 warp_prop_gen, 0, false);
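/* For a 32-bit register the generated broadcast is essentially a warp
   shuffle reading lane 0, e.g. (illustrative, modulo PTX version):

	shfl.idx.b32 %r, %r, 0, 31;

   after which every lane of the warp holds lane 0's value.  */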
4552 /* Worker for nvptx_shared_propagate. */
4554 static rtx
4555 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4556 bool vector)
4558 broadcast_data_t *data = (broadcast_data_t *)data_;
4560 if (pm & PM_loop_begin)
4562 /* Starting a loop, initialize pointer. */
4563 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4565 oacc_bcast_align = MAX (oacc_bcast_align, align);
4566 data->offset = ROUND_UP (data->offset, align);
4568 data->ptr = gen_reg_rtx (Pmode);
4570 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4572 else if (pm & PM_loop_end)
4574 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4575 data->ptr = NULL_RTX;
4576 return clobber;
4578 else
4579 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4582 /* Spill or fill live state that is live at start of BLOCK. PRE_P
4583 indicates if this is just before partitioned mode (do spill), or
4584 just after it starts (do fill). Sequence is inserted just after
4585 INSN. IS_CALL and return as for nvptx_propagate. */
4587 static bool
4588 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4589 rtx_insn *insn, bool vector)
4591 broadcast_data_t data;
4593 data.base = gen_reg_rtx (Pmode);
4594 data.offset = 0;
4595 data.ptr = NULL_RTX;
4597 bool empty = nvptx_propagate (is_call, block, insn,
4598 pre_p ? PM_read : PM_write, shared_prop_gen,
4599 &data, vector);
4600 gcc_assert (empty == !data.offset);
4601 if (data.offset)
4603 rtx bcast_sym = oacc_bcast_sym;
4605 /* Stuff was emitted, initialize the base pointer now. */
4606 if (vector && nvptx_mach_max_workers () > 1)
4608 if (!cfun->machine->bcast_partition)
4610 /* It would be nice to place this register in
4611 DATA_AREA_SHARED. */
4612 cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4614 if (!cfun->machine->sync_bar)
4615 cfun->machine->sync_bar = gen_reg_rtx (SImode);
4617 bcast_sym = cfun->machine->bcast_partition;
4620 rtx init = gen_rtx_SET (data.base, bcast_sym);
4621 emit_insn_after (init, insn);
4623 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4624 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4625 ? nvptx_mach_max_workers () + 1
4626 : 1);
4628 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4629 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4631 return empty;
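/* Schematically (illustrative only), for each live register R the
   pre_p pass stores into the shared broadcast buffer and the !pre_p
   pass loads back:

	st.shared [buffer + offset], R;	// spill, pre_p
	...				// barriers emitted by callers
	ld.shared R, [buffer + offset];	// fill, !pre_p

   where the buffer is either the global oacc_bcast buffer or the
   per-function bcast_partition set up above.  */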
4634 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4635 which is an integer or a register. THREADS is the number of threads
4636 controlled by the barrier. */
4638 static rtx
4639 nvptx_cta_sync (rtx lock, int threads)
4641 return gen_nvptx_barsync (lock, GEN_INT (threads));
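/* This expands to a PTX barrier, e.g. "bar.sync 0;" for the whole CTA,
   or a numbered barrier with an explicit thread count when only part
   of the CTA participates (the exact mnemonic depends on the PTX
   version targeted).  */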
4644 #if WORKAROUND_PTXJIT_BUG
4645 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4646 real insns. */
4648 static rtx_insn *
4649 bb_first_real_insn (basic_block bb)
4651 rtx_insn *insn;
4653 /* Find the first real insn of BB. */
4654 FOR_BB_INSNS (bb, insn)
4655 if (INSN_P (insn))
4656 return insn;
4658 return 0;
4660 #endif
4662 /* Return true if INSN needs neutering. */
4664 static bool
4665 needs_neutering_p (rtx_insn *insn)
4667 if (!INSN_P (insn))
4668 return false;
4670 switch (recog_memoized (insn))
4672 case CODE_FOR_nvptx_fork:
4673 case CODE_FOR_nvptx_forked:
4674 case CODE_FOR_nvptx_joining:
4675 case CODE_FOR_nvptx_join:
4676 case CODE_FOR_nvptx_barsync:
4677 return false;
4678 default:
4679 return true;
4683 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4685 static bool
4686 verify_neutering_jumps (basic_block from,
4687 rtx_insn *vector_jump, rtx_insn *worker_jump,
4688 rtx_insn *vector_label, rtx_insn *worker_label)
4690 basic_block bb = from;
4691 rtx_insn *insn = BB_HEAD (bb);
4692 bool seen_worker_jump = false;
4693 bool seen_vector_jump = false;
4694 bool seen_worker_label = false;
4695 bool seen_vector_label = false;
4696 bool worker_neutered = false;
4697 bool vector_neutered = false;
4698 while (true)
4700 if (insn == worker_jump)
4702 seen_worker_jump = true;
4703 worker_neutered = true;
4704 gcc_assert (!vector_neutered);
4706 else if (insn == vector_jump)
4708 seen_vector_jump = true;
4709 vector_neutered = true;
4711 else if (insn == worker_label)
4713 seen_worker_label = true;
4714 gcc_assert (worker_neutered);
4715 worker_neutered = false;
4717 else if (insn == vector_label)
4719 seen_vector_label = true;
4720 gcc_assert (vector_neutered);
4721 vector_neutered = false;
4723 else if (INSN_P (insn))
4724 switch (recog_memoized (insn))
4726 case CODE_FOR_nvptx_barsync:
4727 gcc_assert (!vector_neutered && !worker_neutered);
4728 break;
4729 default:
4730 break;
4733 if (insn != BB_END (bb))
4734 insn = NEXT_INSN (insn);
4735 else if (JUMP_P (insn) && single_succ_p (bb)
4736 && !seen_vector_jump && !seen_worker_jump)
4738 bb = single_succ (bb);
4739 insn = BB_HEAD (bb);
4741 else
4742 break;
4745 gcc_assert (!(vector_jump && !seen_vector_jump));
4746 gcc_assert (!(worker_jump && !seen_worker_jump));
4748 if (seen_vector_label || seen_worker_label)
4750 gcc_assert (!(vector_label && !seen_vector_label));
4751 gcc_assert (!(worker_label && !seen_worker_label));
4753 return true;
4756 return false;
4759 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4761 static void
4762 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4763 rtx_insn *worker_label)
4765 basic_block bb = to;
4766 rtx_insn *insn = BB_END (bb);
4767 bool seen_worker_label = false;
4768 bool seen_vector_label = false;
4769 while (true)
4771 if (insn == worker_label)
4773 seen_worker_label = true;
4774 gcc_assert (!seen_vector_label);
4776 else if (insn == vector_label)
4777 seen_vector_label = true;
4778 else if (INSN_P (insn))
4779 switch (recog_memoized (insn))
4781 case CODE_FOR_nvptx_barsync:
4782 gcc_assert (!seen_vector_label && !seen_worker_label);
4783 break;
4786 if (insn != BB_HEAD (bb))
4787 insn = PREV_INSN (insn);
4788 else
4789 break;
4792 gcc_assert (!(vector_label && !seen_vector_label));
4793 gcc_assert (!(worker_label && !seen_worker_label));
4796 /* Single neutering according to MASK. FROM is the incoming block and
4797 TO is the outgoing block. These may be the same block. Insert at
4798 start of FROM:
4800 if (tid.<axis>) goto end.
4802 and insert before ending branch of TO (if there is such an insn):
4804 end:
4805 <possibly-broadcast-cond>
4806 <branch>
4808 We currently only use different FROM and TO when skipping an entire
4809 loop. We could do more if we detected superblocks. */
4811 static void
4812 nvptx_single (unsigned mask, basic_block from, basic_block to)
4814 rtx_insn *head = BB_HEAD (from);
4815 rtx_insn *tail = BB_END (to);
4816 unsigned skip_mask = mask;
4818 while (true)
4820 /* Find the first insn of the FROM block. */
4821 while (head != BB_END (from) && !needs_neutering_p (head))
4822 head = NEXT_INSN (head);
4824 if (from == to)
4825 break;
4827 if (!(JUMP_P (head) && single_succ_p (from)))
4828 break;
4830 basic_block jump_target = single_succ (from);
4831 if (!single_pred_p (jump_target))
4832 break;
4834 from = jump_target;
4835 head = BB_HEAD (from);
4838 /* Find the last insn of the TO block. */
4839 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4840 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4841 tail = PREV_INSN (tail);
4843 /* Detect if tail is a branch. */
4844 rtx tail_branch = NULL_RTX;
4845 rtx cond_branch = NULL_RTX;
4846 if (tail && INSN_P (tail))
4848 tail_branch = PATTERN (tail);
4849 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4850 tail_branch = NULL_RTX;
4851 else
4853 cond_branch = SET_SRC (tail_branch);
4854 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4855 cond_branch = NULL_RTX;
4859 if (tail == head)
4861 /* If this is empty, do nothing. */
4862 if (!head || !needs_neutering_p (head))
4863 return;
4865 if (cond_branch)
4867 /* If we're only doing vector single, there's no need to
4868 emit skip code because we'll not insert anything. */
4869 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4870 skip_mask = 0;
4872 else if (tail_branch)
4873 /* Block with only unconditional branch. Nothing to do. */
4874 return;
4877 /* Insert the vector test inside the worker test. */
4878 unsigned mode;
4879 rtx_insn *before = tail;
4880 rtx_insn *neuter_start = NULL;
4881 rtx_insn *worker_label = NULL, *vector_label = NULL;
4882 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4883 rtx_insn *warp_sync = NULL;
4884 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4885 if (GOMP_DIM_MASK (mode) & skip_mask)
4887 rtx_code_label *label = gen_label_rtx ();
4888 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4889 rtx_insn **mode_jump
4890 = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4891 rtx_insn **mode_label
4892 = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4894 if (!pred)
4896 pred = gen_reg_rtx (BImode);
4897 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4900 rtx br;
4901 if (mode == GOMP_DIM_VECTOR)
4902 br = gen_br_true (pred, label);
4903 else
4904 br = gen_br_true_uni (pred, label);
4905 if (neuter_start)
4906 neuter_start = emit_insn_after (br, neuter_start);
4907 else
4908 neuter_start = emit_insn_before (br, head);
4909 *mode_jump = neuter_start;
4911 LABEL_NUSES (label)++;
4912 rtx_insn *label_insn;
4913 if (tail_branch)
4915 label_insn = emit_label_before (label, before);
4916 if (mode == GOMP_DIM_VECTOR)
4918 if (TARGET_PTX_6_0)
4919 warp_sync = emit_insn_after (gen_nvptx_warpsync (),
4920 label_insn);
4921 else
4922 warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
4923 label_insn);
4925 before = label_insn;
4927 else
4929 label_insn = emit_label_after (label, tail);
4930 if (mode == GOMP_DIM_VECTOR)
4932 if (TARGET_PTX_6_0)
4933 warp_sync = emit_insn_after (gen_nvptx_warpsync (),
4934 label_insn);
4935 else
4936 warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
4937 label_insn);
4939 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4940 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4941 emit_insn_after (gen_exit (), label_insn);
4944 *mode_label = label_insn;
4947 /* Now deal with propagating the branch condition. */
4948 if (cond_branch)
4950 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4952 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4953 && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4955 /* Vector mode only, do a shuffle. */
4956 #if WORKAROUND_PTXJIT_BUG
4957 /* The branch condition %rcond is propagated like this:
4960 .reg .u32 %x;
4961 mov.u32 %x,%tid.x;
4962 setp.ne.u32 %rnotvzero,%x,0;
4965 @%rnotvzero bra Lskip;
4966 setp.<op>.<type> %rcond,op1,op2;
4967 Lskip:
4968 selp.u32 %rcondu32,1,0,%rcond;
4969 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4970 setp.ne.u32 %rcond,%rcondu32,0;
4972 There seems to be a bug in the ptx JIT compiler (observed at driver
4973 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4974 unless %rcond is initialized to something before 'bra Lskip'. The
4975 bug is not observed with ptxas from cuda 8.0.61.
4977 It is true that the code is non-trivial: at Lskip, %rcond is
4978 uninitialized in threads 1-31, and after the selp the same holds
4979 for %rcondu32. But shfl propagates the defined value in thread 0
4980 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4981 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4983 There is nothing in the PTX spec to suggest that this is wrong, or
4984 to explain why the extra initialization is needed. So, we classify
4985 it as a JIT bug, and the extra initialization as a workaround:
4988 .reg .u32 %x;
4989 mov.u32 %x,%tid.x;
4990 setp.ne.u32 %rnotvzero,%x,0;
4993 +.reg .pred %rcond2;
4994 +setp.eq.u32 %rcond2, 1, 0;
4996 @%rnotvzero bra Lskip;
4997 setp.<op>.<type> %rcond,op1,op2;
4998 +mov.pred %rcond2, %rcond;
4999 Lskip:
5000 +mov.pred %rcond, %rcond2;
5001 selp.u32 %rcondu32,1,0,%rcond;
5002 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
5003 setp.ne.u32 %rcond,%rcondu32,0;
5005 rtx_insn *label = PREV_INSN (tail);
5006 if (label == warp_sync)
5007 label = PREV_INSN (label);
5008 gcc_assert (label && LABEL_P (label));
5009 rtx tmp = gen_reg_rtx (BImode);
5010 emit_insn_before (gen_movbi (tmp, const0_rtx),
5011 bb_first_real_insn (from));
5012 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
5013 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
5014 #endif
5015 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
5017 else
5019 /* Includes worker mode, do spill & fill. By construction
5020 we should never have worker mode only. */
5021 broadcast_data_t data;
5022 unsigned size = GET_MODE_SIZE (SImode);
5023 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
5024 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
5025 rtx barrier = GEN_INT (0);
5026 int threads = 0;
5028 data.base = oacc_bcast_sym;
5029 data.ptr = 0;
5031 bool use_partitioning_p = (vector && !worker
5032 && nvptx_mach_max_workers () > 1
5033 && cfun->machine->bcast_partition);
5034 if (use_partitioning_p)
5036 data.base = cfun->machine->bcast_partition;
5037 barrier = cfun->machine->sync_bar;
5038 threads = nvptx_mach_vector_length ();
5040 gcc_assert (data.base != NULL);
5041 gcc_assert (barrier);
5043 unsigned int psize = ROUND_UP (size, oacc_bcast_align);
5044 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
5045 ? nvptx_mach_max_workers () + 1
5046 : 1);
5048 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
5049 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
5051 data.offset = 0;
5052 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
5053 vector),
5054 before);
5056 /* Barrier so other workers can see the write. */
5057 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
5058 data.offset = 0;
5059 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
5060 vector),
5061 tail);
5062 /* This barrier is needed to avoid worker zero clobbering
5063 the broadcast buffer before all the other workers have
5064 had a chance to read this instance of it. */
5065 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
5068 extract_insn (tail);
5069 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
5070 UNSPEC_BR_UNIFIED);
5071 validate_change (tail, recog_data.operand_loc[0], unsp, false);
5074 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
5075 vector_label, worker_label);
5076 if (!seen_label)
5077 verify_neutering_labels (to, vector_label, worker_label);
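/* For illustration, with both worker and vector neutering requested
   the rewritten region looks roughly like this (schematic PTX):

	@%rworker bra.uni Lw;	// only worker 0 proceeds
	@%rvector bra Lv;	// only vector lane 0 proceeds
	... original single-mode code ...
     Lv:			// vector lanes rejoin (plus warp sync)
     Lw:			// workers rejoin
	... broadcast of the branch condition, with barriers ...
	@%rcond bra.uni Ltarget;	// original branch, now uniform
*/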
5080 /* PAR is a parallel that is being skipped in its entirety according to
5081 MASK. Treat this as skipping a superblock starting at forked
5082 and ending at joining. */
5084 static void
5085 nvptx_skip_par (unsigned mask, parallel *par)
5087 basic_block tail = par->join_block;
5088 gcc_assert (tail->preds->length () == 1);
5090 basic_block pre_tail = (*tail->preds)[0]->src;
5091 gcc_assert (pre_tail->succs->length () == 1);
5093 nvptx_single (mask, par->forked_block, pre_tail);
5096 /* If PAR has a single inner parallel and PAR itself only contains
5097 empty entry and exit blocks, swallow the inner PAR. */
5099 static void
5100 nvptx_optimize_inner (parallel *par)
5102 parallel *inner = par->inner;
5104 /* We mustn't be the outer dummy par. */
5105 if (!par->mask)
5106 return;
5108 /* We must have a single inner par. */
5109 if (!inner || inner->next)
5110 return;
5112 /* We must only contain 2 blocks ourselves -- the head and tail of
5113 the inner par. */
5114 if (par->blocks.length () != 2)
5115 return;
5117 /* The two partitionings must be disjoint. As we only have vector and
5118 worker partitioning, this is sufficient to guarantee the pars
5119 have adjacent partitioning. */
5120 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
5121 /* This indicates malformed code generation. */
5122 return;
5124 /* The outer forked insn should be immediately followed by the inner
5125 fork insn. */
5126 rtx_insn *forked = par->forked_insn;
5127 rtx_insn *fork = BB_END (par->forked_block);
5129 if (NEXT_INSN (forked) != fork)
5130 return;
5131 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
5133 /* The outer joining insn must immediately follow the inner join
5134 insn. */
5135 rtx_insn *joining = par->joining_insn;
5136 rtx_insn *join = inner->join_insn;
5137 if (NEXT_INSN (join) != joining)
5138 return;
5140 /* Preconditions met. Swallow the inner par. */
5141 if (dump_file)
5142 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
5143 inner->mask, inner->forked_block->index,
5144 inner->join_block->index,
5145 par->mask, par->forked_block->index, par->join_block->index);
5147 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
5149 par->blocks.reserve (inner->blocks.length ());
5150 while (inner->blocks.length ())
5151 par->blocks.quick_push (inner->blocks.pop ());
5153 par->inner = inner->inner;
5154 inner->inner = NULL;
5156 delete inner;
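/* E.g. a worker par whose only blocks are the fork and join blocks of
   a single inner vector par is rewritten from

	worker { vector { body } }
   to
	worker|vector { body }

   saving one level of fork/join and of neutering.  */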
5159 /* Process the parallel PAR and all its contained
5160 parallels. We do everything but the neutering. Return mask of
5161 partitioned modes used within this parallel. */
5163 static unsigned
5164 nvptx_process_pars (parallel *par)
5166 if (nvptx_optimize)
5167 nvptx_optimize_inner (par);
5169 unsigned inner_mask = par->mask;
5171 /* Do the inner parallels first. */
5172 if (par->inner)
5174 par->inner_mask = nvptx_process_pars (par->inner);
5175 inner_mask |= par->inner_mask;
5178 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
5179 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
5180 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
5181 && nvptx_mach_vector_length () > PTX_WARP_SIZE);
5183 if (worker || large_vector)
5185 nvptx_shared_propagate (false, is_call, par->forked_block,
5186 par->forked_insn, !worker);
5187 bool no_prop_p
5188 = nvptx_shared_propagate (true, is_call, par->forked_block,
5189 par->fork_insn, !worker);
5190 bool empty_loop_p
5191 = !is_call && (NEXT_INSN (par->forked_insn)
5192 && NEXT_INSN (par->forked_insn) == par->joining_insn);
5193 rtx barrier = GEN_INT (0);
5194 int threads = 0;
5196 if (!worker && cfun->machine->sync_bar)
5198 barrier = cfun->machine->sync_bar;
5199 threads = nvptx_mach_vector_length ();
5202 if (no_prop_p && empty_loop_p)
5204 else if (no_prop_p && is_call)
5206 else
5208 /* Insert begin and end synchronizations. */
5209 emit_insn_before (nvptx_cta_sync (barrier, threads),
5210 par->forked_insn);
5211 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
5214 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
5215 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
5217 /* Now do siblings. */
5218 if (par->next)
5219 inner_mask |= nvptx_process_pars (par->next);
5220 return inner_mask;
5223 /* Neuter the parallel described by PAR. We recurse in depth-first
5224 order. MODES are the partitioning of the execution and OUTER is
5225 the partitioning of the parallels we are contained in. */
5227 static void
5228 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
5230 unsigned me = (par->mask
5231 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
5232 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5233 unsigned skip_mask = 0, neuter_mask = 0;
5235 if (par->inner)
5236 nvptx_neuter_pars (par->inner, modes, outer | me);
5238 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
5240 if ((outer | me) & GOMP_DIM_MASK (mode))
5241 {} /* Mode is partitioned: no neutering. */
5242 else if (!(modes & GOMP_DIM_MASK (mode)))
5243 {} /* Mode is not used: nothing to do. */
5244 else if (par->inner_mask & GOMP_DIM_MASK (mode)
5245 || !par->forked_insn)
5246 /* Partitioned in inner parallels, or we're not partitioned
5247 at all: neuter individual blocks. */
5248 neuter_mask |= GOMP_DIM_MASK (mode);
5249 else if (!par->parent || !par->parent->forked_insn
5250 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
5251 /* Parent isn't a parallel, or already contains this partitioning:
5252 skip the parallel at this level. */
5253 skip_mask |= GOMP_DIM_MASK (mode);
5254 else
5255 {} /* Parent will skip this parallel itself. */
5258 if (neuter_mask)
5260 int ix, len;
5262 if (nvptx_optimize)
5264 /* Neuter whole SESE regions. */
5265 bb_pair_vec_t regions;
5267 nvptx_find_sese (par->blocks, regions);
5268 len = regions.length ();
5269 for (ix = 0; ix != len; ix++)
5271 basic_block from = regions[ix].first;
5272 basic_block to = regions[ix].second;
5274 if (from)
5275 nvptx_single (neuter_mask, from, to);
5276 else
5277 gcc_assert (!to);
5280 else
5282 /* Neuter each BB individually. */
5283 len = par->blocks.length ();
5284 for (ix = 0; ix != len; ix++)
5286 basic_block block = par->blocks[ix];
5288 nvptx_single (neuter_mask, block, block);
5293 if (skip_mask)
5294 nvptx_skip_par (skip_mask, par);
5296 if (par->next)
5297 nvptx_neuter_pars (par->next, modes, outer);
5300 static void
5301 populate_offload_attrs (offload_attrs *oa)
5303 tree attr = oacc_get_fn_attrib (current_function_decl);
5304 tree dims = TREE_VALUE (attr);
5305 unsigned ix;
5307 oa->mask = 0;
5309 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
5311 tree t = TREE_VALUE (dims);
5312 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
5313 tree allowed = TREE_PURPOSE (dims);
5315 if (size != 1 && !(allowed && integer_zerop (allowed)))
5316 oa->mask |= GOMP_DIM_MASK (ix);
5318 switch (ix)
5320 case GOMP_DIM_GANG:
5321 oa->num_gangs = size;
5322 break;
5324 case GOMP_DIM_WORKER:
5325 oa->num_workers = size;
5326 break;
5328 case GOMP_DIM_VECTOR:
5329 oa->vector_length = size;
5330 break;
5335 #if WORKAROUND_PTXJIT_BUG_2
5336 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
5337 is needed in the nvptx target because the branches generated for
5338 partitioning are NONJUMP_INSN_P, not JUMP_P. */
5340 static rtx
5341 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
5343 rtx pat;
5344 if ((strict && !JUMP_P (insn))
5345 || (!strict && !INSN_P (insn)))
5346 return NULL_RTX;
5347 pat = PATTERN (insn);
5349 /* The set is allowed to appear either as the insn pattern or
5350 the first set in a PARALLEL. */
5351 if (GET_CODE (pat) == PARALLEL)
5352 pat = XVECEXP (pat, 0, 0);
5353 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
5354 return pat;
5356 return NULL_RTX;
5359 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
5361 static rtx
5362 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
5364 rtx x = nvptx_pc_set (insn, strict);
5366 if (!x)
5367 return NULL_RTX;
5368 x = SET_SRC (x);
5369 if (GET_CODE (x) == LABEL_REF)
5370 return x;
5371 if (GET_CODE (x) != IF_THEN_ELSE)
5372 return NULL_RTX;
5373 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
5374 return XEXP (x, 1);
5375 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
5376 return XEXP (x, 2);
5377 return NULL_RTX;
5380 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
5381 insn between the branch and the label. This works around a JIT bug
5382 observed at driver version 384.111, at -O0 for sm_50. */
5384 static void
5385 prevent_branch_around_nothing (void)
5387 rtx_insn *seen_label = NULL;
5388 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5390 if (INSN_P (insn) && condjump_p (insn))
5392 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
5393 continue;
5396 if (seen_label == NULL)
5397 continue;
5399 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5400 continue;
5402 if (INSN_P (insn))
5403 switch (recog_memoized (insn))
5405 case CODE_FOR_nvptx_fork:
5406 case CODE_FOR_nvptx_forked:
5407 case CODE_FOR_nvptx_joining:
5408 case CODE_FOR_nvptx_join:
5409 case CODE_FOR_nop:
5410 continue;
5411 case -1:
5412 /* Handle asm ("") and similar. */
5413 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
5414 || GET_CODE (PATTERN (insn)) == ASM_OPERANDS
5415 || (GET_CODE (PATTERN (insn)) == PARALLEL
5416 && asm_noperands (PATTERN (insn)) >= 0))
5417 continue;
5418 /* FALLTHROUGH. */
5419 default:
5420 seen_label = NULL;
5421 continue;
5424 if (LABEL_P (insn) && insn == seen_label)
5425 emit_insn_before (gen_fake_nop (), insn);
5427 seen_label = NULL;
5430 #endif
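/* Illustration of prevent_branch_around_nothing: a conditional branch
   whose target label follows immediately,

	@%p bra Lab;
     Lab:

   gains a dummy insn (via gen_fake_nop) between the branch and the
   label, so the JIT no longer sees an empty branch.  */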
5432 #ifdef WORKAROUND_PTXJIT_BUG_3
5433 /* Insert two membar.cta insns between two subsequent bar.sync insns. This
5434 works around a hang observed at driver version 390.48 for sm_50. */
5436 static void
5437 workaround_barsyncs (void)
5439 bool seen_barsync = false;
5440 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5442 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
5444 if (seen_barsync)
5446 emit_insn_before (gen_nvptx_membar_cta (), insn);
5447 emit_insn_before (gen_nvptx_membar_cta (), insn);
5450 seen_barsync = true;
5451 continue;
5454 if (!seen_barsync)
5455 continue;
5457 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5458 continue;
5459 else if (INSN_P (insn))
5460 switch (recog_memoized (insn))
5462 case CODE_FOR_nvptx_fork:
5463 case CODE_FOR_nvptx_forked:
5464 case CODE_FOR_nvptx_joining:
5465 case CODE_FOR_nvptx_join:
5466 continue;
5467 default:
5468 break;
5471 seen_barsync = false;
5474 #endif
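/* With this workaround, two back-to-back synchronization barriers

	bar.sync 0;
	bar.sync 0;

   become (illustrative)

	bar.sync 0;
	membar.cta;
	membar.cta;
	bar.sync 0;
*/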
5476 static rtx
5477 gen_comment (const char *s)
5479 const char *sep = " ";
5480 size_t len = strlen (ASM_COMMENT_START) + strlen (sep) + strlen (s) + 1;
5481 char *comment = (char *) alloca (len);
5482 snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s);
5483 return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment),
5484 DECL_SOURCE_LOCATION (cfun->decl));
5487 /* Initialize all declared regs at function entry.
5488 Advantage : Fool-proof.
5489 Disadvantage: Potentially creates a lot of long live ranges and adds a lot
5490 of insns. */
5492 static void
5493 workaround_uninit_method_1 (void)
5495 rtx_insn *first = get_insns ();
5496 rtx_insn *insert_here = NULL;
5498 for (int ix = LAST_VIRTUAL_REGISTER + 1; ix < max_reg_num (); ix++)
5500 rtx reg = regno_reg_rtx[ix];
5502 /* Skip undeclared registers. */
5503 if (reg == const0_rtx)
5504 continue;
5506 gcc_assert (CONST0_RTX (GET_MODE (reg)));
5508 start_sequence ();
5509 if (nvptx_comment && first != NULL)
5510 emit_insn (gen_comment ("Start: Added by -minit-regs=1"));
5511 emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5512 rtx_insn *inits = get_insns ();
5513 end_sequence ();
5515 if (dump_file && (dump_flags & TDF_DETAILS))
5516 for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
5517 fprintf (dump_file, "Default init of reg %u inserted: insn %u\n",
5518 ix, INSN_UID (init));
5520 if (first != NULL)
5522 insert_here = emit_insn_before (inits, first);
5523 first = NULL;
5525 else
5526 insert_here = emit_insn_after (inits, insert_here);
5529 if (nvptx_comment && insert_here != NULL)
5530 emit_insn_after (gen_comment ("End: Added by -minit-regs=1"), insert_here);
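/* E.g. with -minit-regs=1 the function entry gains one zeroing move
   per declared register, regardless of liveness (illustrative PTX):

	// Start: Added by -minit-regs=1
	mov.u32 %r23, 0;
	mov.u64 %r24, 0;
	...
	// End: Added by -minit-regs=1
*/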
5533 /* Find uses of regs that are not defined on all incoming paths, and insert a
5534 corresponding def at function entry.
5535 Advantage : Simple.
5536 Disadvantage: Potentially creates long live ranges.
5537 May not catch all cases. F.i. a clobber cuts a live range in
5538 the compiler and may prevent entry_lr_in from being set for a
5539 reg, but the clobber does not translate to a ptx insn, so in
5540 ptx there still may be an uninitialized ptx reg. See f.i.
5541 gcc.c-torture/compile/20020926-1.c. */
5543 static void
5544 workaround_uninit_method_2 (void)
5546 auto_bitmap entry_pseudo_uninit;
5548 auto_bitmap not_pseudo;
5549 bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);
5551 bitmap entry_lr_in = DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5552 bitmap_and_compl (entry_pseudo_uninit, entry_lr_in, not_pseudo);
5555 rtx_insn *first = get_insns ();
5556 rtx_insn *insert_here = NULL;
5558 bitmap_iterator iterator;
5559 unsigned ix;
5560 EXECUTE_IF_SET_IN_BITMAP (entry_pseudo_uninit, 0, ix, iterator)
5562 rtx reg = regno_reg_rtx[ix];
5563 gcc_assert (CONST0_RTX (GET_MODE (reg)));
5565 start_sequence ();
5566 if (nvptx_comment && first != NULL)
5567 emit_insn (gen_comment ("Start: Added by -minit-regs=2:"));
5568 emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5569 rtx_insn *inits = get_insns ();
5570 end_sequence ();
5572 if (dump_file && (dump_flags & TDF_DETAILS))
5573 for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
5574 fprintf (dump_file, "Missing init of reg %u inserted: insn %u\n",
5575 ix, INSN_UID (init));
5577 if (first != NULL)
5579 insert_here = emit_insn_before (inits, first);
5580 first = NULL;
5582 else
5583 insert_here = emit_insn_after (inits, insert_here);
5586 if (nvptx_comment && insert_here != NULL)
5587 emit_insn_after (gen_comment ("End: Added by -minit-regs=2"), insert_here);
5590 /* Find uses of regs that are not defined on all incoming paths, and insert a
5591 corresponding def on those.
5592 Advantage : Doesn't create long live ranges.
5593 Disadvantage: More complex, and potentially also more defs. */
5595 static void
5596 workaround_uninit_method_3 (void)
5598 auto_bitmap not_pseudo;
5599 bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);
5601 basic_block bb;
5602 FOR_EACH_BB_FN (bb, cfun)
5604 if (single_pred_p (bb))
5605 continue;
5607 auto_bitmap bb_pseudo_uninit;
5608 bitmap_and_compl (bb_pseudo_uninit, DF_LIVE_IN (bb), DF_MIR_IN (bb));
5609 bitmap_and_compl_into (bb_pseudo_uninit, not_pseudo);
5611 bitmap_iterator iterator;
5612 unsigned ix;
5613 EXECUTE_IF_SET_IN_BITMAP (bb_pseudo_uninit, 0, ix, iterator)
5615 bool have_false = false;
5616 bool have_true = false;
5618 edge e;
5619 edge_iterator ei;
5620 FOR_EACH_EDGE (e, ei, bb->preds)
5622 if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
5623 have_true = true;
5624 else
5625 have_false = true;
5627 if (have_false ^ have_true)
5628 continue;
5630 FOR_EACH_EDGE (e, ei, bb->preds)
5632 if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
5633 continue;
5635 rtx reg = regno_reg_rtx[ix];
5636 gcc_assert (CONST0_RTX (GET_MODE (reg)));
5638 start_sequence ();
5639 emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5640 rtx_insn *inits = get_insns ();
5641 end_sequence ();
5643 if (dump_file && (dump_flags & TDF_DETAILS))
5644 for (rtx_insn *init = inits; init != NULL;
5645 init = NEXT_INSN (init))
5646 fprintf (dump_file,
5647 "Missing init of reg %u inserted on edge: %d -> %d:"
5648 " insn %u\n", ix, e->src->index, e->dest->index,
5649 INSN_UID (init));
5651 insert_insn_on_edge (inits, e);
5656 if (nvptx_comment)
5657 FOR_EACH_BB_FN (bb, cfun)
5659 if (single_pred_p (bb))
5660 continue;
5662 edge e;
5663 edge_iterator ei;
5664 FOR_EACH_EDGE (e, ei, bb->preds)
5666 if (e->insns.r == NULL_RTX)
5667 continue;
5668 start_sequence ();
5669 emit_insn (gen_comment ("Start: Added by -minit-regs=3:"));
5670 emit_insn (e->insns.r);
5671 emit_insn (gen_comment ("End: Added by -minit-regs=3:"));
5672 e->insns.r = get_insns ();
5673 end_sequence ();
5677 commit_edge_insertions ();
5680 static void
5681 workaround_uninit (void)
5683 switch (nvptx_init_regs)
5685 case 0:
5686 /* Skip. */
5687 break;
5688 case 1:
5689 workaround_uninit_method_1 ();
5690 break;
5691 case 2:
5692 workaround_uninit_method_2 ();
5693 break;
5694 case 3:
5695 workaround_uninit_method_3 ();
5696 break;
5697 default:
5698 gcc_unreachable ();
5702 /* PTX-specific reorganization
5703 - Split blocks at fork and join instructions
5704 - Compute live registers
5705 - Mark now-unused registers, so function begin doesn't declare
5706 unused registers.
5707 - Insert state propagation when entering partitioned mode
5708 - Insert neutering instructions when in single mode
5709 - Replace subregs with suitable sequences.
5712 static void
5713 nvptx_reorg (void)
5715 /* We are freeing block_for_insn in the toplev to keep compatibility
5716 with old MDEP_REORGS that are not CFG based. Recompute it now. */
5717 compute_bb_for_insn ();
5719 thread_prologue_and_epilogue_insns ();
5721 /* Split blocks and record interesting unspecs. */
5722 bb_insn_map_t bb_insn_map;
5724 nvptx_split_blocks (&bb_insn_map);
5726 /* Compute live regs */
5727 df_clear_flags (DF_LR_RUN_DCE);
5728 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
5729 df_live_add_problem ();
5730 df_live_set_all_dirty ();
5731 if (nvptx_init_regs == 3)
5732 df_mir_add_problem ();
5733 df_analyze ();
5734 regstat_init_n_sets_and_refs ();
5736 if (dump_file)
5737 df_dump (dump_file);
5739 /* Mark unused regs as unused. */
5740 int max_regs = max_reg_num ();
5741 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
5742 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
5743 regno_reg_rtx[i] = const0_rtx;
5745 workaround_uninit ();
5747 /* Determine launch dimensions of the function. If it is not an
5748 offloaded function (i.e. this is a regular compiler), the
5749 function has no neutering. */
5750 tree attr = oacc_get_fn_attrib (current_function_decl);
5751 if (attr)
5753 /* If we determined this mask before RTL expansion, we could
5754 elide emission of some levels of forks and joins. */
5755 offload_attrs oa;
5757 populate_offload_attrs (&oa);
5759 /* If there is worker neutering, there must be vector
5760 neutering. Otherwise the hardware will fail. */
5761 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5762 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5764 /* Discover & process partitioned regions. */
5765 parallel *pars = nvptx_discover_pars (&bb_insn_map);
5766 nvptx_process_pars (pars);
5767 nvptx_neuter_pars (pars, oa.mask, 0);
5768 delete pars;
5771 /* Replace subregs. */
5772 nvptx_reorg_subreg ();
5774 if (TARGET_UNIFORM_SIMT)
5775 nvptx_reorg_uniform_simt ();
5777 #if WORKAROUND_PTXJIT_BUG_2
5778 prevent_branch_around_nothing ();
5779 #endif
5781 #ifdef WORKAROUND_PTXJIT_BUG_3
5782 workaround_barsyncs ();
5783 #endif
5785 regstat_free_n_sets_and_refs ();
5787 df_finish_pass (true);
5790 /* Handle a "kernel" attribute; arguments as in
5791 struct attribute_spec.handler. */
5793 static tree
5794 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5795 int ARG_UNUSED (flags), bool *no_add_attrs)
5797 tree decl = *node;
5799 if (TREE_CODE (decl) != FUNCTION_DECL)
5801 error ("%qE attribute only applies to functions", name);
5802 *no_add_attrs = true;
5804 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5806 error ("%qE attribute requires a void return type", name);
5807 *no_add_attrs = true;
5810 return NULL_TREE;
5813 /* Handle a "shared" attribute; arguments as in
5814 struct attribute_spec.handler. */
5816 static tree
5817 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5818 int ARG_UNUSED (flags), bool *no_add_attrs)
5820 tree decl = *node;
5822 if (TREE_CODE (decl) != VAR_DECL)
5824 error ("%qE attribute only applies to variables", name);
5825 *no_add_attrs = true;
5827 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5829 error ("%qE attribute not allowed with auto storage class", name);
5830 *no_add_attrs = true;
5833 return NULL_TREE;
5836 /* Table of valid machine attributes. */
5837 TARGET_GNU_ATTRIBUTES (nvptx_attribute_table,
5839 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5840 affects_type_identity, handler, exclude } */
5841 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5842 NULL },
5843 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5844 NULL }
5847 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
5849 static HOST_WIDE_INT
5850 nvptx_vector_alignment (const_tree type)
5852 unsigned HOST_WIDE_INT align;
5853 tree size = TYPE_SIZE (type);
5855 /* Ensure align is not bigger than BIGGEST_ALIGNMENT. */
5856 if (tree_fits_uhwi_p (size))
5858 align = tree_to_uhwi (size);
5859 align = MIN (align, BIGGEST_ALIGNMENT);
5861 else
5862 align = BIGGEST_ALIGNMENT;
5864 /* Ensure align is not smaller than mode alignment. */
5865 align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type)));
5867 return align;
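/* For example, a vector type whose TYPE_SIZE is 128 bits gets 128-bit
   alignment (assuming BIGGEST_ALIGNMENT is at least 128), a wider
   vector is capped at BIGGEST_ALIGNMENT, and the result is never below
   the mode's own alignment.  */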
5870 /* Indicate that INSN cannot be duplicated. */
5872 static bool
5873 nvptx_cannot_copy_insn_p (rtx_insn *insn)
5875 switch (recog_memoized (insn))
5877 case CODE_FOR_nvptx_shufflesi:
5878 case CODE_FOR_nvptx_shufflesf:
5879 case CODE_FOR_nvptx_barsync:
5880 case CODE_FOR_nvptx_fork:
5881 case CODE_FOR_nvptx_forked:
5882 case CODE_FOR_nvptx_joining:
5883 case CODE_FOR_nvptx_join:
5884 return true;
5885 default:
5886 return false;
5890 /* Section anchors do not work. Initialization for flag_section_anchor
5891 probes the existence of the anchoring target hooks and prevents
5892 anchoring if they don't exist. However, we may be being used with
5893 a host-side compiler that does support anchoring, and hence see
5894 the anchor flag set (as it's not recalculated). So provide an
5895 implementation denying anchoring. */
5897 static bool
5898 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5900 return false;
5903 /* Record a symbol for mkoffload to enter into the mapping table. */
5905 static void
5906 nvptx_record_offload_symbol (tree decl)
5908 switch (TREE_CODE (decl))
5910 case VAR_DECL:
5911 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5912 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5913 break;
5915 case FUNCTION_DECL:
5917 tree attr = oacc_get_fn_attrib (decl);
5918 /* OpenMP offloading does not set this attribute. */
5919 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5921 fprintf (asm_out_file, "//:");
5922 if (lookup_attribute ("omp declare target indirect",
5923 DECL_ATTRIBUTES (decl)))
5924 fprintf (asm_out_file, "IND_FUNC_MAP");
5925 else
5926 fprintf (asm_out_file, "FUNC_MAP");
5927 fprintf (asm_out_file, " \"%s\"",
5928 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5930 for (; dims; dims = TREE_CHAIN (dims))
5932 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5934 gcc_assert (!TREE_PURPOSE (dims));
5935 fprintf (asm_out_file, ", %#x", size);
5938 fprintf (asm_out_file, "\n");
5940 break;
5942 default:
5943 gcc_unreachable ();
5947 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5948 at the start of a file. */
5950 static void
5951 nvptx_file_start (void)
5953 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5955 fputs ("\t.version\t", asm_out_file);
5956 fputs (ptx_version_to_string ((enum ptx_version)ptx_version_option),
5957 asm_out_file);
5958 fputs ("\n", asm_out_file);
5960 fputs ("\t.target\tsm_", asm_out_file);
5961 fputs (sm_version_to_string ((enum ptx_isa)ptx_isa_option),
5962 asm_out_file);
5963 fputs ("\n", asm_out_file);
5965 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5967 fputs ("// END PREAMBLE\n", asm_out_file);
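/* The emitted preamble looks like this (version, target and address
   size all depend on the options in effect):

	// BEGIN PREAMBLE
	.version 6.0
	.target sm_30
	.address_size 64
	// END PREAMBLE
*/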
5970 /* Emit a declaration for a worker and vector-level buffer in .shared
5971 memory. */
5973 static void
5974 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5976 const char *name = XSTR (sym, 0);
5978 write_var_marker (file, true, false, name);
5979 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5980 align, name, size);
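/* E.g. for the OpenACC broadcast buffer this emits something along the
   lines of (name, alignment and size vary):

	.shared .align 8 .u8 __oacc_bcast[256];
*/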
5983 /* Write out the function declarations we've collected and declare storage
5984 for the broadcast buffer. */
5986 static void
5987 nvptx_file_end (void)
5989 hash_table<tree_hasher>::iterator iter;
5990 tree decl;
5991 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5992 nvptx_record_fndecl (decl);
5993 fputs (func_decls.str().c_str(), asm_out_file);
5995 if (oacc_bcast_size)
5996 write_shared_buffer (asm_out_file, oacc_bcast_sym,
5997 oacc_bcast_align, oacc_bcast_size);
5999 if (worker_red_size)
6000 write_shared_buffer (asm_out_file, worker_red_sym,
6001 worker_red_align, worker_red_size);
6003 if (vector_red_size)
6004 write_shared_buffer (asm_out_file, vector_red_sym,
6005 vector_red_align, vector_red_size);
6007 if (gang_private_shared_size)
6008 write_shared_buffer (asm_out_file, gang_private_shared_sym,
6009 gang_private_shared_align, gang_private_shared_size);
6011 if (need_softstack_decl)
6013 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
6014 /* 32 is the maximum number of warps in a block. Even though it's an
6015 external declaration, emit the array size explicitly; otherwise, it
6016 may fail at PTX JIT time if the definition is later in link order. */
6017 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
6018 POINTER_SIZE);
6020 if (need_unisimt_decl)
6022 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
6023 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
6027 /* Expander for the shuffle builtins. */
6029 static rtx
6030 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
6032 if (ignore)
6033 return target;
6035 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
6036 NULL_RTX, mode, EXPAND_NORMAL);
6037 if (!REG_P (src))
6038 src = copy_to_mode_reg (mode, src);
6040 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
6041 NULL_RTX, SImode, EXPAND_NORMAL);
6042 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
6043 NULL_RTX, SImode, EXPAND_NORMAL);
6045 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
6046 idx = copy_to_mode_reg (SImode, idx);
6048 rtx pat = nvptx_gen_shuffle (target, src, idx,
6049 (nvptx_shuffle_kind) INTVAL (op));
6050 if (pat)
6051 emit_insn (pat);
6053 return target;
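/* Usage sketch: a warp-level reduction emits calls such as

     tmp = __builtin_nvptx_shuffle (val, 16, SHUFFLE_DOWN);

   which this expander turns into a PTX shfl.down-style instruction;
   nvptx_gen_shuffle handles the 64-bit shufflell variant by splitting it
   into two 32-bit shuffles.  The shift amount 16 is just an example.  */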
6056 /* Expander for the bit reverse builtins. */
6058 static rtx
6059 nvptx_expand_brev (tree exp, rtx target, machine_mode mode, int ignore)
6061 if (ignore)
6062 return target;
6064 rtx arg = expand_expr (CALL_EXPR_ARG (exp, 0),
6065 NULL_RTX, mode, EXPAND_NORMAL);
6066 if (!REG_P (arg))
6067 arg = copy_to_mode_reg (mode, arg);
6068 if (!target)
6069 target = gen_reg_rtx (mode);
6070 rtx pat;
6071 if (mode == SImode)
6072 pat = gen_bitrevsi2 (target, arg);
6073 else
6074 pat = gen_bitrevdi2 (target, arg);
6075 emit_insn (pat);
6076 return target;
6079 const char *
6080 nvptx_output_red_partition (rtx dst, rtx offset)
6082 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
6083 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
6085 if (offset == const0_rtx)
6086 fprintf (asm_out_file, zero_offset, REGNO (dst),
6087 REGNO (cfun->machine->red_partition));
6088 else
6089 fprintf (asm_out_file, with_offset, REGNO (dst),
6090 REGNO (cfun->machine->red_partition), UINTVAL (offset));
6092 return "";
6095 /* Shared-memory reduction address expander. */
6097 static rtx
6098 nvptx_expand_shared_addr (tree exp, rtx target,
6099 machine_mode ARG_UNUSED (mode), int ignore,
6100 int vector)
6102 if (ignore)
6103 return target;
6105 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
6106 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
6107 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
6108 rtx addr = worker_red_sym;
6110 if (vector)
6112 offload_attrs oa;
6114 populate_offload_attrs (&oa);
6116 unsigned int psize = ROUND_UP (size + offset, align);
6117 unsigned int pnum = nvptx_mach_max_workers ();
6118 vector_red_partition = MAX (vector_red_partition, psize);
6119 vector_red_size = MAX (vector_red_size, psize * pnum);
6120 vector_red_align = MAX (vector_red_align, align);
6122 if (cfun->machine->red_partition == NULL)
6123 cfun->machine->red_partition = gen_reg_rtx (Pmode);
6125 addr = gen_reg_rtx (Pmode);
6126 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
6128 else
6130 worker_red_align = MAX (worker_red_align, align);
6131 worker_red_size = MAX (worker_red_size, size + offset);
6133 if (offset)
6135 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
6136 addr = gen_rtx_CONST (Pmode, addr);
6140 emit_move_insn (target, addr);
6141 return target;
6144 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
6145 not require taking the address of any object, other than the memory
6146 cell being operated on. */
6148 static rtx
6149 nvptx_expand_cmp_swap (tree exp, rtx target,
6150 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
6152 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
6154 if (!target)
6155 target = gen_reg_rtx (mode);
6157 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
6158 NULL_RTX, Pmode, EXPAND_NORMAL);
6159 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
6160 NULL_RTX, mode, EXPAND_NORMAL);
6161 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
6162 NULL_RTX, mode, EXPAND_NORMAL);
6163 rtx pat;
6165 mem = gen_rtx_MEM (mode, mem);
6166 if (!REG_P (cmp))
6167 cmp = copy_to_mode_reg (mode, cmp);
6168 if (!REG_P (src))
6169 src = copy_to_mode_reg (mode, src);
6171 if (mode == SImode)
6172 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
6173 else
6174 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
6176 emit_insn (pat);
6178 return target;
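/* A sketch of the mapping: __builtin_nvptx_cmp_swap (ptr, cmp, src)
   expands to a PTX atomic compare-and-swap along the lines of

     atom.cas.b32	%ret, [ptr], %cmp, %src;

   returning the value previously stored at *ptr; the 64-bit cmp_swapll
   variant uses atom.cas.b64.  */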
6182 /* Codes for all the NVPTX builtins. */
6183 enum nvptx_builtins
6185 NVPTX_BUILTIN_SHUFFLE,
6186 NVPTX_BUILTIN_SHUFFLELL,
6187 NVPTX_BUILTIN_WORKER_ADDR,
6188 NVPTX_BUILTIN_VECTOR_ADDR,
6189 NVPTX_BUILTIN_CMP_SWAP,
6190 NVPTX_BUILTIN_CMP_SWAPLL,
6191 NVPTX_BUILTIN_MEMBAR_GL,
6192 NVPTX_BUILTIN_MEMBAR_CTA,
6193 NVPTX_BUILTIN_BAR_RED_AND,
6194 NVPTX_BUILTIN_BAR_RED_OR,
6195 NVPTX_BUILTIN_BAR_RED_POPC,
6196 NVPTX_BUILTIN_BREV,
6197 NVPTX_BUILTIN_BREVLL,
6198 NVPTX_BUILTIN_MAX
6201 /* Expander for 'bar.red' instruction builtins. */
6203 static rtx
6204 nvptx_expand_bar_red (tree exp, rtx target,
6205 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
6207 int code = DECL_MD_FUNCTION_CODE (TREE_OPERAND (CALL_EXPR_FN (exp), 0));
6208 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
6210 if (!target)
6211 target = gen_reg_rtx (mode);
6213 rtx pred, dst;
6214 rtx bar = expand_expr (CALL_EXPR_ARG (exp, 0),
6215 NULL_RTX, SImode, EXPAND_NORMAL);
6216 rtx nthr = expand_expr (CALL_EXPR_ARG (exp, 1),
6217 NULL_RTX, SImode, EXPAND_NORMAL);
6218 rtx cpl = expand_expr (CALL_EXPR_ARG (exp, 2),
6219 NULL_RTX, SImode, EXPAND_NORMAL);
6220 rtx redop = expand_expr (CALL_EXPR_ARG (exp, 3),
6221 NULL_RTX, SImode, EXPAND_NORMAL);
6222 if (CONST_INT_P (bar))
6224 if (INTVAL (bar) < 0 || INTVAL (bar) > 15)
6226 error_at (EXPR_LOCATION (exp),
6227 "barrier value must be within [0,15]");
6228 return const0_rtx;
6231 else if (!REG_P (bar))
6232 bar = copy_to_mode_reg (SImode, bar);
6234 if (!CONST_INT_P (nthr) && !REG_P (nthr))
6235 nthr = copy_to_mode_reg (SImode, nthr);
6237 if (!CONST_INT_P (cpl))
6239 error_at (EXPR_LOCATION (exp),
6240 "complement argument must be constant");
6241 return const0_rtx;
6244 pred = gen_reg_rtx (BImode);
6245 if (!REG_P (redop))
6246 redop = copy_to_mode_reg (SImode, redop);
6247 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, redop, GEN_INT (0))));
6248 redop = pred;
6250 rtx pat;
6251 switch (code)
6253 case NVPTX_BUILTIN_BAR_RED_AND:
6254 dst = gen_reg_rtx (BImode);
6255 pat = gen_nvptx_barred_and (dst, bar, nthr, cpl, redop);
6256 break;
6257 case NVPTX_BUILTIN_BAR_RED_OR:
6258 dst = gen_reg_rtx (BImode);
6259 pat = gen_nvptx_barred_or (dst, bar, nthr, cpl, redop);
6260 break;
6261 case NVPTX_BUILTIN_BAR_RED_POPC:
6262 dst = gen_reg_rtx (SImode);
6263 pat = gen_nvptx_barred_popc (dst, bar, nthr, cpl, redop);
6264 break;
6265 default:
6266 gcc_unreachable ();
6268 emit_insn (pat);
6269 if (GET_MODE (dst) == BImode)
6271 rtx tmp = gen_reg_rtx (mode);
6272 emit_insn (gen_rtx_SET (tmp, gen_rtx_NE (mode, dst, GEN_INT (0))));
6273 dst = tmp;
6275 emit_move_insn (target, dst);
6276 return target;
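/* For instance (illustrative arguments),

     n = __builtin_nvptx_bar_red_popc (0, 32, 0, pred);

   synchronizes 32 threads at barrier 0 and returns how many of them had a
   non-zero reduction operand, roughly via a PTX bar.red.popc.u32
   instruction; the third argument is the complement flag checked above.  */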
6279 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
6281 /* Return the NVPTX builtin for CODE. */
6283 static tree
6284 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
6286 if (code >= NVPTX_BUILTIN_MAX)
6287 return error_mark_node;
6289 return nvptx_builtin_decls[code];
6292 /* Set up all builtin functions for this target. */
6294 static void
6295 nvptx_init_builtins (void)
6297 #define DEF(ID, NAME, T) \
6298 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
6299 = add_builtin_function ("__builtin_nvptx_" NAME, \
6300 build_function_type_list T, \
6301 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
6302 #define ST sizetype
6303 #define UINT unsigned_type_node
6304 #define LLUINT long_long_unsigned_type_node
6305 #define PTRVOID ptr_type_node
6306 #define VOID void_type_node
6308 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
6309 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
6310 DEF (WORKER_ADDR, "worker_addr",
6311 (PTRVOID, ST, UINT, UINT, NULL_TREE));
6312 DEF (VECTOR_ADDR, "vector_addr",
6313 (PTRVOID, ST, UINT, UINT, NULL_TREE));
6314 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
6315 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
6316 DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE));
6317 DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE));
6319 DEF (BAR_RED_AND, "bar_red_and",
6320 (UINT, UINT, UINT, UINT, UINT, NULL_TREE));
6321 DEF (BAR_RED_OR, "bar_red_or",
6322 (UINT, UINT, UINT, UINT, UINT, NULL_TREE));
6323 DEF (BAR_RED_POPC, "bar_red_popc",
6324 (UINT, UINT, UINT, UINT, UINT, NULL_TREE));
6326 DEF (BREV, "brev", (UINT, UINT, NULL_TREE));
6327 DEF (BREVLL, "brevll", (LLUINT, LLUINT, NULL_TREE));
6329 #undef DEF
6330 #undef ST
6331 #undef UINT
6332 #undef LLUINT
6333 #undef PTRVOID
6336 /* Expand an expression EXP that calls a built-in function,
6337 with result going to TARGET if that's convenient
6338 (and in mode MODE if that's convenient).
6339 SUBTARGET may be used as the target for computing one of EXP's operands.
6340 IGNORE is nonzero if the value is to be ignored. */
6342 static rtx
6343 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
6344 machine_mode mode, int ignore)
6346 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
6347 switch (DECL_MD_FUNCTION_CODE (fndecl))
6349 case NVPTX_BUILTIN_SHUFFLE:
6350 case NVPTX_BUILTIN_SHUFFLELL:
6351 return nvptx_expand_shuffle (exp, target, mode, ignore);
6353 case NVPTX_BUILTIN_WORKER_ADDR:
6354 return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
6356 case NVPTX_BUILTIN_VECTOR_ADDR:
6357 return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
6359 case NVPTX_BUILTIN_CMP_SWAP:
6360 case NVPTX_BUILTIN_CMP_SWAPLL:
6361 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
6363 case NVPTX_BUILTIN_MEMBAR_GL:
6364 emit_insn (gen_nvptx_membar_gl ());
6365 return NULL_RTX;
6367 case NVPTX_BUILTIN_MEMBAR_CTA:
6368 emit_insn (gen_nvptx_membar_cta ());
6369 return NULL_RTX;
6371 case NVPTX_BUILTIN_BAR_RED_AND:
6372 case NVPTX_BUILTIN_BAR_RED_OR:
6373 case NVPTX_BUILTIN_BAR_RED_POPC:
6374 return nvptx_expand_bar_red (exp, target, mode, ignore);
6376 case NVPTX_BUILTIN_BREV:
6377 case NVPTX_BUILTIN_BREVLL:
6378 return nvptx_expand_brev (exp, target, mode, ignore);
6380 default: gcc_unreachable ();
6384 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
6386 static int
6387 nvptx_simt_vf ()
6389 return PTX_WARP_SIZE;
6392 /* Return 1 if TRAIT NAME is present in the OpenMP context's
6393 device trait set, return 0 if not present in any OpenMP context in the
6394 whole translation unit, or -1 if not present in the current OpenMP context
6395 but might be present in another OpenMP context in the same TU. */
6397 int
6398 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
6399 const char *name)
6401 switch (trait)
6403 case omp_device_kind:
6404 return strcmp (name, "gpu") == 0;
6405 case omp_device_arch:
6406 return strcmp (name, "nvptx") == 0;
6407 case omp_device_isa:
6408 #define NVPTX_SM(XX, SEP) \
6410 if (strcmp (name, "sm_" #XX) == 0) \
6411 return ptx_isa_option == PTX_ISA_SM ## XX; \
6413 #include "nvptx-sm.def"
6414 #undef NVPTX_SM
6415 return 0;
6416 default:
6417 gcc_unreachable ();
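/* For example, when compiling with -misa=sm_70, an OpenMP context selector
   such as device={isa("sm_70")} matches (1 is returned), while
   device={isa("sm_53")} definitively does not match for this compilation
   (0 is returned).  */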
6421 static bool
6422 nvptx_welformed_vector_length_p (int l)
6424 gcc_assert (l > 0);
6425 return l % PTX_WARP_SIZE == 0;
6428 static void
6429 nvptx_apply_dim_limits (int dims[])
6431 /* Check that the vector_length is not too large. */
6432 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
6433 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
6435 /* Check that the number of workers is not too large. */
6436 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
6437 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
6439 /* Ensure that num_worker * vector_length <= cta size. */
6440 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
6441 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
6442 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6444 /* If we need a per-worker barrier ...  */
6445 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
6446 && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
6447 /* Don't use more barriers than available. */
6448 dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
6449 PTX_NUM_PER_WORKER_BARRIERS);
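/* Worked example: num_workers (32) with vector_length (64) passes the
   individual caps, but 32 * 64 > PTX_CTA_SIZE (1024), so the vector length
   is reset to PTX_WARP_SIZE (32).  By contrast, num_workers (16) with
   vector_length (64) fits the CTA exactly (16 * 64 == 1024), but vectors
   longer than a warp each need a per-worker barrier, so the worker count
   is capped at PTX_NUM_PER_WORKER_BARRIERS (15).  */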
6452 /* Return true if FNDECL contains calls to vector-partitionable routines. */
6454 static bool
6455 has_vector_partitionable_routine_calls_p (tree fndecl)
6457 if (!fndecl)
6458 return false;
6460 basic_block bb;
6461 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
6462 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
6463 gsi_next_nondebug (&i))
6465 gimple *stmt = gsi_stmt (i);
6466 if (gimple_code (stmt) != GIMPLE_CALL)
6467 continue;
6469 tree callee = gimple_call_fndecl (stmt);
6470 if (!callee)
6471 continue;
6473 tree attrs = oacc_get_fn_attrib (callee);
6474 if (attrs == NULL_TREE)
6475 return false;
6477 int partition_level = oacc_fn_attrib_level (attrs);
6478 bool seq_routine_p = partition_level == GOMP_DIM_MAX;
6479 if (!seq_routine_p)
6480 return true;
6483 return false;
6486 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
6487 DIMS has changed. */
6489 static void
6490 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
6492 bool oacc_default_dims_p = false;
6493 bool oacc_min_dims_p = false;
6494 bool offload_region_p = false;
6495 bool routine_p = false;
6496 bool routine_seq_p = false;
6497 int default_vector_length = -1;
6499 if (decl == NULL_TREE)
6501 if (fn_level == -1)
6502 oacc_default_dims_p = true;
6503 else if (fn_level == -2)
6504 oacc_min_dims_p = true;
6505 else
6506 gcc_unreachable ();
6508 else if (fn_level == -1)
6509 offload_region_p = true;
6510 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
6512 routine_p = true;
6513 routine_seq_p = fn_level == GOMP_DIM_MAX;
6515 else
6516 gcc_unreachable ();
6518 if (oacc_min_dims_p)
6520 gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
6521 gcc_assert (dims[GOMP_DIM_WORKER] == 1);
6522 gcc_assert (dims[GOMP_DIM_GANG] == 1);
6524 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6525 return;
6528 if (routine_p)
6530 if (!routine_seq_p)
6531 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6533 return;
6536 if (oacc_default_dims_p)
6538 /* -1 : not set
6539 0 : set at runtime, f.i. -fopenacc-dims=-
6540 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
6541 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
6542 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
6543 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
6545 /* But -fopenacc-dims=- is not yet supported on trunk. */
6546 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
6547 gcc_assert (dims[GOMP_DIM_WORKER] != 0);
6548 gcc_assert (dims[GOMP_DIM_GANG] != 0);
6551 if (offload_region_p)
6553 /* -1 : not set
6554 0 : set using variable, f.i. num_gangs (n)
6555 >= 1: set using constant, f.i. num_gangs (1). */
6556 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
6557 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
6558 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
6561 if (offload_region_p)
6562 default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
6563 else
6564 /* oacc_default_dims_p. */
6565 default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
6567 int old_dims[GOMP_DIM_MAX];
6568 unsigned int i;
6569 for (i = 0; i < GOMP_DIM_MAX; ++i)
6570 old_dims[i] = dims[i];
6572 const char *vector_reason = NULL;
6573 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
6575 default_vector_length = PTX_WARP_SIZE;
6577 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
6579 vector_reason = G_("using %<vector_length (%d)%> due to call to"
6580 " vector-partitionable routine, ignoring %d");
6581 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6585 if (dims[GOMP_DIM_VECTOR] == 0)
6587 vector_reason = G_("using %<vector_length (%d)%>, ignoring runtime setting");
6588 dims[GOMP_DIM_VECTOR] = default_vector_length;
6591 if (dims[GOMP_DIM_VECTOR] > 0
6592 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
6593 dims[GOMP_DIM_VECTOR] = default_vector_length;
6595 nvptx_apply_dim_limits (dims);
6597 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
6598 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
6599 vector_reason != NULL
6600 ? vector_reason
6601 : G_("using %<vector_length (%d)%>, ignoring %d"),
6602 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
6604 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
6605 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
6606 G_("using %<num_workers (%d)%>, ignoring %d"),
6607 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
6609 if (oacc_default_dims_p)
6611 if (dims[GOMP_DIM_VECTOR] < 0)
6612 dims[GOMP_DIM_VECTOR] = default_vector_length;
6613 if (dims[GOMP_DIM_WORKER] < 0)
6614 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
6615 if (dims[GOMP_DIM_GANG] < 0)
6616 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
6617 nvptx_apply_dim_limits (dims);
6620 if (offload_region_p)
6622 for (i = 0; i < GOMP_DIM_MAX; i++)
6624 if (!(dims[i] < 0))
6625 continue;
6627 if ((used & GOMP_DIM_MASK (i)) == 0)
6628 /* Function oacc_validate_dims will apply the minimal dimension. */
6629 continue;
6631 dims[i] = (i == GOMP_DIM_VECTOR
6632 ? default_vector_length
6633 : oacc_get_default_dim (i));
6636 nvptx_apply_dim_limits (dims);
6640 /* Validate compute dimensions of an OpenACC offload or routine, fill
6641 in non-unity defaults. FN_LEVEL indicates the level at which a
6642 routine might spawn a loop. It is negative for non-routines. If
6643 DECL is null, we are validating the default dimensions. */
6645 static bool
6646 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
6648 int old_dims[GOMP_DIM_MAX];
6649 unsigned int i;
6651 for (i = 0; i < GOMP_DIM_MAX; ++i)
6652 old_dims[i] = dims[i];
6654 nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
6656 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
6657 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
6658 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
6660 for (i = 0; i < GOMP_DIM_MAX; ++i)
6661 if (old_dims[i] != dims[i])
6662 return true;
6664 return false;
6667 /* Return maximum dimension size, or zero for unbounded. */
6669 static int
6670 nvptx_dim_limit (int axis)
6672 switch (axis)
6674 case GOMP_DIM_VECTOR:
6675 return PTX_MAX_VECTOR_LENGTH;
6677 default:
6678 break;
6680 return 0;
6683 /* Determine whether fork & joins are needed. */
6685 static bool
6686 nvptx_goacc_fork_join (gcall *call, const int dims[],
6687 bool ARG_UNUSED (is_fork))
6689 tree arg = gimple_call_arg (call, 2);
6690 unsigned axis = TREE_INT_CST_LOW (arg);
6692 /* We only care about worker and vector partitioning. */
6693 if (axis < GOMP_DIM_WORKER)
6694 return false;
6696 /* If the size is 1, there's no partitioning. */
6697 if (dims[axis] == 1)
6698 return false;
6700 return true;
6703 /* Generate a PTX builtin function call that returns the address in
6704 the worker reduction buffer at OFFSET. TYPE is the type of the
6705 data at that location. */
6707 static tree
6708 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
6710 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
6711 if (vector)
6712 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
6713 machine_mode mode = TYPE_MODE (type);
6714 tree fndecl = nvptx_builtin_decl (addr_dim, true);
6715 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
6716 tree align = build_int_cst (unsigned_type_node,
6717 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
6718 tree call = build_call_expr (fndecl, 3, offset, size, align);
6720 return fold_convert (build_pointer_type (type), call);
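/* The resulting tree is essentially

     (T *) __builtin_nvptx_worker_addr (offset, size, align)

   (or the vector_addr variant), with size and alignment derived from
   TYPE's machine mode as computed above.  */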
6723 /* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR. This function
6724 will cast the variable if necessary. */
6726 static void
6727 nvptx_generate_vector_shuffle (location_t loc,
6728 tree dest_var, tree var, unsigned shift,
6729 gimple_seq *seq)
6731 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
6732 tree_code code = NOP_EXPR;
6733 tree arg_type = unsigned_type_node;
6734 tree var_type = TREE_TYPE (var);
6735 tree dest_type = var_type;
6737 if (TREE_CODE (var_type) == COMPLEX_TYPE)
6738 var_type = TREE_TYPE (var_type);
6740 if (SCALAR_FLOAT_TYPE_P (var_type))
6741 code = VIEW_CONVERT_EXPR;
6743 if (TYPE_SIZE (var_type)
6744 == TYPE_SIZE (long_long_unsigned_type_node))
6746 fn = NVPTX_BUILTIN_SHUFFLELL;
6747 arg_type = long_long_unsigned_type_node;
6750 tree call = nvptx_builtin_decl (fn, true);
6751 tree bits = build_int_cst (unsigned_type_node, shift);
6752 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
6753 tree expr;
6755 if (var_type != dest_type)
6757 /* Do real and imaginary parts separately. */
6758 tree real = fold_build1 (REALPART_EXPR, var_type, var);
6759 real = fold_build1 (code, arg_type, real);
6760 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
6761 real = fold_build1 (code, var_type, real);
6763 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
6764 imag = fold_build1 (code, arg_type, imag);
6765 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
6766 imag = fold_build1 (code, var_type, imag);
6768 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
6770 else
6772 expr = fold_build1 (code, arg_type, var);
6773 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
6774 expr = fold_build1 (code, dest_type, expr);
6777 gimplify_assign (dest_var, expr, seq);
6780 /* Lazily generate the global lock var decl and return its address. */
6782 static tree
6783 nvptx_global_lock_addr ()
6785 tree v = global_lock_var;
6787 if (!v)
6789 tree name = get_identifier ("__reduction_lock");
6790 tree type = build_qualified_type (unsigned_type_node,
6791 TYPE_QUAL_VOLATILE);
6792 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
6793 global_lock_var = v;
6794 DECL_ARTIFICIAL (v) = 1;
6795 DECL_EXTERNAL (v) = 1;
6796 TREE_STATIC (v) = 1;
6797 TREE_PUBLIC (v) = 1;
6798 TREE_USED (v) = 1;
6799 mark_addressable (v);
6800 mark_decl_referenced (v);
6803 return build_fold_addr_expr (v);
6806 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
6807 GSI. We use a lockless scheme for nearly all cases, which looks
6808 like:
6809 actual = initval(OP);
6810 do {
6811 guess = actual;
6812 write = guess OP myval;
6813 actual = cmp&swap (ptr, guess, write)
6814 } while (actual bit-different-to guess);
6815 return write;
6817 This relies on a cmp&swap instruction, which is available for 32-
6818 and 64-bit types. Larger types must use a locking scheme. */
6820 static tree
6821 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
6822 tree ptr, tree var, tree_code op)
6824 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
6825 tree_code code = NOP_EXPR;
6826 tree arg_type = unsigned_type_node;
6827 tree var_type = TREE_TYPE (var);
6829 if (TREE_CODE (var_type) == COMPLEX_TYPE
6830 || SCALAR_FLOAT_TYPE_P (var_type))
6831 code = VIEW_CONVERT_EXPR;
6833 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
6835 arg_type = long_long_unsigned_type_node;
6836 fn = NVPTX_BUILTIN_CMP_SWAPLL;
6839 tree swap_fn = nvptx_builtin_decl (fn, true);
6841 gimple_seq init_seq = NULL;
6842 tree init_var = make_ssa_name (arg_type);
6843 tree init_expr = omp_reduction_init_op (loc, op, var_type);
6844 init_expr = fold_build1 (code, arg_type, init_expr);
6845 gimplify_assign (init_var, init_expr, &init_seq);
6846 gimple *init_end = gimple_seq_last (init_seq);
6848 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
6850 /* Split the block just after the init stmts. */
6851 basic_block pre_bb = gsi_bb (*gsi);
6852 edge pre_edge = split_block (pre_bb, init_end);
6853 basic_block loop_bb = pre_edge->dest;
6854 pre_bb = pre_edge->src;
6855 /* Reset the iterator. */
6856 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6858 tree expect_var = make_ssa_name (arg_type);
6859 tree actual_var = make_ssa_name (arg_type);
6860 tree write_var = make_ssa_name (arg_type);
6862 /* Build and insert the reduction calculation. */
6863 gimple_seq red_seq = NULL;
6864 tree write_expr = fold_build1 (code, var_type, expect_var);
6865 write_expr = fold_build2 (op, var_type, write_expr, var);
6866 write_expr = fold_build1 (code, arg_type, write_expr);
6867 gimplify_assign (write_var, write_expr, &red_seq);
6869 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6871 /* Build & insert the cmp&swap sequence. */
6872 gimple_seq latch_seq = NULL;
6873 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
6874 ptr, expect_var, write_var);
6875 gimplify_assign (actual_var, swap_expr, &latch_seq);
6877 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
6878 NULL_TREE, NULL_TREE);
6879 gimple_seq_add_stmt (&latch_seq, cond);
6881 gimple *latch_end = gimple_seq_last (latch_seq);
6882 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
6884 /* Split the block just after the latch stmts. */
6885 edge post_edge = split_block (loop_bb, latch_end);
6886 basic_block post_bb = post_edge->dest;
6887 loop_bb = post_edge->src;
6888 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6890 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6891 post_edge->probability = profile_probability::even ();
6892 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
6893 loop_edge->probability = profile_probability::even ();
6894 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
6895 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
6897 gphi *phi = create_phi_node (expect_var, loop_bb);
6898 add_phi_arg (phi, init_var, pre_edge, loc);
6899 add_phi_arg (phi, actual_var, loop_edge, loc);
6901 loop *loop = alloc_loop ();
6902 loop->header = loop_bb;
6903 loop->latch = loop_bb;
6904 add_loop (loop, loop_bb->loop_father);
6906 return fold_build1 (code, var_type, write_var);
6909 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
6910 GSI. This is necessary for types larger than 64 bits, where there
6911 is no cmp&swap instruction to implement a lockless scheme. We use
6912 a lock variable in global memory.
6914 while (cmp&swap (&lock_var, 0, 1))
6915 continue;
6916 T accum = *ptr;
6917 accum = accum OP var;
6918 *ptr = accum;
6919 cmp&swap (&lock_var, 1, 0);
6920 return accum;
6922 A lock in global memory is necessary to force execution engine
6923 descheduling and avoid resource starvation that can occur if the
6924 lock is in .shared memory. */
6926 static tree
6927 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6928 tree ptr, tree var, tree_code op, int level)
6930 tree var_type = TREE_TYPE (var);
6931 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6932 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6933 tree uns_locked = build_int_cst (unsigned_type_node, 1);
6935 /* Split the block just before the gsi. Insert a gimple nop to make
6936 this easier. */
6937 gimple *nop = gimple_build_nop ();
6938 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6939 basic_block entry_bb = gsi_bb (*gsi);
6940 edge entry_edge = split_block (entry_bb, nop);
6941 basic_block lock_bb = entry_edge->dest;
6942 /* Reset the iterator. */
6943 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6945 /* Build and insert the locking sequence. */
6946 gimple_seq lock_seq = NULL;
6947 tree lock_var = make_ssa_name (unsigned_type_node);
6948 tree lock_expr = nvptx_global_lock_addr ();
6949 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6950 uns_unlocked, uns_locked);
6951 gimplify_assign (lock_var, lock_expr, &lock_seq);
6952 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6953 NULL_TREE, NULL_TREE);
6954 gimple_seq_add_stmt (&lock_seq, cond);
6955 gimple *lock_end = gimple_seq_last (lock_seq);
6956 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6958 /* Split the block just after the lock sequence. */
6959 edge locked_edge = split_block (lock_bb, lock_end);
6960 basic_block update_bb = locked_edge->dest;
6961 lock_bb = locked_edge->src;
6962 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6964 /* Create the lock loop ... */
6965 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6966 locked_edge->probability = profile_probability::even ();
6967 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6968 loop_edge->probability = profile_probability::even ();
6969 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6970 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6972 /* ... and the loop structure. */
6973 loop *lock_loop = alloc_loop ();
6974 lock_loop->header = lock_bb;
6975 lock_loop->latch = lock_bb;
6976 lock_loop->nb_iterations_estimate = 1;
6977 lock_loop->any_estimate = true;
6978 add_loop (lock_loop, entry_bb->loop_father);
6980 /* Build the pre-barrier. */
6981 gimple_seq red_seq = NULL;
6982 enum nvptx_builtins barrier_builtin
6983 = (level == GOMP_DIM_GANG
6984 ? NVPTX_BUILTIN_MEMBAR_GL
6985 : NVPTX_BUILTIN_MEMBAR_CTA);
6986 tree barrier_fn = nvptx_builtin_decl (barrier_builtin, true);
6987 tree barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
6988 gimplify_stmt (&barrier_expr, &red_seq);
6990 /* Build the reduction calculation. */
6991 tree acc_in = make_ssa_name (var_type);
6992 tree ref_in = build_simple_mem_ref (ptr);
6993 TREE_THIS_VOLATILE (ref_in) = 1;
6994 gimplify_assign (acc_in, ref_in, &red_seq);
6996 tree acc_out = make_ssa_name (var_type);
6997 tree update_expr = fold_build2 (op, var_type, ref_in, var);
6998 gimplify_assign (acc_out, update_expr, &red_seq);
7000 tree ref_out = build_simple_mem_ref (ptr);
7001 TREE_THIS_VOLATILE (ref_out) = 1;
7002 gimplify_assign (ref_out, acc_out, &red_seq);
7004 /* Build the post-barrier. */
7005 barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
7006 gimplify_stmt (&barrier_expr, &red_seq);
7008 /* Insert the reduction calculation. */
7009 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
7011 /* Build & insert the unlock sequence. */
7012 gimple_seq unlock_seq = NULL;
7013 tree unlock_expr = nvptx_global_lock_addr ();
7014 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
7015 uns_locked, uns_unlocked);
7016 gimplify_and_add (unlock_expr, &unlock_seq);
7017 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
7019 return acc_out;
7022 /* Emit a sequence to update a reduction accumulator at *PTR with the
7023 value held in VAR using operator OP. Return the updated value.
7025 TODO: optimize for atomic ops and independent complex ops. */
7027 static tree
7028 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
7029 tree ptr, tree var, tree_code op, int level)
7031 tree type = TREE_TYPE (var);
7032 tree size = TYPE_SIZE (type);
7034 if (size == TYPE_SIZE (unsigned_type_node)
7035 || size == TYPE_SIZE (long_long_unsigned_type_node))
7036 return nvptx_lockless_update (loc, gsi, ptr, var, op);
7037 else
7038 return nvptx_lockfull_update (loc, gsi, ptr, var, op, level);
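/* So, for example, a reduction on a 32-bit int or a 64-bit double takes the
   lockless compare-and-swap path above, whereas a 128-bit type such as
   complex double must fall back to the slower global-lock scheme.  */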
7041 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
7043 static void
7044 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
7046 gimple_stmt_iterator gsi = gsi_for_stmt (call);
7047 tree lhs = gimple_call_lhs (call);
7048 tree var = gimple_call_arg (call, 2);
7049 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7050 gimple_seq seq = NULL;
7052 push_gimplify_context (true);
7054 if (level != GOMP_DIM_GANG)
7056 /* Copy the receiver object. */
7057 tree ref_to_res = gimple_call_arg (call, 1);
7059 if (!integer_zerop (ref_to_res))
7060 var = build_simple_mem_ref (ref_to_res);
7063 if (level == GOMP_DIM_WORKER
7064 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
7066 /* Store incoming value to worker reduction buffer. */
7067 tree offset = gimple_call_arg (call, 5);
7068 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
7069 level == GOMP_DIM_VECTOR);
7070 tree ptr = make_ssa_name (TREE_TYPE (call));
7072 gimplify_assign (ptr, call, &seq);
7073 tree ref = build_simple_mem_ref (ptr);
7074 TREE_THIS_VOLATILE (ref) = 1;
7075 gimplify_assign (ref, var, &seq);
7078 if (lhs)
7079 gimplify_assign (lhs, var, &seq);
7081 pop_gimplify_context (NULL);
7082 gsi_replace_with_seq (&gsi, seq, true);
7085 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
7087 static void
7088 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
7090 gimple_stmt_iterator gsi = gsi_for_stmt (call);
7091 tree lhs = gimple_call_lhs (call);
7092 tree var = gimple_call_arg (call, 2);
7093 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7094 enum tree_code rcode
7095 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
7096 tree init = omp_reduction_init_op (gimple_location (call), rcode,
7097 TREE_TYPE (var));
7098 gimple_seq seq = NULL;
7100 push_gimplify_context (true);
7102 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
7104 /* Initialize the vector's non-zero lanes to INIT_VAL (OP). */
7105 tree tid = make_ssa_name (integer_type_node);
7106 tree dim_vector = gimple_call_arg (call, 3);
7107 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
7108 dim_vector);
7109 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
7110 NULL_TREE, NULL_TREE);
7112 gimple_call_set_lhs (tid_call, tid);
7113 gimple_seq_add_stmt (&seq, tid_call);
7114 gimple_seq_add_stmt (&seq, cond_stmt);
7116 /* Split the block just after the call. */
7117 edge init_edge = split_block (gsi_bb (gsi), call);
7118 basic_block init_bb = init_edge->dest;
7119 basic_block call_bb = init_edge->src;
7121 /* Fixup flags from call_bb to init_bb. */
7122 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
7123 init_edge->probability = profile_probability::even ();
7125 /* Set the initialization stmts. */
7126 gimple_seq init_seq = NULL;
7127 tree init_var = make_ssa_name (TREE_TYPE (var));
7128 gimplify_assign (init_var, init, &init_seq);
7129 gsi = gsi_start_bb (init_bb);
7130 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
7132 /* Split block just after the init stmt. */
7133 gsi_prev (&gsi);
7134 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
7135 basic_block dst_bb = inited_edge->dest;
7137 /* Create false edge from call_bb to dst_bb. */
7138 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
7139 nop_edge->probability = profile_probability::even ();
7141 /* Create phi node in dst block. */
7142 gphi *phi = create_phi_node (lhs, dst_bb);
7143 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
7144 add_phi_arg (phi, var, nop_edge, gimple_location (call));
7146 /* Reset dominator of dst bb. */
7147 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
7149 /* Reset the gsi. */
7150 gsi = gsi_for_stmt (call);
7152 else
7154 if (level == GOMP_DIM_GANG)
7156 /* If there's no receiver object, propagate the incoming VAR. */
7157 tree ref_to_res = gimple_call_arg (call, 1);
7158 if (integer_zerop (ref_to_res))
7159 init = var;
7162 if (lhs != NULL_TREE)
7163 gimplify_assign (lhs, init, &seq);
7166 pop_gimplify_context (NULL);
7167 gsi_replace_with_seq (&gsi, seq, true);
7170 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
7172 static void
7173 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
7175 gimple_stmt_iterator gsi = gsi_for_stmt (call);
7176 tree lhs = gimple_call_lhs (call);
7177 tree ref_to_res = gimple_call_arg (call, 1);
7178 tree var = gimple_call_arg (call, 2);
7179 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7180 enum tree_code op
7181 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
7182 gimple_seq seq = NULL;
7183 tree r = NULL_TREE;
7185 push_gimplify_context (true);
7187 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
7189 /* Emit binary shuffle tree. TODO: emit this as an actual loop,
7190 but that requires a method of emitting a unified jump at the
7191 gimple level. */
7192 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
7194 tree other_var = make_ssa_name (TREE_TYPE (var));
7195 nvptx_generate_vector_shuffle (gimple_location (call),
7196 other_var, var, shfl, &seq);
7198 r = make_ssa_name (TREE_TYPE (var));
7199 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
7200 var, other_var), &seq);
7201 var = r;
7204 else
7206 tree accum = NULL_TREE;
7208 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
7210 /* Get reduction buffer address. */
7211 tree offset = gimple_call_arg (call, 5);
7212 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
7213 level == GOMP_DIM_VECTOR);
7214 tree ptr = make_ssa_name (TREE_TYPE (call));
7216 gimplify_assign (ptr, call, &seq);
7217 accum = ptr;
7219 else if (integer_zerop (ref_to_res))
7220 r = var;
7221 else
7222 accum = ref_to_res;
7224 if (accum)
7226 /* UPDATE the accumulator. */
7227 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
7228 seq = NULL;
7229 r = nvptx_reduction_update (gimple_location (call), &gsi,
7230 accum, var, op, level);
7234 if (lhs)
7235 gimplify_assign (lhs, r, &seq);
7236 pop_gimplify_context (NULL);
7238 gsi_replace_with_seq (&gsi, seq, true);
7241 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
7243 static void
7244 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
7246 gimple_stmt_iterator gsi = gsi_for_stmt (call);
7247 tree lhs = gimple_call_lhs (call);
7248 tree var = gimple_call_arg (call, 2);
7249 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7250 gimple_seq seq = NULL;
7252 push_gimplify_context (true);
7253 if (level == GOMP_DIM_WORKER
7254 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
7256 /* Read the worker reduction buffer. */
7257 tree offset = gimple_call_arg (call, 5);
7258 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
7259 level == GOMP_DIM_VECTOR);
7260 tree ptr = make_ssa_name (TREE_TYPE (call));
7262 gimplify_assign (ptr, call, &seq);
7263 var = build_simple_mem_ref (ptr);
7264 TREE_THIS_VOLATILE (var) = 1;
7267 if (level != GOMP_DIM_GANG)
7269 /* Write to the receiver object. */
7270 tree ref_to_res = gimple_call_arg (call, 1);
7272 if (!integer_zerop (ref_to_res))
7273 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
7276 if (lhs)
7277 gimplify_assign (lhs, var, &seq);
7279 pop_gimplify_context (NULL);
7281 gsi_replace_with_seq (&gsi, seq, true);
7284 /* NVPTX reduction expander. */
7286 static void
7287 nvptx_goacc_reduction (gcall *call)
7289 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
7290 offload_attrs oa;
7292 populate_offload_attrs (&oa);
7294 switch (code)
7296 case IFN_GOACC_REDUCTION_SETUP:
7297 nvptx_goacc_reduction_setup (call, &oa);
7298 break;
7300 case IFN_GOACC_REDUCTION_INIT:
7301 nvptx_goacc_reduction_init (call, &oa);
7302 break;
7304 case IFN_GOACC_REDUCTION_FINI:
7305 nvptx_goacc_reduction_fini (call, &oa);
7306 break;
7308 case IFN_GOACC_REDUCTION_TEARDOWN:
7309 nvptx_goacc_reduction_teardown (call, &oa);
7310 break;
7312 default:
7313 gcc_unreachable ();
7317 static bool
7318 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
7319 rtx x ATTRIBUTE_UNUSED)
7321 return true;
7324 static bool
7325 nvptx_scalar_mode_supported_p (scalar_mode mode)
7327 if (nvptx_experimental && mode == HFmode && TARGET_SM53)
7328 return true;
7330 return default_scalar_mode_supported_p (mode);
7333 static bool
7334 nvptx_libgcc_floating_mode_supported_p (scalar_float_mode mode)
7336 if (nvptx_experimental && mode == HFmode && TARGET_SM53)
7337 return true;
7339 return default_libgcc_floating_mode_supported_p (mode);
7342 static bool
7343 nvptx_vector_mode_supported (machine_mode mode)
7345 return (mode == V2SImode
7346 || mode == V2DImode);
7349 /* Return the preferred mode for vectorizing scalar MODE. */
7351 static machine_mode
7352 nvptx_preferred_simd_mode (scalar_mode mode)
7354 switch (mode)
7356 case E_DImode:
7357 return V2DImode;
7358 case E_SImode:
7359 return V2SImode;
7361 default:
7362 return default_preferred_simd_mode (mode);
7366 unsigned int
7367 nvptx_data_alignment (const_tree type, unsigned int basic_align)
7369 if (TREE_CODE (type) == INTEGER_TYPE)
7371 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
7372 if (size == GET_MODE_SIZE (TImode))
7373 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
7376 return basic_align;
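/* E.g. a 16-byte integer type (TImode) is given the alignment of the mode
   it will be split into (DImode, i.e. 64 bits, per maybe_split_mode)
   rather than its natural 128-bit alignment.  */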
7379 /* Implement TARGET_MODES_TIEABLE_P. */
7381 static bool
7382 nvptx_modes_tieable_p (machine_mode, machine_mode)
7384 return false;
7387 /* Implement TARGET_HARD_REGNO_NREGS. */
7389 static unsigned int
7390 nvptx_hard_regno_nregs (unsigned int, machine_mode)
7392 return 1;
7395 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
7397 static bool
7398 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
7400 return false;
7403 /* Implement TARGET_TRULY_NOOP_TRUNCATION. */
7405 static bool
7406 nvptx_truly_noop_truncation (poly_uint64, poly_uint64)
7408 return false;
7411 /* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL. */
7413 static tree
7414 nvptx_goacc_adjust_private_decl (location_t loc, tree decl, int level)
7416 gcc_checking_assert (!lookup_attribute ("oacc gang-private",
7417 DECL_ATTRIBUTES (decl)));
7419 /* Set "oacc gang-private" attribute for gang-private variable
7420 declarations. */
7421 if (level == GOMP_DIM_GANG)
7423 tree id = get_identifier ("oacc gang-private");
7424 /* For later diagnostic purposes, pass LOC as VALUE (wrapped as a
7425 TREE). */
7426 tree loc_tree = build_empty_stmt (loc);
7427 DECL_ATTRIBUTES (decl)
7428 = tree_cons (id, loc_tree, DECL_ATTRIBUTES (decl));
7431 return decl;
7434 /* Implement TARGET_GOACC_EXPAND_VAR_DECL. */
7436 static rtx
7437 nvptx_goacc_expand_var_decl (tree var)
7439 /* Place "oacc gang-private" variables in shared memory. */
7440 if (tree attr = lookup_attribute ("oacc gang-private",
7441 DECL_ATTRIBUTES (var)))
7443 gcc_checking_assert (VAR_P (var));
7445 unsigned int offset, *poffset;
7446 poffset = gang_private_shared_hmap.get (var);
7447 if (poffset)
7448 offset = *poffset;
7449 else
7451 unsigned HOST_WIDE_INT align = DECL_ALIGN (var);
7452 gang_private_shared_size
7453 = (gang_private_shared_size + align - 1) & ~(align - 1);
7454 if (gang_private_shared_align < align)
7455 gang_private_shared_align = align;
7457 offset = gang_private_shared_size;
7458 bool existed = gang_private_shared_hmap.put (var, offset);
7459 gcc_checking_assert (!existed);
7460 gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var));
7462 location_t loc = EXPR_LOCATION (TREE_VALUE (attr));
7463 #if 0 /* For some reason, this doesn't work. */
7464 if (dump_enabled_p ())
7466 dump_flags_t l_dump_flags
7467 = get_openacc_privatization_dump_flags ();
7469 const dump_user_location_t d_u_loc
7470 = dump_user_location_t::from_location_t (loc);
7471 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
7472 #if __GNUC__ >= 10
7473 # pragma GCC diagnostic push
7474 # pragma GCC diagnostic ignored "-Wformat"
7475 #endif
7476 dump_printf_loc (l_dump_flags, d_u_loc,
7477 "variable %<%T%> adjusted for OpenACC"
7478 " privatization level: %qs\n",
7479 var, "gang");
7480 #if __GNUC__ >= 10
7481 # pragma GCC diagnostic pop
7482 #endif
7484 #else /* ..., thus emulate that, good enough for testsuite usage. */
7485 if (param_openacc_privatization != OPENACC_PRIVATIZATION_QUIET)
7486 inform (loc,
7487 "variable %qD adjusted for OpenACC privatization level:"
7488 " %qs",
7489 var, "gang");
7490 if (dump_file && (dump_flags & TDF_DETAILS))
7492 /* 'dumpfile.cc:dump_loc' */
7493 fprintf (dump_file, "%s:%d:%d: ", LOCATION_FILE (loc),
7494 LOCATION_LINE (loc), LOCATION_COLUMN (loc));
7495 fprintf (dump_file, "%s: ", "note");
7497 fprintf (dump_file,
7498 "variable '");
7499 print_generic_expr (dump_file, var, TDF_SLIM);
7500 fprintf (dump_file,
7501 "' adjusted for OpenACC privatization level: '%s'\n",
7502 "gang");
7504 #endif
7506 rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset);
7507 return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr);
7510 return NULL_RTX;
7513 static GTY(()) tree nvptx_previous_fndecl;
7515 static void
7516 nvptx_set_current_function (tree fndecl)
7518 if (!fndecl || fndecl == nvptx_previous_fndecl)
7519 return;
7521 gang_private_shared_hmap.empty ();
7522 nvptx_previous_fndecl = fndecl;
7523 vector_red_partition = 0;
7524 oacc_bcast_partition = 0;
7527 /* Implement TARGET_LIBC_HAS_FUNCTION. */
7529 bool
7530 nvptx_libc_has_function (enum function_class fn_class, tree type)
7532 if (fn_class == function_sincos)
7534 if (type != NULL_TREE)
7535 /* Currently, newlib does not support sincosl. */
7536 return type == float_type_node || type == double_type_node;
7537 else
7538 return true;
7541 return default_libc_has_function (fn_class, type);
7544 bool
7545 nvptx_mem_local_p (rtx mem)
7547 gcc_assert (GET_CODE (mem) == MEM);
7549 struct address_info info;
7550 decompose_mem_address (&info, mem);
7552 if (info.base != NULL && REG_P (*info.base)
7553 && REGNO_PTR_FRAME_P (REGNO (*info.base)))
7555 if (TARGET_SOFT_STACK)
7557 /* Frame-related doesn't mean local. */
7559 else
7560 return true;
7563 return false;
7566 /* Define locally, for use in NVPTX_ASM_OUTPUT_DEF. */
7567 #define SET_ASM_OP ".alias "
7569 /* Define locally, for use in nvptx_asm_output_def_from_decls. Add NVPTX_
7570 prefix to avoid clash with ASM_OUTPUT_DEF from nvptx.h.
7571 Copy of ASM_OUTPUT_DEF from defaults.h, with added terminating
7572 semicolon. */
7573 #define NVPTX_ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2) \
7574 do \
7576 fprintf ((FILE), "%s", SET_ASM_OP); \
7577 assemble_name (FILE, LABEL1); \
7578 fprintf (FILE, ","); \
7579 assemble_name (FILE, LABEL2); \
7580 fprintf (FILE, ";\n"); \
7582 while (0)
7584 void
7585 nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value)
7587 if (nvptx_alias == 0 || !TARGET_PTX_6_3)
7589 /* Copied from assemble_alias. */
7590 error_at (DECL_SOURCE_LOCATION (name),
7591 "alias definitions not supported in this configuration");
7592 TREE_ASM_WRITTEN (name) = 1;
7593 return;
7596 if (lookup_attribute ("weak", DECL_ATTRIBUTES (name)))
7598 /* Prevent execution FAILs for gcc.dg/globalalias.c and
7599 gcc.dg/pr77587.c. */
7600 error_at (DECL_SOURCE_LOCATION (name),
7601 "weak alias definitions not supported in this configuration");
7602 TREE_ASM_WRITTEN (name) = 1;
7603 return;
7606 /* PTX also doesn't support VALUE having weak linkage, but we can't detect
7607 that here, so we'll end up with:
7608 "error: Function test with .weak scope cannot be aliased".
7609 See gcc.dg/localalias.c. */
7611 if (TREE_CODE (name) != FUNCTION_DECL)
7613 error_at (DECL_SOURCE_LOCATION (name),
7614 "non-function alias definitions not supported"
7615 " in this configuration");
7616 TREE_ASM_WRITTEN (name) = 1;
7617 return;
7620 if (!cgraph_node::get (name)->referred_to_p ())
7621 /* Prevent "Internal error: reference to deleted section". */
7622 return;
7624 std::stringstream s;
7625 write_fn_proto (s, false, get_fnname_from_decl (name), name);
7626 fputs (s.str ().c_str (), stream);
7628 tree id = DECL_ASSEMBLER_NAME (name);
7629 NVPTX_ASM_OUTPUT_DEF (stream, IDENTIFIER_POINTER (id),
7630 IDENTIFIER_POINTER (value));
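/* A sketch of the output for 'int bar () __attribute__ ((alias ("foo")));',
   assuming aliases are enabled and the PTX version is new enough: first the
   prototype for the alias symbol written by write_fn_proto, then, per
   NVPTX_ASM_OUTPUT_DEF above, roughly

     .alias bar,foo;
*/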
7633 #undef NVPTX_ASM_OUTPUT_DEF
7634 #undef SET_ASM_OP
7636 #undef TARGET_OPTION_OVERRIDE
7637 #define TARGET_OPTION_OVERRIDE nvptx_option_override
7639 #undef TARGET_ATTRIBUTE_TABLE
7640 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
7642 #undef TARGET_LEGITIMATE_ADDRESS_P
7643 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
7645 #undef TARGET_PROMOTE_FUNCTION_MODE
7646 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
7648 #undef TARGET_FUNCTION_ARG
7649 #define TARGET_FUNCTION_ARG nvptx_function_arg
7650 #undef TARGET_FUNCTION_INCOMING_ARG
7651 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
7652 #undef TARGET_FUNCTION_ARG_ADVANCE
7653 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
7654 #undef TARGET_FUNCTION_ARG_BOUNDARY
7655 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
7656 #undef TARGET_PASS_BY_REFERENCE
7657 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
7658 #undef TARGET_FUNCTION_VALUE_REGNO_P
7659 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
7660 #undef TARGET_FUNCTION_VALUE
7661 #define TARGET_FUNCTION_VALUE nvptx_function_value
7662 #undef TARGET_LIBCALL_VALUE
7663 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
7664 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
7665 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
7666 #undef TARGET_GET_DRAP_RTX
7667 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
7668 #undef TARGET_SPLIT_COMPLEX_ARG
7669 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
7670 #undef TARGET_RETURN_IN_MEMORY
7671 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
7672 #undef TARGET_OMIT_STRUCT_RETURN_REG
7673 #define TARGET_OMIT_STRUCT_RETURN_REG true
7674 #undef TARGET_STRICT_ARGUMENT_NAMING
7675 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
7676 #undef TARGET_CALL_ARGS
7677 #define TARGET_CALL_ARGS nvptx_call_args
7678 #undef TARGET_END_CALL_ARGS
7679 #define TARGET_END_CALL_ARGS nvptx_end_call_args
7681 #undef TARGET_ASM_FILE_START
7682 #define TARGET_ASM_FILE_START nvptx_file_start
7683 #undef TARGET_ASM_FILE_END
7684 #define TARGET_ASM_FILE_END nvptx_file_end
7685 #undef TARGET_ASM_GLOBALIZE_LABEL
7686 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
7687 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
7688 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
7689 #undef TARGET_PRINT_OPERAND
7690 #define TARGET_PRINT_OPERAND nvptx_print_operand
7691 #undef TARGET_PRINT_OPERAND_ADDRESS
7692 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
7693 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
7694 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
7695 #undef TARGET_ASM_INTEGER
7696 #define TARGET_ASM_INTEGER nvptx_assemble_integer
7697 #undef TARGET_ASM_DECL_END
7698 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
7699 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
7700 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
7701 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
7702 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
7703 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
7704 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
7706 #undef TARGET_MACHINE_DEPENDENT_REORG
7707 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
7708 #undef TARGET_NO_REGISTER_ALLOCATION
7709 #define TARGET_NO_REGISTER_ALLOCATION true
7711 #undef TARGET_ENCODE_SECTION_INFO
7712 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
7713 #undef TARGET_RECORD_OFFLOAD_SYMBOL
7714 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
7716 #undef TARGET_VECTOR_ALIGNMENT
7717 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
7719 #undef TARGET_CANNOT_COPY_INSN_P
7720 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
7722 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
7723 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
7725 #undef TARGET_INIT_BUILTINS
7726 #define TARGET_INIT_BUILTINS nvptx_init_builtins
7727 #undef TARGET_EXPAND_BUILTIN
7728 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
7729 #undef TARGET_BUILTIN_DECL
7730 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
7732 #undef TARGET_SIMT_VF
7733 #define TARGET_SIMT_VF nvptx_simt_vf
7735 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
7736 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
7738 #undef TARGET_GOACC_VALIDATE_DIMS
7739 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
7741 #undef TARGET_GOACC_DIM_LIMIT
7742 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
7744 #undef TARGET_GOACC_FORK_JOIN
7745 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
7747 #undef TARGET_GOACC_REDUCTION
7748 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
7750 #undef TARGET_CANNOT_FORCE_CONST_MEM
7751 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
7753 #undef TARGET_SCALAR_MODE_SUPPORTED_P
7754 #define TARGET_SCALAR_MODE_SUPPORTED_P nvptx_scalar_mode_supported_p
7756 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
7757 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
7758 nvptx_libgcc_floating_mode_supported_p
7760 #undef TARGET_VECTOR_MODE_SUPPORTED_P
7761 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
7763 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
7764 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
7765 nvptx_preferred_simd_mode
7767 #undef TARGET_MODES_TIEABLE_P
7768 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
7770 #undef TARGET_HARD_REGNO_NREGS
7771 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
7773 #undef TARGET_CAN_CHANGE_MODE_CLASS
7774 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
7776 #undef TARGET_TRULY_NOOP_TRUNCATION
7777 #define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation
7779 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
7780 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
7782 #undef TARGET_GOACC_ADJUST_PRIVATE_DECL
7783 #define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl
7785 #undef TARGET_GOACC_EXPAND_VAR_DECL
7786 #define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl
7788 #undef TARGET_SET_CURRENT_FUNCTION
7789 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
7791 #undef TARGET_LIBC_HAS_FUNCTION
7792 #define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function
7794 #undef TARGET_HAVE_STRUB_SUPPORT_FOR
7795 #define TARGET_HAVE_STRUB_SUPPORT_FOR hook_bool_tree_false
7797 struct gcc_target targetm = TARGET_INITIALIZER;
7799 #include "gt-nvptx.h"