/* Target code for NVPTX.
   Copyright (C) 2014-2023 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "omp-offload.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"
#include "opts.h"
#include "tree-pretty-print.h"
#include "rtl-iter.h"
#include "cgraph.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

/* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
   block, which has had a maximum number of threads of 1024 since CUDA version
   2.x.  */
#define PTX_CTA_SIZE 1024

#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32

#define PTX_PER_CTA_BARRIER 0
#define PTX_NUM_PER_CTA_BARRIERS 1

#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)

#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

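/* For illustration, with the values above: a maximal CTA of
   PTX_CTA_SIZE == 1024 threads contains 1024 / PTX_WARP_SIZE == 32
   warps; the default OpenACC vector length is a single warp; and of
   the 16 CTA barriers, barrier 0 is reserved for whole-CTA use,
   leaving 15 per-worker barriers.  */
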
/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area) ((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		     & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Buffer needed for vector reductions, when vector_length >
   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
   array, as both may be live concurrently.  */
static unsigned vector_red_size;
static unsigned vector_red_align;
static unsigned vector_red_partition;
static GTY(()) rtx vector_red_sym;

/* Shared memory block for gang-private variables.  */
static unsigned gang_private_shared_size;
static unsigned gang_private_shared_align;
static GTY(()) rtx gang_private_shared_sym;
static hash_map<tree_decl_hash, unsigned int> gang_private_shared_hmap;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

static int nvptx_mach_max_workers ();

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with %<-fopenacc%>", optname);
}

static enum ptx_version
first_ptx_version_supporting_sm (enum ptx_isa sm)
{
  switch (sm)
    {
    case PTX_ISA_SM30:
      return PTX_VERSION_3_0;
    case PTX_ISA_SM35:
      return PTX_VERSION_3_1;
    case PTX_ISA_SM53:
      return PTX_VERSION_4_2;
    case PTX_ISA_SM70:
      return PTX_VERSION_6_0;
    case PTX_ISA_SM75:
      return PTX_VERSION_6_3;
    case PTX_ISA_SM80:
      return PTX_VERSION_7_0;
    default:
      gcc_unreachable ();
    }
}

static enum ptx_version
default_ptx_version_option (void)
{
  enum ptx_version first
    = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);

  /* Pick a version that supports the sm.  */
  enum ptx_version res = first;

  /* Pick at least 3.1.  This has been the smallest version historically.  */
  res = MAX (res, PTX_VERSION_3_1);

  /* Pick at least 6.0, to enable using bar.warp.sync to have a way to force
     warp convergence.  */
  res = MAX (res, PTX_VERSION_6_0);

  /* Verify that we pick a version that supports the sm.  */
  gcc_assert (first <= res);
  return res;
}

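/* For example, for -misa=sm_30 the first supporting version is 3.0,
   which the two MAX clauses above raise to 6.0; for -misa=sm_80 the
   first supporting version is already 7.0 and is left unchanged.  */
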
static const char *
ptx_version_to_string (enum ptx_version v)
{
  switch (v)
    {
    case PTX_VERSION_3_0:
      return "3.0";
    case PTX_VERSION_3_1:
      return "3.1";
    case PTX_VERSION_4_2:
      return "4.2";
    case PTX_VERSION_6_0:
      return "6.0";
    case PTX_VERSION_6_3:
      return "6.3";
    case PTX_VERSION_7_0:
      return "7.0";
    default:
      gcc_unreachable ();
    }
}

unsigned int
ptx_version_to_number (enum ptx_version v, bool major_p)
{
  switch (v)
    {
    case PTX_VERSION_3_0:
      return major_p ? 3 : 0;
    case PTX_VERSION_3_1:
      return major_p ? 3 : 1;
    case PTX_VERSION_4_2:
      return major_p ? 4 : 2;
    case PTX_VERSION_6_0:
      return major_p ? 6 : 0;
    case PTX_VERSION_6_3:
      return major_p ? 6 : 3;
    case PTX_VERSION_7_0:
      return major_p ? 7 : 0;
    default:
      gcc_unreachable ();
    }
}

static const char *
sm_version_to_string (enum ptx_isa sm)
{
  switch (sm)
    {
#define NVPTX_SM(XX, SEP) \
    case PTX_ISA_SM ## XX: \
      return #XX;
#include "nvptx-sm.def"
#undef NVPTX_SM

    default:
      gcc_unreachable ();
    }
}

static void
handle_ptx_version_option (void)
{
  if (!OPTION_SET_P (ptx_version_option)
      || ptx_version_option == PTX_VERSION_default)
    {
      ptx_version_option = default_ptx_version_option ();
      return;
    }

  enum ptx_version first
    = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);

  if (ptx_version_option < first)
    error ("PTX version (%<-mptx%>) needs to be at least %s to support selected"
	   " %<-misa%> (sm_%s)", ptx_version_to_string (first),
	   sm_version_to_string ((enum ptx_isa) ptx_isa_option));
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Via nvptx 'OPTION_DEFAULT_SPECS', '-misa' always appears on the command
     line.  */
  gcc_checking_assert (OPTION_SET_P (ptx_isa_option));

  handle_ptx_version_option ();

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!OPTION_SET_P (flag_toplevel_reorder))
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!OPTION_SET_P (flag_no_common))
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  HOST_WIDE_INT patch_area_size, patch_area_entry;
  parse_and_check_patch_area (flag_patchable_function_entry, false,
			      &patch_area_size, &patch_area_entry);
  if (patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared");
  SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED);
  gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_HFmode:
      return ".f16";
    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}

/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

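/* For example:
     int counter __attribute__((shared));          -> DATA_AREA_SHARED
     static const int table[4] = {1, 2, 3, 4};     -> DATA_AREA_CONST
     int global_counter;                           -> DATA_AREA_GLOBAL
   The area is stashed in the SYMBOL_REF's flags and later turned into
   ".shared", ".const", ".global" etc. by section_for_sym below.  */
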
/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

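/* For example, a call to malloc in offloaded code is redirected to the
   libgcc wrapper __nvptx_malloc; all other names pass through
   unchanged.  */
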
/* Return NULL if NAME contains no dot.  Otherwise return a copy of NAME
   with the dots replaced with dollar signs.  */

static char *
nvptx_replace_dot (const char *name)
{
  if (strchr (name, '.') == NULL)
    return NULL;

  char *p = xstrdup (name);
  for (size_t i = 0; i < strlen (p); ++i)
    if (p[i] == '.')
      p[i] = '$';
  return p;
}

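/* For example, "foo.bar" becomes "foo$bar", since PTX identifiers may
   not contain dots; NULL is returned for a dot-free name so callers
   can avoid the copy.  */
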
/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}

/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}

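/* For example, SImode pseudo 42 prints as "%r42"; a TImode pseudo,
   split into two DImode halves (see maybe_split_mode), prints as the
   pair "{%r42$8,%r42$0}", each half tagged with its subreg byte
   offset.  */
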
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}

/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

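/* Examples: any struct or vector argument is passed in memory; a
   double _Complex argument is split and each DFmode half passed in a
   register; __int128 (TImode, wider than UNITS_PER_WORD on the 64-bit
   ABI) goes to memory; int and double stay in registers.  */
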
/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

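/* Consequently char and short arguments are widened to SImode
   (matching the QImode promotion in nvptx_ptx_type_from_mode), and a
   float passed to an unprototyped function is promoted to double, as
   in the K&R default argument promotions.  */
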
/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
{
  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  return gen_reg_rtx (arg.mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v,
			     const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (arg.mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  return pass_in_memory (arg.mode, arg.type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}

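/* For a first SImode argument this produces, in a prototype
   (FOR_REG < 0):
     " (.param.u32 %in_ar0"
   and in a function prologue (FOR_REG == 0):
     .reg.u32 %ar0;
     ld.param.u32 %ar0, [%in_ar0];
   i.e. the incoming .param is copied into an ordinary register.  */
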
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}

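/* Assuming the return register is named "%value" (its reg_names
   entry), an SImode return prints as "(.param.u32 %value_out) " in a
   prototype and as ".reg.u32 %value;" in a prologue.  */
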
/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

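/* For a public definition of foo this emits
     // BEGIN GLOBAL FUNCTION DEF: foo
   a helper comment used by the nvptx ld to partition the PTX text
   (see write_fn_proto below).  */
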
/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Helper function for write_fn_proto.  */

static void
write_fn_proto_1 (std::stringstream &s, bool is_defn,
		  const char *name, const_tree decl, bool force_public)
{
  if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL)
    write_fn_marker (s, is_defn, TREE_PUBLIC (decl) || force_public, name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl) || force_public)
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (argno < 2 && strcmp (name, "main") == 0)
    {
      if (argno == 0)
	argno = write_arg_type (s, -1, argno, integer_type_node, true);

      if (argno == 1)
	argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");
}

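/* Putting the pieces together, a declaration such as
     extern int add (int, int);
   is emitted roughly as (assuming the "%value" return register name):
     // BEGIN GLOBAL FUNCTION DECL: add
     .extern .func (.param.u32 %value_out) add (.param.u32 %in_ar0, .param.u32 %in_ar1);
   */
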
/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static void
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl, bool force_public=false)
{
  const char *replacement = nvptx_name_replacement (name);
  char *replaced_dots = NULL;
  if (replacement != name)
    name = replacement;
  else
    {
      replaced_dots = nvptx_replace_dot (name);
      if (replaced_dots)
	name = replaced_dots;
    }
  if (name[0] == '*')
    name++;

  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    write_fn_proto_1 (s, false, name, decl, force_public);

  write_fn_proto_1 (s, is_defn, name, decl, force_public);

  if (replaced_dots)
    XDELETE (replaced_dots);
}

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  char *replaced_dots = NULL;

  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;
  if (replaced_dots)
    XDELETE (replaced_dots);

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  : "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}

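/* For a 16-byte frame aligned to 8 on the 64-bit ABI, and assuming the
   frame pointer's reg_names entry is "%frame", this emits:
     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;
   so the frame lives in .local memory and %frame holds its
   generic-address-space alias.  */
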
/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
	       "// vector reduction buffer\n",
	       REGNO (cfun->machine->red_partition),
	       vector_red_partition);
    }
  /* Verify vector_red_size.  */
  gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
	      <= vector_red_size);
  fprintf (file, "\t}\n");
}

/* Emit code to initialize OpenACC worker broadcast and synchronization
   registers.  */

static void
nvptx_init_oacc_workers (FILE *file)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
	       "// vector broadcast offset\n",
	       REGNO (cfun->machine->bcast_partition),
	       oacc_bcast_partition);
    }
  /* Verify oacc_bcast_size.  */
  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
	      <= oacc_bcast_size);
  if (cfun->machine->sync_bar)
    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
	     "// vector synchronization barrier\n",
	     REGNO (cfun->machine->sync_bar));
  fprintf (file, "\t}\n");
}

/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      if (cfun->machine->unisimt_outside_simt_predicate)
	{
	  int pred_outside_simt
	    = REGNO (cfun->machine->unisimt_outside_simt_predicate);
	  fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, 0;\n",
		   pred_outside_simt, master);
	}
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}

/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn) (void *), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
		 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
\t.reg.u32 %r<3>;\n\
\t.reg.u" PS " %R<4>;\n\
\tmov.u32 %r0, %tid.y;\n\
\tmov.u32 %r1, " NTID_Y ";\n\
\tmov.u32 %r2, %ctaid.x;\n\
\tcvt.u" PS ".u32 %R1, %r0;\n\
\t" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
\tmov.u" PS " %R0, __nvptx_stacks;\n\
\t" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
\tld.param.u" PS " %R2, [%stack];\n\
\tld.param.u" PS " %R3, [%sz];\n\
\tadd.u" PS " %R2, %R2, %R3;\n\
\tmad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
\tst.shared.u" PS " [%R0], %R2;\n\
\tmov.u" PS " %R0, __nvptx_uni;\n\
\t" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
\tmov.u32 %r0, 0;\n\
\tst.shared.u32 [%R0], %r0;\n\
\tmov.u" PS " %R0, \0;\n\
\tld.param.u" PS " %R1, [%arg];\n\
\t{\n\
\t\t.param.u" PS " %P<2>;\n\
\t\tst.param.u" PS " [%P0], %R0;\n\
\t\tst.param.u" PS " [%P1], %R1;\n\
\t\tcall.uni gomp_nvptx_main, (%P0, %P1);\n\
\t}\n\
\tret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}

/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;
  bool force_public = false;

  /* For reverse-offload 'nohost' functions: In order to be collectable in
     '$offload_func_table', cf. mkoffload.cc, the function has to be visible.  */
  if (lookup_attribute ("omp target device_ancestor_nohost",
			DECL_ATTRIBUTES (decl)))
    force_public = true;
  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl, force_public);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Restore the vector reduction partition register, if necessary.
     FIXME: Find out when and why this is necessary, and fix it.  */
  if (cfun->machine->red_partition)
    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
      = cfun->machine->red_partition;

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
    nvptx_init_oacc_workers (file);
}

/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (cfun->machine->unisimt_outside_simt_predicate)
    {
      int pred_outside_simt
	= REGNO (cfun->machine->unisimt_outside_simt_predicate);
      fprintf (file, "\t\tmov.pred %%r%d, %d;\n", pred_outside_simt,
	       entering ? 0 : 1);
    }
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}

/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}

/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  */

const char *
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
  return "";
}

/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.  */

const char *
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
  return "";
}

/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode) cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
	     nvptx_ptx_type_from_mode (mode, false),
	     reg_names[NVPTX_RETURN_REGNUM],
	     reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}

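/* Assuming the "%value" return register name, a function returning int
   thus ends with
     st.param.u32 [%value_out], %value;
     ret;
   matching the "(.param.u32 %value_out)" declaration written by
   write_return_mode.  */
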
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  if (TARGET_SOFT_STACK && stack_realign_drap)
    return arg_pointer_rtx;
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
	{
	  cfun->machine->is_varadic = true;
	  cfun->machine->has_varadic = true;
	  cfun->machine->num_args++;
	}
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}

1827 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1828 track of whether calls involving static chains or varargs were seen
1829 in the current function.
1830 For libcalls, maintain a hash table of decls we have seen, and
1831 record a function decl for later when encountering a new one. */
1833 void
1834 nvptx_expand_call (rtx retval, rtx address)
1836 rtx callee = XEXP (address, 0);
1837 rtx varargs = NULL_RTX;
1838 unsigned parallel = 0;
1840 if (!call_insn_operand (callee, Pmode))
1842 callee = force_reg (Pmode, callee);
1843 address = change_address (address, QImode, callee);
1846 if (GET_CODE (callee) == SYMBOL_REF)
1848 tree decl = SYMBOL_REF_DECL (callee);
1849 if (decl != NULL_TREE)
1851 if (DECL_STATIC_CHAIN (decl))
1852 cfun->machine->has_chain = true;
1854 tree attr = oacc_get_fn_attrib (decl);
1855 if (attr)
1857 tree dims = TREE_VALUE (attr);
1859 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1860 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1862 if (TREE_PURPOSE (dims)
1863 && !integer_zerop (TREE_PURPOSE (dims)))
1864 break;
1865 /* Not on this axis. */
1866 parallel ^= GOMP_DIM_MASK (ix);
1867 dims = TREE_CHAIN (dims);
1873 unsigned nargs = cfun->machine->num_args;
1874 if (cfun->machine->is_varadic)
1876 varargs = gen_reg_rtx (Pmode);
1877 emit_move_insn (varargs, stack_pointer_rtx);
1880 rtvec vec = rtvec_alloc (nargs + 1);
1881 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1882 int vec_pos = 0;
1884 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1885 rtx tmp_retval = retval;
1886 if (retval)
1888 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1889 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1890 call = gen_rtx_SET (tmp_retval, call);
1892 XVECEXP (pat, 0, vec_pos++) = call;
1894 /* Construct the call insn, including a USE for each argument pseudo
1895 register. These will be used when printing the insn. */
1896 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1897 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1899 if (varargs)
1900 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1902 gcc_assert (vec_pos == XVECLEN (pat, 0));
1904 nvptx_emit_forking (parallel, true);
1905 emit_call_insn (pat);
1906 nvptx_emit_joining (parallel, true);
1908 if (tmp_retval != retval)
1909 emit_move_insn (retval, tmp_retval);
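/* For illustration, the pattern built above has roughly this shape
   (a sketch, not verbatim RTL dump output):

       (parallel [(set (reg retval)
                       (call (mem:QI (symbol_ref "callee")) (const_int 0)))
                  (use (reg arg0))
                  (use (reg arg1))])

   with one USE per argument pseudo and, for varargs calls, a final USE
   of the register holding the stack-pointer copy.  */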
1912 /* Emit a comparison COMPARE, and return the new test to be used in the
1913 jump. */
1916 nvptx_expand_compare (rtx compare)
1918 rtx pred = gen_reg_rtx (BImode);
1919 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1920 XEXP (compare, 0), XEXP (compare, 1));
1921 emit_insn (gen_rtx_SET (pred, cmp));
1922 return gen_rtx_NE (BImode, pred, const0_rtx);
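/* E.g. a (lt x y) comparison becomes a SET of a fresh BImode predicate
   register to (lt x y), and the test returned for the jump is
   (ne pred 0).  */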
1925 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1927 void
1928 nvptx_expand_oacc_fork (unsigned mode)
1930 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1933 void
1934 nvptx_expand_oacc_join (unsigned mode)
1936 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1939 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1940 objects. */
1942 static rtx
1943 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1945 rtx res;
1947 switch (GET_MODE (src))
1949 case E_DImode:
1950 res = gen_unpackdisi2 (dst0, dst1, src);
1951 break;
1952 case E_DFmode:
1953 res = gen_unpackdfsi2 (dst0, dst1, src);
1954 break;
1955 default: gcc_unreachable ();
1957 return res;
1960 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1961 object. */
1963 static rtx
1964 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1966 rtx res;
1968 switch (GET_MODE (dst))
1970 case E_DImode:
1971 res = gen_packsidi2 (dst, src0, src1);
1972 break;
1973 case E_DFmode:
1974 res = gen_packsidf2 (dst, src0, src1);
1975 break;
1976 default: gcc_unreachable ();
1978 return res;
1981 /* Generate an instruction or sequence to shuffle register SRC into DST,
1982 moving values between the lanes of a single warp per IDX and KIND. */
1985 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1987 rtx res;
1989 switch (GET_MODE (dst))
1991 case E_DCmode:
1992 case E_CDImode:
1994 gcc_assert (GET_CODE (dst) == CONCAT);
1995 gcc_assert (GET_CODE (src) == CONCAT);
1996 rtx dst_real = XEXP (dst, 0);
1997 rtx dst_imag = XEXP (dst, 1);
1998 rtx src_real = XEXP (src, 0);
1999 rtx src_imag = XEXP (src, 1);
2001 start_sequence ();
2002 emit_insn (nvptx_gen_shuffle (dst_real, src_real, idx, kind));
2003 emit_insn (nvptx_gen_shuffle (dst_imag, src_imag, idx, kind));
2004 res = get_insns ();
2005 end_sequence ();
2007 break;
2008 case E_SImode:
2009 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
2010 break;
2011 case E_SFmode:
2012 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
2013 break;
2014 case E_DImode:
2015 case E_DFmode:
2017 rtx tmp0 = gen_reg_rtx (SImode);
2018 rtx tmp1 = gen_reg_rtx (SImode);
2020 start_sequence ();
2021 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
2022 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2023 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2024 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
2025 res = get_insns ();
2026 end_sequence ();
2028 break;
2029 case E_V2SImode:
2031 rtx src0 = gen_rtx_SUBREG (SImode, src, 0);
2032 rtx src1 = gen_rtx_SUBREG (SImode, src, 4);
2033 rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0);
2034 rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4);
2035 rtx tmp0 = gen_reg_rtx (SImode);
2036 rtx tmp1 = gen_reg_rtx (SImode);
2037 start_sequence ();
2038 emit_insn (gen_movsi (tmp0, src0));
2039 emit_insn (gen_movsi (tmp1, src1));
2040 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2041 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2042 emit_insn (gen_movsi (dst0, tmp0));
2043 emit_insn (gen_movsi (dst1, tmp1));
2044 res = get_insns ();
2045 end_sequence ();
2047 break;
2048 case E_V2DImode:
2050 rtx src0 = gen_rtx_SUBREG (DImode, src, 0);
2051 rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
2052 rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
2053 rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
2054 rtx tmp0 = gen_reg_rtx (DImode);
2055 rtx tmp1 = gen_reg_rtx (DImode);
2056 start_sequence ();
2057 emit_insn (gen_movdi (tmp0, src0));
2058 emit_insn (gen_movdi (tmp1, src1));
2059 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2060 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2061 emit_insn (gen_movdi (dst0, tmp0));
2062 emit_insn (gen_movdi (dst1, tmp1));
2063 res = get_insns ();
2064 end_sequence ();
2066 break;
2067 case E_BImode:
2069 rtx tmp = gen_reg_rtx (SImode);
2071 start_sequence ();
2072 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
2073 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
2074 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
2075 res = get_insns ();
2076 end_sequence ();
2078 break;
2079 case E_QImode:
2080 case E_HImode:
2082 rtx tmp = gen_reg_rtx (SImode);
2084 start_sequence ();
2085 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
2086 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
2087 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
2088 tmp)));
2089 res = get_insns ();
2090 end_sequence ();
2092 break;
2094 default:
2095 gcc_unreachable ();
2097 return res;
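/* For illustration: an SImode SHUFFLE_IDX with index 0 is later printed
   as something like (a sketch; the exact mnemonic depends on the PTX ISA
   version targeted)

       shfl.idx.b32       %r_dst, %r_src, 0, 31;
   or
       shfl.sync.idx.b32  %r_dst, %r_src, 0, 31, 0xffffffff;

   reading %r_src from lane 0 of the warp into every lane's %r_dst.  */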
2100 /* Generate an instruction or sequence to broadcast register REG
2101 across the vectors of a single warp. */
2103 static rtx
2104 nvptx_gen_warp_bcast (rtx reg)
2106 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
2109 /* Structure used when generating a worker-level spill or fill. */
2111 struct broadcast_data_t
2113 rtx base; /* Register holding base addr of buffer. */
2114 rtx ptr; /* Iteration var, if needed. */
2115 unsigned offset; /* Offset into worker buffer. */
2118 /* Direction of the spill/fill and looping setup/teardown indicator. */
2120 enum propagate_mask
2122 PM_read = 1 << 0,
2123 PM_write = 1 << 1,
2124 PM_loop_begin = 1 << 2,
2125 PM_loop_end = 1 << 3,
2127 PM_read_write = PM_read | PM_write
2130 /* Generate instruction(s) to spill or fill register REG to/from the
2131 worker broadcast array. PM indicates what is to be done, REP
2132 how many loop iterations will be executed (0 for not a loop). */
2134 static rtx
2135 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
2136 broadcast_data_t *data, bool vector)
2138 rtx res;
2139 machine_mode mode = GET_MODE (reg);
2141 switch (mode)
2143 case E_BImode:
2145 rtx tmp = gen_reg_rtx (SImode);
2147 start_sequence ();
2148 if (pm & PM_read)
2149 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
2150 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
2151 if (pm & PM_write)
2152 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
2153 res = get_insns ();
2154 end_sequence ();
2156 break;
2158 default:
2160 rtx addr = data->ptr;
2162 if (!addr)
2164 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
2166 oacc_bcast_align = MAX (oacc_bcast_align, align);
2167 data->offset = ROUND_UP (data->offset, align);
2168 addr = data->base;
2169 gcc_assert (data->base != NULL);
2170 if (data->offset)
2171 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
2174 addr = gen_rtx_MEM (mode, addr);
2175 if (pm == PM_read)
2176 res = gen_rtx_SET (addr, reg);
2177 else if (pm == PM_write)
2178 res = gen_rtx_SET (reg, addr);
2179 else
2180 gcc_unreachable ();
2182 if (data->ptr)
2184 /* We're using a ptr, increment it. */
2185 start_sequence ();
2187 emit_insn (res);
2188 emit_insn (gen_adddi3 (data->ptr, data->ptr,
2189 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
2190 res = get_insns ();
2191 end_sequence ();
2193 else
2194 rep = 1;
2195 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
2197 break;
2199 return res;
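/* For illustration: propagating an SImode register between threads via
   the shared broadcast buffer amounts to (a sketch; the buffer symbol
   and the intervening barrier are handled elsewhere)

       st.shared.u32  [%bcast_base + offset], %r;   // PM_read side
       ...
       ld.shared.u32  %r, [%bcast_base + offset];   // PM_write side

   i.e. the SET rtxes built above become the store and load halves.  */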
2202 /* Returns true if X is a valid address for use in a memory reference. */
2204 static bool
2205 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
2207 enum rtx_code code = GET_CODE (x);
2209 switch (code)
2211 case REG:
2212 return true;
2214 case PLUS:
2215 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
2216 return true;
2217 return false;
2219 case CONST:
2220 case SYMBOL_REF:
2221 case LABEL_REF:
2222 return true;
2224 default:
2225 return false;
2229 /* Machinery to output constant initializers. When beginning an
2230 initializer, we decide on a fragment size (which is visible in ptx
2231 in the type used), and then all initializer data is buffered until
2232 a fragment is filled and ready to be written out. */
2234 static struct
2236 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
2237 unsigned HOST_WIDE_INT val; /* Current fragment value. */
2238 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
2239 out. */
2240 unsigned size; /* Fragment size to accumulate. */
2241 unsigned offset; /* Offset within current fragment. */
2242 bool started; /* Whether we've output any initializer. */
2243 } init_frag;
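/* For example (a sketch): with init_frag.size == 4, assembling the bytes
   0x11, 0x22, 0x33, 0x44 accumulates val == 0x44332211, which
   output_init_frag then flushes as one (decimal) entry of the
   " = { ..., ... }" initializer list.  */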
2245 /* The current fragment is full, write it out. SYM may provide a
2246 symbolic reference we should output, in which case the fragment
2247 value is the addend. */
2249 static void
2250 output_init_frag (rtx sym)
2252 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
2253 unsigned HOST_WIDE_INT val = init_frag.val;
2255 init_frag.started = true;
2256 init_frag.val = 0;
2257 init_frag.offset = 0;
2258 init_frag.remaining--;
2260 if (sym)
2262 bool function = (SYMBOL_REF_DECL (sym)
2263 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
2264 if (!function)
2265 fprintf (asm_out_file, "generic(");
2266 output_address (VOIDmode, sym);
2267 if (!function)
2268 fprintf (asm_out_file, ")");
2269 if (val)
2270 fprintf (asm_out_file, " + ");
2273 if (!sym || val)
2274 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2277 /* Add value VAL of size SIZE to the data we're emitting, and keep
2278 writing out chunks as they fill up. */
2280 static void
2281 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2283 bool negative_p
2284 = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1));
2286 /* Avoid undefined behaviour. */
2287 if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT)
2288 val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1;
2290 for (unsigned part = 0; size; size -= part)
2292 if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT)
2293 /* Avoid undefined behaviour. */
2294 val = negative_p ? -1 : 0;
2295 else
2296 val >>= (part * BITS_PER_UNIT);
2297 part = init_frag.size - init_frag.offset;
2298 part = MIN (part, size);
2300 unsigned HOST_WIDE_INT partial
2301 = val << (init_frag.offset * BITS_PER_UNIT);
2302 init_frag.val |= partial & init_frag.mask;
2303 init_frag.offset += part;
2305 if (init_frag.offset == init_frag.size)
2306 output_init_frag (NULL);
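/* For example (a sketch): nvptx_assemble_value (0x1234, 2) on an empty
   4-byte fragment computes part == 2, ORs (0x1234 << 0) into
   init_frag.val and advances init_frag.offset to 2; the fragment is
   only written out once two further bytes arrive.  */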
2310 /* Target hook for assembling integer object X of size SIZE. */
2312 static bool
2313 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2315 HOST_WIDE_INT val = 0;
2317 switch (GET_CODE (x))
2319 default:
2320 /* Let the generic machinery figure it out, usually for a
2321 CONST_WIDE_INT. */
2322 return false;
2324 case CONST_INT:
2325 nvptx_assemble_value (INTVAL (x), size);
2326 break;
2328 case CONST:
2329 x = XEXP (x, 0);
2330 gcc_assert (GET_CODE (x) == PLUS);
2331 val = INTVAL (XEXP (x, 1));
2332 x = XEXP (x, 0);
2333 gcc_assert (GET_CODE (x) == SYMBOL_REF);
2334 gcc_fallthrough (); /* FALLTHROUGH */
2336 case SYMBOL_REF:
2337 gcc_assert (size == init_frag.size);
2338 if (init_frag.offset)
2339 sorry ("cannot emit unaligned pointers in ptx assembly");
2341 nvptx_maybe_record_fnsym (x);
2342 init_frag.val = val;
2343 output_init_frag (x);
2344 break;
2347 return true;
2350 /* Output SIZE zero bytes. We ignore the FILE argument since the
2351 functions we're calling to perform the output just use
2352 asm_out_file. */
2354 void
2355 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2357 /* Finish the current fragment, if it's started. */
2358 if (init_frag.offset)
2360 unsigned part = init_frag.size - init_frag.offset;
2361 part = MIN (part, (unsigned)size);
2362 size -= part;
2363 nvptx_assemble_value (0, part);
2366 /* If this skip doesn't terminate the initializer, write as many
2367 remaining pieces as possible directly. */
2368 if (size < init_frag.remaining * init_frag.size)
2370 while (size >= init_frag.size)
2372 size -= init_frag.size;
2373 output_init_frag (NULL_RTX);
2375 if (size)
2376 nvptx_assemble_value (0, size);
2380 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2381 ignore the FILE arg. */
2383 void
2384 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2386 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2387 nvptx_assemble_value (str[i], 1);
2390 /* Return true if TYPE is a record type whose last field is an array of
2391 unspecified length, i.e. a flexible array member. */
2393 static bool
2394 flexible_array_member_type_p (const_tree type)
2396 if (TREE_CODE (type) != RECORD_TYPE)
2397 return false;
2399 const_tree last_field = NULL_TREE;
2400 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2401 last_field = f;
2403 if (!last_field)
2404 return false;
2406 const_tree last_field_type = TREE_TYPE (last_field);
2407 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2408 return false;
2410 return (! TYPE_DOMAIN (last_field_type)
2411 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
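/* For example, this matches

       struct s { int n; char data[]; };

   whose trailing array type has no domain (or no upper bound).  */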
2414 /* Emit a PTX variable decl and prepare for emission of its
2415 initializer. NAME is the symbol name and SECTION the PTX data
2416 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2417 The caller has already emitted any indentation and linkage
2418 specifier. It is responsible for any initializer, terminating ;
2419 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2420 this is the opposite way round from how PTX wants them! */
2422 static void
2423 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2424 const_tree type, HOST_WIDE_INT size, unsigned align,
2425 bool undefined = false)
2427 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2428 && (TYPE_DOMAIN (type) == NULL_TREE);
2430 if (undefined && flexible_array_member_type_p (type))
2432 size = 0;
2433 atype = true;
2436 while (TREE_CODE (type) == ARRAY_TYPE)
2437 type = TREE_TYPE (type);
2439 if (TREE_CODE (type) == VECTOR_TYPE
2440 || TREE_CODE (type) == COMPLEX_TYPE)
2441 /* Neither vector nor complex types can contain the other. */
2442 type = TREE_TYPE (type);
2444 unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type);
2446 /* Largest mode we're prepared to accept. For BLKmode types we
2447 don't know if it'll contain pointer constants, so have to choose
2448 pointer size, otherwise we can choose DImode. */
2449 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2451 elt_size |= GET_MODE_SIZE (elt_mode);
2452 elt_size &= -elt_size; /* Extract LSB set. */
2454 init_frag.size = elt_size;
2455 /* Avoid undefined shift behavior by using '2'. */
2456 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2457 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2458 init_frag.val = 0;
2459 init_frag.offset = 0;
2460 init_frag.started = false;
2461 /* Size might not be a multiple of elt size, if there's an
2462 initialized trailing struct array with smaller type than
2463 elt_size. */
2464 init_frag.remaining = (size + elt_size - 1) / elt_size;
2466 fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ",
2467 section, align / BITS_PER_UNIT,
2468 elt_size * BITS_PER_UNIT);
2469 assemble_name (file, name);
2471 if (size)
2472 /* We make everything an array, to simplify any initialization
2473 emission. */
2474 fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining);
2475 else if (atype)
2476 fprintf (file, "[]");
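/* For illustration: for `int arr[3]' in the global data area this emits
   something like (a sketch; linkage, initializer and terminating ';'
   come from the callers)

       .global .align 4 .u32 arr[3]

   -- everything is emitted as an array of init_frag.remaining fragments
   of init_frag.size bytes each.  */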
2479 /* Called when the initializer for a decl has been completely output through
2480 combinations of the three functions above. */
2482 static void
2483 nvptx_assemble_decl_end (void)
2485 if (init_frag.offset)
2486 /* This can happen with a packed struct with trailing array member. */
2487 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2488 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2491 /* Output an uninitialized common or file-scope variable. */
2493 void
2494 nvptx_output_aligned_decl (FILE *file, const char *name,
2495 const_tree decl, HOST_WIDE_INT size, unsigned align)
2497 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2499 /* If this is public, it is common. The nearest thing we have to
2500 common is weak. */
2501 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2503 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2504 TREE_TYPE (decl), size, align);
2505 nvptx_assemble_decl_end ();
2508 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2509 writing a constant variable EXP with NAME and SIZE and its
2510 initializer to FILE. */
2512 static void
2513 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2514 const_tree exp, HOST_WIDE_INT obj_size)
2516 write_var_marker (file, true, false, name);
2518 fprintf (file, "\t");
2520 tree type = TREE_TYPE (exp);
2521 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2522 TYPE_ALIGN (type));
2525 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2526 a variable DECL with NAME to FILE. */
2528 void
2529 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2531 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2533 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2534 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2536 tree type = TREE_TYPE (decl);
2537 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2538 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2539 type, obj_size, DECL_ALIGN (decl));
2542 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2544 static void
2545 nvptx_globalize_label (FILE *, const char *)
2549 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2550 declaration only for variable DECL with NAME to FILE. */
2552 static void
2553 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2555 /* The middle end can place constant pool decls into the varpool as
2556 undefined. Until that is fixed, catch the problem here. */
2557 if (DECL_IN_CONSTANT_POOL (decl))
2558 return;
2560 /* We support weak definitions, and hence have the right
2561 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2562 if (DECL_WEAK (decl))
2563 error_at (DECL_SOURCE_LOCATION (decl),
2564 "PTX does not support weak declarations"
2565 " (only weak definitions)");
2566 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2568 fprintf (file, "\t.extern ");
2569 tree size = DECL_SIZE_UNIT (decl);
2570 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2571 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2572 DECL_ALIGN (decl), true);
2573 nvptx_assemble_decl_end ();
2576 /* Output a pattern for a move instruction. */
2578 const char *
2579 nvptx_output_mov_insn (rtx dst, rtx src)
2581 machine_mode dst_mode = GET_MODE (dst);
2582 machine_mode src_mode = GET_MODE (src);
2583 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2584 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2585 machine_mode src_inner = (GET_CODE (src) == SUBREG
2586 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2588 rtx sym = src;
2589 if (GET_CODE (sym) == CONST)
2590 sym = XEXP (XEXP (sym, 0), 0);
2591 if (SYMBOL_REF_P (sym))
2593 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2594 return "%.\tcvta%D1%t0\t%0, %1;";
2595 nvptx_maybe_record_fnsym (sym);
2598 if (src_inner == dst_inner)
2599 return "%.\tmov%t0\t%0, %1;";
2601 if (CONSTANT_P (src))
2602 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2603 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2604 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2606 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2608 if (GET_MODE_BITSIZE (dst_mode) == 128
2609 && GET_MODE_BITSIZE (src_mode) == 128)
2611 /* mov.b128 is not supported. */
2612 if (dst_inner == V2DImode && src_inner == TImode)
2613 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2614 else if (dst_inner == TImode && src_inner == V2DImode)
2615 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2617 gcc_unreachable ();
2619 return "%.\tmov.b%T0\t%0, %1;";
2622 if (GET_MODE_BITSIZE (src_inner) == 128
2623 && GET_MODE_BITSIZE (src_mode) == 64)
2624 return "%.\tmov.b%T0\t%0, %1;";
2626 return "%.\tcvt%t0%t1\t%0, %1;";
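/* For illustration: since mov.b128 does not exist, a TImode/V2DImode copy
   is split by the templates above into two 64-bit moves, roughly

       mov.u64 %r_dst.x, %r_src_lo;
       mov.u64 %r_dst.y, %r_src_hi;

   (a sketch of what the "%L1"/"%H1" operand codes expand to).  */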
2629 /* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL. */
2631 static void
2632 nvptx_output_barrier (rtx *mem_operand, int memmodel, bool pre_p)
2634 bool post_p = !pre_p;
2636 switch (memmodel)
2638 case MEMMODEL_RELAXED:
2639 return;
2640 case MEMMODEL_CONSUME:
2641 case MEMMODEL_ACQUIRE:
2642 case MEMMODEL_SYNC_ACQUIRE:
2643 if (post_p)
2644 break;
2645 return;
2646 case MEMMODEL_RELEASE:
2647 case MEMMODEL_SYNC_RELEASE:
2648 if (pre_p)
2649 break;
2650 return;
2651 case MEMMODEL_ACQ_REL:
2652 case MEMMODEL_SEQ_CST:
2653 case MEMMODEL_SYNC_SEQ_CST:
2654 if (pre_p || post_p)
2655 break;
2656 return;
2657 default:
2658 gcc_unreachable ();
2661 output_asm_insn ("%.\tmembar%B0;", mem_operand);
2664 const char *
2665 nvptx_output_atomic_insn (const char *asm_template, rtx *operands, int mem_pos,
2666 int memmodel_pos)
2668 nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
2669 true);
2670 output_asm_insn (asm_template, operands);
2671 nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
2672 false);
2673 return "";
2676 static void nvptx_print_operand (FILE *, rtx, int);
2678 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2679 involves writing .param declarations and in/out copies into them. For
2680 indirect calls, also write the .callprototype. */
2682 const char *
2683 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2685 char buf[16];
2686 static int labelno;
2687 bool needs_tgt = register_operand (callee, Pmode);
2688 rtx pat = PATTERN (insn);
2689 if (GET_CODE (pat) == COND_EXEC)
2690 pat = COND_EXEC_CODE (pat);
2691 int arg_end = XVECLEN (pat, 0);
2692 tree decl = NULL_TREE;
2694 fprintf (asm_out_file, "\t{\n");
2695 if (result != NULL)
2696 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2697 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2698 reg_names[NVPTX_RETURN_REGNUM]);
2700 /* Ensure we have a ptx declaration in the output if necessary. */
2701 if (GET_CODE (callee) == SYMBOL_REF)
2703 decl = SYMBOL_REF_DECL (callee);
2704 if (!decl
2705 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2706 nvptx_record_libfunc (callee, result, pat);
2707 else if (DECL_EXTERNAL (decl))
2708 nvptx_record_fndecl (decl);
2711 if (needs_tgt)
2713 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2714 labelno++;
2715 ASM_OUTPUT_LABEL (asm_out_file, buf);
2716 std::stringstream s;
2717 write_fn_proto_from_insn (s, NULL, result, pat);
2718 fputs (s.str().c_str(), asm_out_file);
2721 for (int argno = 1; argno < arg_end; argno++)
2723 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2724 machine_mode mode = GET_MODE (t);
2725 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2727 /* Mode splitting has already been done. */
2728 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2729 "\t\tst.param%s [%%out_arg%d], ",
2730 ptx_type, argno, ptx_type, argno);
2731 output_reg (asm_out_file, REGNO (t), VOIDmode);
2732 fprintf (asm_out_file, ";\n");
2735 /* The '.' stands for the call's predicate, if any. */
2736 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2737 fprintf (asm_out_file, "\t\tcall ");
2738 if (result != NULL_RTX)
2739 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2741 if (decl)
2743 char *replaced_dots = NULL;
2744 const char *name = get_fnname_from_decl (decl);
2745 const char *replacement = nvptx_name_replacement (name);
2746 if (replacement != name)
2747 name = replacement;
2748 else
2750 replaced_dots = nvptx_replace_dot (name);
2751 if (replaced_dots)
2752 name = replaced_dots;
2754 assemble_name (asm_out_file, name);
2755 if (replaced_dots)
2756 XDELETE (replaced_dots);
2758 else
2759 output_address (VOIDmode, callee);
2761 const char *open = "(";
2762 for (int argno = 1; argno < arg_end; argno++)
2764 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2765 open = "";
2767 if (decl && DECL_STATIC_CHAIN (decl))
2769 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2770 open = "";
2772 if (!open[0])
2773 fprintf (asm_out_file, ")");
2775 if (needs_tgt)
2777 fprintf (asm_out_file, ", ");
2778 assemble_name (asm_out_file, buf);
2780 fprintf (asm_out_file, ";\n");
2782 if (find_reg_note (insn, REG_NORETURN, NULL))
2784 /* No return functions confuse the PTX JIT, as it doesn't realize
2785 the flow control barrier they imply. It can seg fault if it
2786 encounters what looks like an unexitable loop. Emit a trailing
2787 trap and exit, which it does grok. */
2788 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2789 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2792 if (result)
2794 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2796 if (!rval[0])
2797 /* We must escape the '%' that starts RETURN_REGNUM. */
2798 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2799 reg_names[NVPTX_RETURN_REGNUM]);
2800 return rval;
2803 return "}";
2806 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2808 static bool
2809 nvptx_print_operand_punct_valid_p (unsigned char c)
2811 return c == '.' || c == '#';
2814 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2816 static void
2817 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2819 rtx off;
2820 if (GET_CODE (x) == CONST)
2821 x = XEXP (x, 0);
2822 switch (GET_CODE (x))
2824 case PLUS:
2825 off = XEXP (x, 1);
2826 output_address (VOIDmode, XEXP (x, 0));
2827 fprintf (file, "+");
2828 output_address (VOIDmode, off);
2829 break;
2831 case SYMBOL_REF:
2832 case LABEL_REF:
2833 output_addr_const (file, x);
2834 break;
2836 default:
2837 gcc_assert (GET_CODE (x) != MEM);
2838 nvptx_print_operand (file, x, 0);
2839 break;
2843 /* Write assembly language output for the address ADDR to FILE. */
2845 static void
2846 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2848 nvptx_print_address_operand (file, addr, mode);
2851 static nvptx_data_area
2852 nvptx_mem_data_area (const_rtx x)
2854 gcc_assert (GET_CODE (x) == MEM);
2856 const_rtx addr = XEXP (x, 0);
2857 subrtx_iterator::array_type array;
2858 FOR_EACH_SUBRTX (iter, array, addr, ALL)
2859 if (SYMBOL_REF_P (*iter))
2860 return SYMBOL_DATA_AREA (*iter);
2862 return DATA_AREA_GENERIC;
2865 bool
2866 nvptx_mem_maybe_shared_p (const_rtx x)
2868 nvptx_data_area area = nvptx_mem_data_area (x);
2869 return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC;
2872 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2874 Meaning of CODE:
2875 . -- print the predicate for the instruction or an empty string for an
2876 unconditional one.
2877 # -- print a rounding mode for the instruction
2879 A -- print a data area for a MEM
2880 c -- print an opcode suffix for a comparison operator, including a type code
2881 D -- print a data area for a MEM operand
2882 S -- print a shuffle kind specified by CONST_INT
2883 t -- print a type opcode suffix, promoting QImode to 32 bits
2884 T -- print a type size in bits
2885 u -- print a type opcode suffix without promotions.
2886 p -- print a '!' for constant 0.
2887 x -- print a destination operand that may also be a bit bucket. */
2889 static void
2890 nvptx_print_operand (FILE *file, rtx x, int code)
2892 if (code == '.')
2894 x = current_insn_predicate;
2895 if (x)
2897 fputs ("@", file);
2898 if (GET_CODE (x) == EQ)
2899 fputs ("!", file);
2900 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2902 return;
2904 else if (code == '#')
2906 fputs (".rn", file);
2907 return;
2910 enum rtx_code x_code = GET_CODE (x);
2911 machine_mode mode = GET_MODE (x);
2913 switch (code)
2915 case 'x':
2916 if (current_output_insn != NULL
2917 && find_reg_note (current_output_insn, REG_UNUSED, x) != NULL_RTX)
2919 fputs ("_", file);
2920 return;
2922 goto common;
2923 case 'B':
2924 if (SYMBOL_REF_P (XEXP (x, 0)))
2925 switch (SYMBOL_DATA_AREA (XEXP (x, 0)))
2927 case DATA_AREA_GENERIC:
2928 /* Assume worst-case: global. */
2929 gcc_fallthrough (); /* FALLTHROUGH. */
2930 case DATA_AREA_GLOBAL:
2931 break;
2932 case DATA_AREA_SHARED:
2933 fputs (".cta", file);
2934 return;
2935 case DATA_AREA_LOCAL:
2936 case DATA_AREA_CONST:
2937 case DATA_AREA_PARAM:
2938 default:
2939 gcc_unreachable ();
2942 /* There are 2 cases where membar.sys differs from membar.gl:
2943 - host accesses global memory (e.g. system-wide atomics)
2944 - 2 or more devices are setup in peer-to-peer mode, and one
2945 peer can access global memory of other peer.
2946 Neither is currently supported by OpenMP/OpenACC on nvptx, but
2947 that could change, so we default to membar.sys. We could support
2948 this more optimally by adding DATA_AREA_SYS and then emitting
2949 .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS. */
2950 fputs (".sys", file);
2951 return;
2953 case 'A':
2954 x = XEXP (x, 0);
2955 gcc_fallthrough (); /* FALLTHROUGH. */
2957 case 'D':
2958 if (GET_CODE (x) == CONST)
2959 x = XEXP (x, 0);
2960 if (GET_CODE (x) == PLUS)
2961 x = XEXP (x, 0);
2963 if (GET_CODE (x) == SYMBOL_REF)
2964 fputs (section_for_sym (x), file);
2965 break;
2967 case 't':
2968 case 'u':
2969 if (x_code == SUBREG)
2971 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2972 if (VECTOR_MODE_P (inner_mode)
2973 && (GET_MODE_SIZE (mode)
2974 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2975 mode = GET_MODE_INNER (inner_mode);
2976 else if (split_mode_p (inner_mode))
2977 mode = maybe_split_mode (inner_mode);
2978 else
2979 mode = inner_mode;
2981 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2982 break;
2984 case 'H':
2985 case 'L':
2987 rtx inner_x = SUBREG_REG (x);
2988 machine_mode inner_mode = GET_MODE (inner_x);
2989 machine_mode split = maybe_split_mode (inner_mode);
2991 output_reg (file, REGNO (inner_x), split,
2992 (code == 'H'
2993 ? GET_MODE_SIZE (inner_mode) / 2
2994 : 0));
2996 break;
2998 case 'S':
3000 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
3001 /* Same order as nvptx_shuffle_kind. */
3002 static const char *const kinds[] =
3003 {".up", ".down", ".bfly", ".idx"};
3004 fputs (kinds[kind], file);
3006 break;
3008 case 'T':
3009 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
3010 break;
3012 case 'j':
3013 fprintf (file, "@");
3014 goto common;
3016 case 'J':
3017 fprintf (file, "@!");
3018 goto common;
3020 case 'p':
3021 if (INTVAL (x) == 0)
3022 fprintf (file, "!");
3023 break;
3025 case 'c':
3026 mode = GET_MODE (XEXP (x, 0));
3027 switch (x_code)
3029 case EQ:
3030 fputs (".eq", file);
3031 break;
3032 case NE:
3033 if (FLOAT_MODE_P (mode))
3034 fputs (".neu", file);
3035 else
3036 fputs (".ne", file);
3037 break;
3038 case LE:
3039 case LEU:
3040 fputs (".le", file);
3041 break;
3042 case GE:
3043 case GEU:
3044 fputs (".ge", file);
3045 break;
3046 case LT:
3047 case LTU:
3048 fputs (".lt", file);
3049 break;
3050 case GT:
3051 case GTU:
3052 fputs (".gt", file);
3053 break;
3054 case LTGT:
3055 fputs (".ne", file);
3056 break;
3057 case UNEQ:
3058 fputs (".equ", file);
3059 break;
3060 case UNLE:
3061 fputs (".leu", file);
3062 break;
3063 case UNGE:
3064 fputs (".geu", file);
3065 break;
3066 case UNLT:
3067 fputs (".ltu", file);
3068 break;
3069 case UNGT:
3070 fputs (".gtu", file);
3071 break;
3072 case UNORDERED:
3073 fputs (".nan", file);
3074 break;
3075 case ORDERED:
3076 fputs (".num", file);
3077 break;
3078 default:
3079 gcc_unreachable ();
3081 if (FLOAT_MODE_P (mode)
3082 || x_code == EQ || x_code == NE
3083 || x_code == GEU || x_code == GTU
3084 || x_code == LEU || x_code == LTU)
3085 fputs (nvptx_ptx_type_from_mode (mode, true), file);
3086 else
3087 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
3088 break;
3089 default:
3090 common:
3091 switch (x_code)
3093 case SUBREG:
3095 rtx inner_x = SUBREG_REG (x);
3096 machine_mode inner_mode = GET_MODE (inner_x);
3097 machine_mode split = maybe_split_mode (inner_mode);
3099 if (VECTOR_MODE_P (inner_mode)
3100 && (GET_MODE_SIZE (mode)
3101 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
3103 output_reg (file, REGNO (inner_x), VOIDmode);
3104 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
3106 else if (split_mode_p (inner_mode)
3107 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
3108 output_reg (file, REGNO (inner_x), split);
3109 else
3110 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
3112 break;
3114 case REG:
3115 output_reg (file, REGNO (x), maybe_split_mode (mode));
3116 break;
3118 case MEM:
3119 fputc ('[', file);
3120 nvptx_print_address_operand (file, XEXP (x, 0), mode);
3121 fputc (']', file);
3122 break;
3124 case CONST_INT:
3125 output_addr_const (file, x);
3126 break;
3128 case CONST:
3129 case SYMBOL_REF:
3130 case LABEL_REF:
3131 /* We could use output_addr_const, but that can print things like
3132 "x-8", which breaks ptxas. Need to ensure it is output as
3133 "x+-8". */
3134 nvptx_print_address_operand (file, x, VOIDmode);
3135 break;
3137 case CONST_DOUBLE:
3138 long vals[2];
3139 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
3140 vals[0] &= 0xffffffff;
3141 vals[1] &= 0xffffffff;
3142 if (mode == SFmode)
3143 fprintf (file, "0f%08lx", vals[0]);
3144 else
3145 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
3146 break;
3148 case CONST_VECTOR:
3150 unsigned n = CONST_VECTOR_NUNITS (x);
3151 fprintf (file, "{ ");
3152 for (unsigned i = 0; i < n; ++i)
3154 if (i != 0)
3155 fprintf (file, ", ");
3157 rtx elem = CONST_VECTOR_ELT (x, i);
3158 output_addr_const (file, elem);
3160 fprintf (file, " }");
3162 break;
3164 default:
3165 output_addr_const (file, x);
3170 /* Record replacement regs used to deal with subreg operands. */
3171 struct reg_replace
3173 rtx replacement[MAX_RECOG_OPERANDS];
3174 machine_mode mode;
3175 int n_allocated;
3176 int n_in_use;
3179 /* Allocate or reuse a replacement in R and return the rtx. */
3181 static rtx
3182 get_replacement (struct reg_replace *r)
3184 if (r->n_allocated == r->n_in_use)
3185 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
3186 return r->replacement[r->n_in_use++];
3189 /* Clean up subreg operands. In ptx assembly, everything is typed, and
3190 the presence of subregs would break the rules for most instructions.
3191 Replace them with a suitable new register of the right size, plus
3192 conversion copyin/copyout instructions. */
3194 static void
3195 nvptx_reorg_subreg (void)
3197 struct reg_replace qiregs, hiregs, siregs, diregs;
3198 rtx_insn *insn, *next;
3200 qiregs.n_allocated = 0;
3201 hiregs.n_allocated = 0;
3202 siregs.n_allocated = 0;
3203 diregs.n_allocated = 0;
3204 qiregs.mode = QImode;
3205 hiregs.mode = HImode;
3206 siregs.mode = SImode;
3207 diregs.mode = DImode;
3209 for (insn = get_insns (); insn; insn = next)
3211 next = NEXT_INSN (insn);
3212 if (!NONDEBUG_INSN_P (insn)
3213 || asm_noperands (PATTERN (insn)) >= 0
3214 || GET_CODE (PATTERN (insn)) == USE
3215 || GET_CODE (PATTERN (insn)) == CLOBBER)
3216 continue;
3218 qiregs.n_in_use = 0;
3219 hiregs.n_in_use = 0;
3220 siregs.n_in_use = 0;
3221 diregs.n_in_use = 0;
3222 extract_insn (insn);
3223 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
3225 for (int i = 0; i < recog_data.n_operands; i++)
3227 rtx op = recog_data.operand[i];
3228 if (GET_CODE (op) != SUBREG)
3229 continue;
3231 rtx inner = SUBREG_REG (op);
3233 machine_mode outer_mode = GET_MODE (op);
3234 machine_mode inner_mode = GET_MODE (inner);
3235 gcc_assert (s_ok);
3236 if (s_ok
3237 && (GET_MODE_PRECISION (inner_mode)
3238 >= GET_MODE_PRECISION (outer_mode)))
3239 continue;
3240 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
3241 struct reg_replace *r = (outer_mode == QImode ? &qiregs
3242 : outer_mode == HImode ? &hiregs
3243 : outer_mode == SImode ? &siregs
3244 : &diregs);
3245 rtx new_reg = get_replacement (r);
3247 if (recog_data.operand_type[i] != OP_OUT)
3249 enum rtx_code code;
3250 if (GET_MODE_PRECISION (inner_mode)
3251 < GET_MODE_PRECISION (outer_mode))
3252 code = ZERO_EXTEND;
3253 else
3254 code = TRUNCATE;
3256 rtx pat = gen_rtx_SET (new_reg,
3257 gen_rtx_fmt_e (code, outer_mode, inner));
3258 emit_insn_before (pat, insn);
3261 if (recog_data.operand_type[i] != OP_IN)
3263 enum rtx_code code;
3264 if (GET_MODE_PRECISION (inner_mode)
3265 < GET_MODE_PRECISION (outer_mode))
3266 code = TRUNCATE;
3267 else
3268 code = ZERO_EXTEND;
3270 rtx pat = gen_rtx_SET (inner,
3271 gen_rtx_fmt_e (code, inner_mode, new_reg));
3272 emit_insn_after (pat, insn);
3274 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
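/* For illustration: an input operand (subreg:SI (reg:QI 100) 0) is
   replaced (a sketch) by a fresh (reg:SI 200) preceded by

       (set (reg:SI 200) (zero_extend:SI (reg:QI 100)))

   while an output operand instead gets the inverse truncation back into
   the inner register emitted after the insn.  */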
3279 /* Return a SImode "master lane index" register for uniform-simt, allocating on
3280 first use. */
3282 static rtx
3283 nvptx_get_unisimt_master ()
3285 rtx &master = cfun->machine->unisimt_master;
3286 return master ? master : master = gen_reg_rtx (SImode);
3289 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
3291 static rtx
3292 nvptx_get_unisimt_predicate ()
3294 rtx &pred = cfun->machine->unisimt_predicate;
3295 return pred ? pred : pred = gen_reg_rtx (BImode);
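/* Return a BImode predicate register used when predicating code outside
   of the SIMT region, likewise allocated on first use.  */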
3298 static rtx
3299 nvptx_get_unisimt_outside_simt_predicate ()
3301 rtx &pred = cfun->machine->unisimt_outside_simt_predicate;
3302 return pred ? pred : pred = gen_reg_rtx (BImode);
3305 /* Return true if the given call insn references one of the functions provided by
3306 the CUDA runtime: malloc, free, vprintf. */
3308 static bool
3309 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
3311 rtx pat = PATTERN (insn);
3312 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
3313 pat = XVECEXP (pat, 0, 0);
3314 if (GET_CODE (pat) == SET)
3315 pat = SET_SRC (pat);
3316 gcc_checking_assert (GET_CODE (pat) == CALL
3317 && GET_CODE (XEXP (pat, 0)) == MEM);
3318 rtx addr = XEXP (XEXP (pat, 0), 0);
3319 if (GET_CODE (addr) != SYMBOL_REF)
3320 return false;
3321 const char *name = XSTR (addr, 0);
3322 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
3323 references with forced assembler name refer to PTX syscalls. For vprintf,
3324 accept both normal and forced-assembler-name references. */
3325 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
3326 || !strcmp (name, "*malloc")
3327 || !strcmp (name, "*free"));
3330 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
3331 propagate its value from lane MASTER to current lane. */
3333 static bool
3334 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
3336 rtx reg;
3337 if (GET_CODE (set) == SET
3338 && REG_P (reg = SET_DEST (set))
3339 && find_reg_note (insn, REG_UNUSED, reg) == NULL_RTX)
3341 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX),
3342 insn);
3343 return true;
3346 return false;
3349 static void
3350 predicate_insn (rtx_insn *insn, rtx pred)
3352 rtx pat = PATTERN (insn);
3353 pred = gen_rtx_NE (BImode, pred, const0_rtx);
3354 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
3355 bool changed_p = validate_change (insn, &PATTERN (insn), pat, false);
3356 gcc_assert (changed_p);
3359 /* Adjust code for uniform-simt code generation variant by making atomics and
3360 "syscalls" conditionally executed, and inserting shuffle-based propagation
3361 for registers being set. */
3363 static void
3364 nvptx_reorg_uniform_simt ()
3366 rtx_insn *insn, *next;
3368 for (insn = get_insns (); insn; insn = next)
3370 next = NEXT_INSN (insn);
3372 /* Skip NOTE, USE, etc. */
3373 if (!INSN_P (insn) || recog_memoized (insn) == -1)
3374 continue;
3376 if (CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
3378 /* Handle syscall. */
3380 else if (get_attr_atomic (insn))
3382 /* Handle atomic insn. */
3384 else
3385 continue;
3387 rtx pat = PATTERN (insn);
3388 rtx master = nvptx_get_unisimt_master ();
3389 bool shuffle_p = false;
3390 switch (GET_CODE (pat))
3392 case PARALLEL:
3393 for (int i = 0; i < XVECLEN (pat, 0); i++)
3394 shuffle_p
3395 |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
3396 break;
3397 case SET:
3398 shuffle_p |= nvptx_unisimt_handle_set (pat, insn, master);
3399 break;
3400 default:
3401 gcc_unreachable ();
3404 if (shuffle_p && TARGET_PTX_6_0)
3406 /* The shuffle is a sync, so uniformity is guaranteed. */
3408 else
3410 if (TARGET_PTX_6_0)
3412 gcc_assert (!shuffle_p);
3413 /* Emit after the insn, to guarantee uniformity. */
3414 emit_insn_after (gen_nvptx_warpsync (), insn);
3416 else
3418 /* Emit after the insn (and before the shuffle, if there are any)
3419 to check uniformity. */
3420 emit_insn_after (gen_nvptx_uniform_warp_check (), insn);
3424 rtx pred = nvptx_get_unisimt_predicate ();
3425 predicate_insn (insn, pred);
3427 pred = NULL_RTX;
3428 for (rtx_insn *post = NEXT_INSN (insn); post != next;
3429 post = NEXT_INSN (post))
3431 if (pred == NULL_RTX)
3432 pred = nvptx_get_unisimt_outside_simt_predicate ();
3433 predicate_insn (post, pred);
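/* For illustration: inside an SIMT region an atomic operation ends up as
   roughly (a sketch)

       @%pred atom.add.u32  %r, [addr], 1;
       shfl.idx.b32  %r, %r, %master, 31;   // or warpsync / warp check

   so that only the master lane executes the atomic and its result is
   then propagated to the whole warp.  */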
3438 /* Offloading function attributes. */
3440 struct offload_attrs
3442 unsigned mask;
3443 int num_gangs;
3444 int num_workers;
3445 int vector_length;
3448 /* Define entries for cfun->machine->axis_dim. */
3450 #define MACH_VECTOR_LENGTH 0
3451 #define MACH_MAX_WORKERS 1
3453 static void populate_offload_attrs (offload_attrs *oa);
3455 static void
3456 init_axis_dim (void)
3458 offload_attrs oa;
3459 int max_workers;
3461 populate_offload_attrs (&oa);
3463 if (oa.num_workers == 0)
3464 max_workers = PTX_CTA_SIZE / oa.vector_length;
3465 else
3466 max_workers = oa.num_workers;
3468 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
3469 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
3470 cfun->machine->axis_dim_init_p = true;
3473 static int ATTRIBUTE_UNUSED
3474 nvptx_mach_max_workers ()
3476 if (!cfun->machine->axis_dim_init_p)
3477 init_axis_dim ();
3478 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
3481 static int ATTRIBUTE_UNUSED
3482 nvptx_mach_vector_length ()
3484 if (!cfun->machine->axis_dim_init_p)
3485 init_axis_dim ();
3486 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3489 /* Loop structure of the function. The entire function is described as
3490 a NULL loop. */
3491 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'. */
3493 struct parallel
3495 /* Parent parallel. */
3496 parallel *parent;
3498 /* Next sibling parallel. */
3499 parallel *next;
3501 /* First child parallel. */
3502 parallel *inner;
3504 /* Partitioning mask of the parallel. */
3505 unsigned mask;
3507 /* Partitioning used within inner parallels. */
3508 unsigned inner_mask;
3510 /* Location of parallel forked and join. The forked is the first
3511 block in the parallel and the join is the first block after of
3512 the partition. */
3513 basic_block forked_block;
3514 basic_block join_block;
3516 rtx_insn *forked_insn;
3517 rtx_insn *join_insn;
3519 rtx_insn *fork_insn;
3520 rtx_insn *joining_insn;
3522 /* Basic blocks in this parallel, but not in child parallels. The
3523 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3524 blocks are not. */
3525 auto_vec<basic_block> blocks;
3527 public:
3528 parallel (parallel *parent, unsigned mask);
3529 ~parallel ();
3532 /* Constructor links the new parallel into its parent's chain of
3533 children. */
3535 parallel::parallel (parallel *parent_, unsigned mask_)
3536 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3538 forked_block = join_block = 0;
3539 forked_insn = join_insn = 0;
3540 fork_insn = joining_insn = 0;
3542 if (parent)
3544 next = parent->inner;
3545 parent->inner = this;
3549 parallel::~parallel ()
3551 delete inner;
3552 delete next;
3555 /* Map of basic blocks to insns */
3556 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3558 /* A tuple of an insn of interest and the BB in which it resides. */
3559 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3560 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3562 /* Split basic blocks such that each forked and join unspecs are at
3563 the start of their basic blocks. Thus afterwards each block will
3564 have a single partitioning mode. We also do the same for return
3565 insns, as they are executed by every thread. Return the
3566 partitioning mode of the function as a whole. Populate MAP with
3567 head and tail blocks. We also clear the BB visited flag, which is
3568 used when finding partitions. */
3569 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'. */
3571 static void
3572 nvptx_split_blocks (bb_insn_map_t *map)
3574 insn_bb_vec_t worklist;
3575 basic_block block;
3576 rtx_insn *insn;
3578 /* Locate all the reorg instructions of interest. */
3579 FOR_ALL_BB_FN (block, cfun)
3581 bool seen_insn = false;
3583 /* Clear visited flag, for use by parallel locator */
3584 block->flags &= ~BB_VISITED;
3586 FOR_BB_INSNS (block, insn)
3588 if (!INSN_P (insn))
3589 continue;
3590 switch (recog_memoized (insn))
3592 default:
3593 seen_insn = true;
3594 continue;
3595 case CODE_FOR_nvptx_forked:
3596 case CODE_FOR_nvptx_join:
3597 break;
3599 case CODE_FOR_return:
3600 /* We also need to split just before return insns, as
3601 that insn needs executing by all threads, but the
3602 block it is in probably does not. */
3603 break;
3606 if (seen_insn)
3607 /* We've found an instruction that must be at the start of
3608 a block, but isn't. Add it to the worklist. */
3609 worklist.safe_push (insn_bb_t (insn, block));
3610 else
3611 /* It was already the first instruction. Just add it to
3612 the map. */
3613 map->get_or_insert (block) = insn;
3614 seen_insn = true;
3618 /* Split blocks on the worklist. */
3619 unsigned ix;
3620 insn_bb_t *elt;
3621 basic_block remap = 0;
3622 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3624 if (remap != elt->second)
3626 block = elt->second;
3627 remap = block;
3630 /* Split block before insn. The insn is in the new block */
3631 edge e = split_block (block, PREV_INSN (elt->first));
3633 block = e->dest;
3634 map->get_or_insert (block) = elt->first;
3638 /* Return true if MASK contains parallelism that requires shared
3639 memory to broadcast. */
3641 static bool
3642 nvptx_needs_shared_bcast (unsigned mask)
3644 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3645 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3646 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3648 return worker || large_vector;
3651 /* BLOCK is a basic block containing a head or tail instruction.
3652 Locate the associated prehead or pretail instruction, which must be
3653 in the single predecessor block. */
3655 static rtx_insn *
3656 nvptx_discover_pre (basic_block block, int expected)
3658 gcc_assert (block->preds->length () == 1);
3659 basic_block pre_block = (*block->preds)[0]->src;
3660 rtx_insn *pre_insn;
3662 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3663 pre_insn = PREV_INSN (pre_insn))
3664 gcc_assert (pre_insn != BB_HEAD (pre_block));
3666 gcc_assert (recog_memoized (pre_insn) == expected);
3667 return pre_insn;
3670 /* Dump this parallel and all its inner parallels. */
3671 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'. */
3673 static void
3674 nvptx_dump_pars (parallel *par, unsigned depth)
3676 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3677 depth, par->mask,
3678 par->forked_block ? par->forked_block->index : -1,
3679 par->join_block ? par->join_block->index : -1);
3681 fprintf (dump_file, " blocks:");
3683 basic_block block;
3684 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3685 fprintf (dump_file, " %d", block->index);
3686 fprintf (dump_file, "\n");
3687 if (par->inner)
3688 nvptx_dump_pars (par->inner, depth + 1);
3690 if (par->next)
3691 nvptx_dump_pars (par->next, depth);
3694 /* If BLOCK contains a fork/join marker, process it to create or
3695 terminate a loop structure. Add this block to the current loop,
3696 and then walk successor blocks. */
3697 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'. */
3699 static parallel *
3700 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3702 if (block->flags & BB_VISITED)
3703 return par;
3704 block->flags |= BB_VISITED;
3706 if (rtx_insn **endp = map->get (block))
3708 rtx_insn *end = *endp;
3710 /* This is a block head or tail, or return instruction. */
3711 switch (recog_memoized (end))
3713 case CODE_FOR_return:
3714 /* Return instructions are in their own block, and we
3715 don't need to do anything more. */
3716 return par;
3718 case CODE_FOR_nvptx_forked:
3719 /* Loop head, create a new inner loop and add it into
3720 our parent's child list. */
3722 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3724 gcc_assert (mask);
3725 par = new parallel (par, mask);
3726 par->forked_block = block;
3727 par->forked_insn = end;
3728 if (nvptx_needs_shared_bcast (mask))
3729 par->fork_insn
3730 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3732 break;
3734 case CODE_FOR_nvptx_join:
3735 /* A loop tail. Finish the current loop and return to
3736 parent. */
3738 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3740 gcc_assert (par->mask == mask);
3741 gcc_assert (par->join_block == NULL);
3742 par->join_block = block;
3743 par->join_insn = end;
3744 if (nvptx_needs_shared_bcast (mask))
3745 par->joining_insn
3746 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3747 par = par->parent;
3749 break;
3751 default:
3752 gcc_unreachable ();
3756 if (par)
3757 /* Add this block onto the current loop's list of blocks. */
3758 par->blocks.safe_push (block);
3759 else
3760 /* This must be the entry block. Create a NULL parallel. */
3761 par = new parallel (0, 0);
3763 /* Walk successor blocks. */
3764 edge e;
3765 edge_iterator ei;
3767 FOR_EACH_EDGE (e, ei, block->succs)
3768 nvptx_find_par (map, par, e->dest);
3770 return par;
3773 /* DFS walk the CFG looking for fork & join markers. Construct
3774 loop structures as we go. MAP is a mapping of basic blocks
3775 to head & tail markers, discovered when splitting blocks. This
3776 speeds up the discovery. We rely on the BB visited flag having
3777 been cleared when splitting blocks. */
3778 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'. */
3780 static parallel *
3781 nvptx_discover_pars (bb_insn_map_t *map)
3783 basic_block block;
3785 /* Mark exit blocks as visited. */
3786 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3787 block->flags |= BB_VISITED;
3789 /* And entry block as not. */
3790 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3791 block->flags &= ~BB_VISITED;
3793 parallel *par = nvptx_find_par (map, 0, block);
3795 if (dump_file)
3797 fprintf (dump_file, "\nLoops\n");
3798 nvptx_dump_pars (par, 0);
3799 fprintf (dump_file, "\n");
3802 return par;
3805 /* Analyse a group of BBs within a partitioned region and create N
3806 Single-Entry-Single-Exit regions. Some of those regions will be
3807 trivial ones consisting of a single BB. The blocks of a
3808 partitioned region might form a set of disjoint graphs -- because
3809 the region encloses a differently partitioned sub-region.
3811 We use the linear time algorithm described in 'Finding Regions Fast:
3812 Single Entry Single Exit and control Regions in Linear Time'
3813 Johnson, Pearson & Pingali. That algorithm deals with complete
3814 CFGs, where a back edge is inserted from END to START, and thus the
3815 problem becomes one of finding equivalent loops.
3817 In this case we have a partial CFG. We complete it by redirecting
3818 any incoming edge to the graph to be from an arbitrary external BB,
3819 and similarly redirecting any outgoing edge to be to that BB.
3820 Thus we end up with a closed graph.
3822 The algorithm works by building a spanning tree of an undirected
3823 graph and keeping track of back edges from nodes further from the
3824 root in the tree to nodes nearer to the root in the tree. In the
3825 description below, the root is up and the tree grows downwards.
3827 We avoid having to deal with degenerate back-edges to the same
3828 block, by splitting each BB into 3 -- one for input edges, one for
3829 the node itself and one for the output edges. Such back edges are
3830 referred to as 'Brackets'. Cycle equivalent nodes will have the
3831 same set of brackets.
3833 Determining bracket equivalency is done by maintaining a list of
3834 brackets in such a manner that the list length and final bracket
3835 uniquely identify the set.
3837 We use coloring to mark all BBs with cycle equivalency with the
3838 same color. This is the output of the 'Finding Regions Fast'
3839 algorithm. Notice it doesn't actually find the set of nodes within
3840 a particular region, just unordered sets of nodes that are the
3841 entries and exits of SESE regions.
3843 After determining cycle equivalency, we need to find the minimal
3844 set of SESE regions. Do this with a DFS coloring walk of the
3845 complete graph. We're either 'looking' or 'coloring'. When
3846 looking, and we're in the subgraph, we start coloring the color of
3847 the current node, and remember that node as the start of the
3848 current color's SESE region. Every time we go to a new node, we
3849 decrement the count of nodes with that color. If it reaches zero,
3850 we remember that node as the end of the current color's SESE region
3851 and return to 'looking'. Otherwise we color the node the current
3852 color.
3854 This way we end up with coloring the inside of non-trivial SESE
3855 regions with the color of that region. */
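/* For example (a sketch): in the diamond CFG

         A
        / \
       B   C
        \ /
         D

   with the implied back edge from D to A, the pseudo-nodes around A and
   D carry the same bracket set and therefore receive the same color,
   identifying (A, D) as the entry/exit pair of a SESE region.  */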
3857 /* A pair of BBs. We use this to represent SESE regions. */
3858 typedef std::pair<basic_block, basic_block> bb_pair_t;
3859 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3861 /* A node in the undirected CFG. The discriminator SECOND indicates just
3862 above or just below the BB indicated by FIRST. */
3863 typedef std::pair<basic_block, int> pseudo_node_t;
3865 /* A bracket indicates an edge towards the root of the spanning tree of the
3866 undirected graph. Each bracket has a color, determined
3867 from the current set of brackets. */
3868 struct bracket
3870 pseudo_node_t back; /* Back target */
3872 /* Current color and size of set. */
3873 unsigned color;
3874 unsigned size;
3876 bracket (pseudo_node_t back_)
3877 : back (back_), color (~0u), size (~0u)
3881 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3883 if (length != size)
3885 size = length;
3886 color = color_counts.length ();
3887 color_counts.quick_push (0);
3889 color_counts[color]++;
3890 return color;
3894 typedef auto_vec<bracket> bracket_vec_t;
3896 /* Basic block info for finding SESE regions. */
3898 struct bb_sese
3900 int node; /* Node number in spanning tree. */
3901 int parent; /* Parent node number. */
3903 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3904 edges arrive at pseudo-node Ai and the outgoing edges leave at
3905 pseudo-node Ao. We have to remember which way we arrived at a
3906 particular node when generating the spanning tree. dir > 0 means
3907 we arrived at Ai, dir < 0 means we arrived at Ao. */
3908 int dir;
3910 /* Lowest numbered pseudo-node reached via a backedge from this
3911 node, or any descendant. */
3912 pseudo_node_t high;
3914 int color; /* Cycle-equivalence color */
3916 /* Stack of brackets for this node. */
3917 bracket_vec_t brackets;
3919 bb_sese (unsigned node_, unsigned p, int dir_)
3920 :node (node_), parent (p), dir (dir_)
3923 ~bb_sese ();
3925 /* Push a bracket ending at BACK. */
3926 void push (const pseudo_node_t &back)
3928 if (dump_file)
3929 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3930 back.first ? back.first->index : 0, back.second);
3931 brackets.safe_push (bracket (back));
3934 void append (bb_sese *child);
3935 void remove (const pseudo_node_t &);
3937 /* Set node's color. */
3938 void set_color (auto_vec<unsigned> &color_counts)
3940 color = brackets.last ().get_color (color_counts, brackets.length ());
3944 bb_sese::~bb_sese ()
3948 /* Destructively append CHILD's brackets. */
3950 void
3951 bb_sese::append (bb_sese *child)
3953 if (int len = child->brackets.length ())
3955 int ix;
3957 if (dump_file)
3959 for (ix = 0; ix < len; ix++)
3961 const pseudo_node_t &pseudo = child->brackets[ix].back;
3962 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3963 child->node, pseudo.first ? pseudo.first->index : 0,
3964 pseudo.second);
3967 if (!brackets.length ())
3968 std::swap (brackets, child->brackets);
3969 else
3971 brackets.reserve (len);
3972 for (ix = 0; ix < len; ix++)
3973 brackets.quick_push (child->brackets[ix]);
3978 /* Remove brackets that terminate at PSEUDO. */
3980 void
3981 bb_sese::remove (const pseudo_node_t &pseudo)
3983 unsigned removed = 0;
3984 int len = brackets.length ();
3986 for (int ix = 0; ix < len; ix++)
3988 if (brackets[ix].back == pseudo)
3990 if (dump_file)
3991 fprintf (dump_file, "Removing backedge %d:%+d\n",
3992 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3993 removed++;
3995 else if (removed)
3996 brackets[ix-removed] = brackets[ix];
3998 while (removed--)
3999 brackets.pop ();
4002 /* Accessors for BB's aux pointer. */
4003 #define BB_SET_SESE(B, S) ((B)->aux = (S))
4004 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
4006 /* DFS walk creating SESE data structures. Only cover nodes with
4007 BB_VISITED set. Append discovered blocks to LIST. We number in
4008 increments of 3 so that the above and below pseudo nodes can be
4009 implicitly numbered too. */
4011 static int
4012 nvptx_sese_number (int n, int p, int dir, basic_block b,
4013 auto_vec<basic_block> *list)
4015 if (BB_GET_SESE (b))
4016 return n;
4018 if (dump_file)
4019 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
4020 b->index, n, p, dir);
4022 BB_SET_SESE (b, new bb_sese (n, p, dir));
4023 p = n;
4025 n += 3;
4026 list->quick_push (b);
4028 /* First walk the nodes on the 'other side' of this node, then walk
4029 the nodes on the same side. */
4030 for (unsigned ix = 2; ix; ix--)
4032 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
4033 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
4034 : offsetof (edge_def, src));
4035 edge e;
4036 edge_iterator ei;
4038 FOR_EACH_EDGE (e, ei, edges)
4040 basic_block target = *(basic_block *)((char *)e + offset);
4042 if (target->flags & BB_VISITED)
4043 n = nvptx_sese_number (n, p, dir, target, list);
4045 dir = -dir;
4047 return n;
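/* Illustrative note: the numbering starts at 2 and advances in steps
   of 3, so a block numbered N leaves N-1 and N+1 free for its 'above'
   and 'below' pseudo-nodes.  Pseudo-node numbers are formed as node
   +/- 1 and therefore never collide with real node numbers.  */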
4050 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
4051 EDGES are the outgoing edges and OFFSET is the offset to the src
4052 or dst block on the edges. */
4054 static void
4055 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
4056 vec<edge, va_gc> *edges, size_t offset)
4058 edge e;
4059 edge_iterator ei;
4060 int hi_back = depth;
4061 pseudo_node_t node_back (nullptr, depth);
4062 int hi_child = depth;
4063 pseudo_node_t node_child (nullptr, depth);
4064 basic_block child = NULL;
4065 unsigned num_children = 0;
4066 int usd = -dir * sese->dir;
4068 if (dump_file)
4069 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
4070 me->index, sese->node, dir);
4072 if (dir < 0)
4074 /* This is the above pseudo-child. It has the BB itself as an
4075 additional child node. */
4076 node_child = sese->high;
4077 hi_child = node_child.second;
4078 if (node_child.first)
4079 hi_child += BB_GET_SESE (node_child.first)->node;
4080 num_children++;
4083 /* Examine each edge.
4084 - if it is a child (a) append its bracket list and (b) record
4085 whether it is the child with the highest reaching bracket.
4086 - if it is an edge to ancestor, record whether it's the highest
4087 reaching backlink. */
4088 FOR_EACH_EDGE (e, ei, edges)
4090 basic_block target = *(basic_block *)((char *)e + offset);
4092 if (bb_sese *t_sese = BB_GET_SESE (target))
4094 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
4096 /* Child node. Append its bracket list. */
4097 num_children++;
4098 sese->append (t_sese);
4100 /* Compare its hi value. */
4101 int t_hi = t_sese->high.second;
4103 if (basic_block child_hi_block = t_sese->high.first)
4104 t_hi += BB_GET_SESE (child_hi_block)->node;
4106 if (hi_child > t_hi)
4108 hi_child = t_hi;
4109 node_child = t_sese->high;
4110 child = target;
4113 else if (t_sese->node < sese->node + dir
4114 && !(dir < 0 && sese->parent == t_sese->node))
4116 /* Non-parental ancestor node -- a backlink. */
4117 int d = usd * t_sese->dir;
4118 int back = t_sese->node + d;
4120 if (hi_back > back)
4122 hi_back = back;
4123 node_back = pseudo_node_t (target, d);
4127 else
4128 { /* Fallen off graph, backlink to entry node. */
4129 hi_back = 0;
4130 node_back = pseudo_node_t (nullptr, 0);
4134 /* Remove any brackets that terminate at this pseudo node. */
4135 sese->remove (pseudo_node_t (me, dir));
4137 /* Now push any backlinks from this pseudo node. */
4138 FOR_EACH_EDGE (e, ei, edges)
4140 basic_block target = *(basic_block *)((char *)e + offset);
4141 if (bb_sese *t_sese = BB_GET_SESE (target))
4143 if (t_sese->node < sese->node + dir
4144 && !(dir < 0 && sese->parent == t_sese->node))
4145 /* Non-parental ancestor node - backedge from me. */
4146 sese->push (pseudo_node_t (target, usd * t_sese->dir));
4148 else
4150 /* Back edge to entry node. */
4151 sese->push (pseudo_node_t (nullptr, 0));
4155 /* If this node leads directly or indirectly to a no-return region of
4156 the graph, then fake a backedge to entry node. */
4157 if (!sese->brackets.length () || !edges || !edges->length ())
4159 hi_back = 0;
4160 node_back = pseudo_node_t (nullptr, 0);
4161 sese->push (node_back);
4164 /* Record the highest reaching backedge from us or a descendant. */
4165 sese->high = hi_back < hi_child ? node_back : node_child;
4167 if (num_children > 1)
4169 /* There is more than one child -- this is a Y shaped piece of
4170 spanning tree. We have to insert a fake backedge from this
4171 node to the highest ancestor reached by not-the-highest
4172 reaching child. Note that there may be multiple children
4173 with backedges to the same highest node. That's ok and we
4174 insert the edge to that highest node. */
4175 hi_child = depth;
4176 if (dir < 0 && child)
4178 node_child = sese->high;
4179 hi_child = node_child.second;
4180 if (node_child.first)
4181 hi_child += BB_GET_SESE (node_child.first)->node;
4184 FOR_EACH_EDGE (e, ei, edges)
4186 basic_block target = *(basic_block *)((char *)e + offset);
4188 if (target == child)
4189 /* Ignore the highest child. */
4190 continue;
4192 bb_sese *t_sese = BB_GET_SESE (target);
4193 if (!t_sese)
4194 continue;
4195 if (t_sese->parent != sese->node)
4196 /* Not a child. */
4197 continue;
4199 /* Compare its hi value. */
4200 int t_hi = t_sese->high.second;
4202 if (basic_block child_hi_block = t_sese->high.first)
4203 t_hi += BB_GET_SESE (child_hi_block)->node;
4205 if (hi_child > t_hi)
4207 hi_child = t_hi;
4208 node_child = t_sese->high;
4212 sese->push (node_child);
4217 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
4218 proceed to successors. Set SESE entry and exit nodes of
4219 REGIONS. */
4221 static void
4222 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
4223 basic_block block, int coloring)
4225 bb_sese *sese = BB_GET_SESE (block);
4227 if (block->flags & BB_VISITED)
4229 /* If we've already encountered this block, either we must not
4230 be coloring, or it must have been colored the current color. */
4231 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
4232 return;
4235 block->flags |= BB_VISITED;
4237 if (sese)
4239 if (coloring < 0)
4241 /* Start coloring a region. */
4242 regions[sese->color].first = block;
4243 coloring = sese->color;
4246 if (!--color_counts[sese->color] && sese->color == coloring)
4248 /* Found final block of SESE region. */
4249 regions[sese->color].second = block;
4250 coloring = -1;
4252 else
4253 /* Color the node, so we can assert on revisiting the node
4254 that the graph is indeed SESE. */
4255 sese->color = coloring;
4257 else
4258 /* Fallen off the subgraph, we cannot be coloring. */
4259 gcc_assert (coloring < 0);
4261 /* Walk each successor block. */
4262 if (block->succs && block->succs->length ())
4264 edge e;
4265 edge_iterator ei;
4267 FOR_EACH_EDGE (e, ei, block->succs)
4268 nvptx_sese_color (color_counts, regions, e->dest, coloring);
4270 else
4271 gcc_assert (coloring < 0);
4274 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
4275 end up with NULL entries in it. */
4277 static void
4278 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
4280 basic_block block;
4281 int ix;
4283 /* First clear each BB of the whole function. */
4284 FOR_ALL_BB_FN (block, cfun)
4286 block->flags &= ~BB_VISITED;
4287 BB_SET_SESE (block, 0);
4290 /* Mark blocks in the function that are in this graph. */
4291 for (ix = 0; blocks.iterate (ix, &block); ix++)
4292 block->flags |= BB_VISITED;
4294 /* Counts of nodes assigned to each color. There cannot be more
4295 colors than blocks (and hopefully there will be fewer). */
4296 auto_vec<unsigned> color_counts;
4297 color_counts.reserve (blocks.length ());
4299 /* Worklist of nodes in the spanning tree. Again, there cannot be
4300 more nodes in the tree than blocks (there will be fewer if the
4301 CFG of blocks is disjoint). */
4302 auto_vec<basic_block> spanlist;
4303 spanlist.reserve (blocks.length ());
4305 /* Make sure every block has its cycle class determined. */
4306 for (ix = 0; blocks.iterate (ix, &block); ix++)
4308 if (BB_GET_SESE (block))
4309 /* We already met this block in an earlier graph solve. */
4310 continue;
4312 if (dump_file)
4313 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
4315 /* Number the nodes reachable from block in initial DFS order. */
4316 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
4318 /* Now walk in reverse DFS order to find cycle equivalents. */
4319 while (spanlist.length ())
4321 block = spanlist.pop ();
4322 bb_sese *sese = BB_GET_SESE (block);
4324 /* Do the pseudo node below. */
4325 nvptx_sese_pseudo (block, sese, depth, +1,
4326 sese->dir > 0 ? block->succs : block->preds,
4327 (sese->dir > 0 ? offsetof (edge_def, dest)
4328 : offsetof (edge_def, src)));
4329 sese->set_color (color_counts);
4330 /* Do the pseudo node above. */
4331 nvptx_sese_pseudo (block, sese, depth, -1,
4332 sese->dir < 0 ? block->succs : block->preds,
4333 (sese->dir < 0 ? offsetof (edge_def, dest)
4334 : offsetof (edge_def, src)));
4336 if (dump_file)
4337 fprintf (dump_file, "\n");
4340 if (dump_file)
4342 unsigned count;
4343 const char *comma = "";
4345 fprintf (dump_file, "Found %d cycle equivalents\n",
4346 color_counts.length ());
4347 for (ix = 0; color_counts.iterate (ix, &count); ix++)
4349 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
4351 comma = "";
4352 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
4353 if (BB_GET_SESE (block)->color == ix)
4355 block->flags |= BB_VISITED;
4356 fprintf (dump_file, "%s%d", comma, block->index);
4357 comma=",";
4359 fprintf (dump_file, "}");
4360 comma = ", ";
4362 fprintf (dump_file, "\n");
4365 /* Now we've colored every block in the subgraph. We now need to
4366 determine the minimal set of SESE regions that cover that
4367 subgraph. Do this with a DFS walk of the complete function.
4368 During the walk we're either 'looking' or 'coloring'. When we
4369 reach the last node of a particular color, we stop coloring and
4370 return to looking. */
4372 /* There cannot be more SESE regions than colors. */
4373 regions.reserve (color_counts.length ());
4374 for (ix = color_counts.length (); ix--;)
4375 regions.quick_push (bb_pair_t (0, 0));
4377 for (ix = 0; blocks.iterate (ix, &block); ix++)
4378 block->flags &= ~BB_VISITED;
4380 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
4382 if (dump_file)
4384 const char *comma = "";
4385 int len = regions.length ();
4387 fprintf (dump_file, "SESE regions:");
4388 for (ix = 0; ix != len; ix++)
4390 basic_block from = regions[ix].first;
4391 basic_block to = regions[ix].second;
4393 if (from)
4395 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
4396 if (to != from)
4397 fprintf (dump_file, "->%d", to->index);
4399 int color = BB_GET_SESE (from)->color;
4401 /* Print the blocks within the region (excluding ends). */
4402 FOR_EACH_BB_FN (block, cfun)
4404 bb_sese *sese = BB_GET_SESE (block);
4406 if (sese && sese->color == color
4407 && block != from && block != to)
4408 fprintf (dump_file, ".%d", block->index);
4410 fprintf (dump_file, "}");
4412 comma = ",";
4414 fprintf (dump_file, "\n\n");
4417 for (ix = 0; blocks.iterate (ix, &block); ix++)
4418 delete BB_GET_SESE (block);
4421 #undef BB_SET_SESE
4422 #undef BB_GET_SESE
4424 /* Propagate live state at the start of a partitioned region. IS_CALL
4425 indicates whether the propagation is for a (partitioned) call
4426 instruction. BLOCK provides the live register information, and
4427 might not contain INSN. Propagation is inserted just after INSN. RW
4428 indicates whether we are reading and/or writing state. This
4429 separation is needed for worker-level propagation where we
4430 essentially do a spill & fill. FN is the underlying worker
4431 function to generate the propagation instructions for a single
4432 register. DATA is user data.
4434 Returns true if we didn't emit any instructions.
4436 We propagate the live register set for non-calls and the entire
4437 frame for calls and non-calls. We could do better by (a)
4438 propagating just the live set that is used within the partitioned
4439 regions and (b) only propagating stack entries that are used. The
4440 latter might be quite hard to determine. */
4442 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
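/* A minimal sketch of the frame-copy loop built below (names and
   ordering illustrative; fs is the frame size in DImode words):

	ptr = frame_pointer;
	idx = fs;
     loop:
	idx -= 1;
	tmp = *ptr;		// only if RW includes PM_read
	<FN (tmp, RW, ...)>	// the per-register propagator
	*ptr = tmp;		// only if RW includes PM_write
	pred = idx != 0;
	ptr += 8;
	if (pred) goto loop;

   For a single-word frame (fs == 1) the index, predicate and branch
   are omitted and only the loop body is emitted.  */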
4444 static bool
4445 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
4446 propagate_mask rw, propagator_fn fn, void *data, bool vector)
4448 bitmap live = DF_LIVE_IN (block);
4449 bitmap_iterator iterator;
4450 unsigned ix;
4451 bool empty = true;
4453 /* Copy the frame array. */
4454 HOST_WIDE_INT fs = get_frame_size ();
4455 if (fs)
4457 rtx tmp = gen_reg_rtx (DImode);
4458 rtx idx = NULL_RTX;
4459 rtx ptr = gen_reg_rtx (Pmode);
4460 rtx pred = NULL_RTX;
4461 rtx_code_label *label = NULL;
4463 empty = false;
4464 /* The frame size might not be DImode compatible, but the frame
4465 array's declaration will be. So it's ok to round up here. */
4466 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
4467 /* Detect single iteration loop. */
4468 if (fs == 1)
4469 fs = 0;
4471 start_sequence ();
4472 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
4473 if (fs)
4475 idx = gen_reg_rtx (SImode);
4476 pred = gen_reg_rtx (BImode);
4477 label = gen_label_rtx ();
4479 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
4480 /* Allow worker function to initialize anything needed. */
4481 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
4482 if (init)
4483 emit_insn (init);
4484 emit_label (label);
4485 LABEL_NUSES (label)++;
4486 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4488 if (rw & PM_read)
4489 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4490 emit_insn (fn (tmp, rw, fs, data, vector));
4491 if (rw & PM_write)
4492 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4493 if (fs)
4495 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4496 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4497 emit_insn (gen_br_true_uni (pred, label));
4498 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4499 if (fini)
4500 emit_insn (fini);
4501 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4503 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4504 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4505 rtx cpy = get_insns ();
4506 end_sequence ();
4507 insn = emit_insn_after (cpy, insn);
4510 if (!is_call)
4511 /* Copy live registers. */
4512 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4514 rtx reg = regno_reg_rtx[ix];
4516 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4518 rtx bcast = fn (reg, rw, 0, data, vector);
4520 insn = emit_insn_after (bcast, insn);
4521 empty = false;
4524 return empty;
4527 /* Worker for nvptx_warp_propagate. */
4529 static rtx
4530 warp_prop_gen (rtx reg, propagate_mask pm,
4531 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4532 bool ARG_UNUSED (vector))
4534 if (!(pm & PM_read_write))
4535 return 0;
4537 return nvptx_gen_warp_bcast (reg);
4540 /* Propagate state that is live at start of BLOCK across the vectors
4541 of a single warp. Propagation is inserted just after INSN.
4542 IS_CALL and return as for nvptx_propagate. */
4544 static bool
4545 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4547 return nvptx_propagate (is_call, block, insn, PM_read_write,
4548 warp_prop_gen, 0, false);
4551 /* Worker for nvptx_shared_propagate. */
4553 static rtx
4554 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4555 bool vector)
4557 broadcast_data_t *data = (broadcast_data_t *)data_;
4559 if (pm & PM_loop_begin)
4561 /* Starting a loop, initialize pointer. */
4562 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4564 oacc_bcast_align = MAX (oacc_bcast_align, align);
4565 data->offset = ROUND_UP (data->offset, align);
4567 data->ptr = gen_reg_rtx (Pmode);
4569 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4571 else if (pm & PM_loop_end)
4573 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4574 data->ptr = NULL_RTX;
4575 return clobber;
4577 else
4578 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4581 /* Spill or fill live state that is live at start of BLOCK. PRE_P
4582 indicates if this is just before partitioned mode (do spill), or
4583 just after it starts (do fill). Sequence is inserted just after
4584 INSN. IS_CALL and return as for nvptx_propagate. */
4586 static bool
4587 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4588 rtx_insn *insn, bool vector)
4590 broadcast_data_t data;
4592 data.base = gen_reg_rtx (Pmode);
4593 data.offset = 0;
4594 data.ptr = NULL_RTX;
4596 bool empty = nvptx_propagate (is_call, block, insn,
4597 pre_p ? PM_read : PM_write, shared_prop_gen,
4598 &data, vector);
4599 gcc_assert (empty == !data.offset);
4600 if (data.offset)
4602 rtx bcast_sym = oacc_bcast_sym;
4604 /* Stuff was emitted, initialize the base pointer now. */
4605 if (vector && nvptx_mach_max_workers () > 1)
4607 if (!cfun->machine->bcast_partition)
4609 /* It would be nice to place this register in
4610 DATA_AREA_SHARED. */
4611 cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4613 if (!cfun->machine->sync_bar)
4614 cfun->machine->sync_bar = gen_reg_rtx (SImode);
4616 bcast_sym = cfun->machine->bcast_partition;
4619 rtx init = gen_rtx_SET (data.base, bcast_sym);
4620 emit_insn_after (init, insn);
4622 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4623 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4624 ? nvptx_mach_max_workers () + 1
4625 : 1);
4627 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4628 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4630 return empty;
4633 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4634 which is an integer or a register. THREADS is the number of threads
4635 controlled by the barrier. */
4637 static rtx
4638 nvptx_cta_sync (rtx lock, int threads)
4640 return gen_nvptx_barsync (lock, GEN_INT (threads));
4643 #if WORKAROUND_PTXJIT_BUG
4644 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4645 real insns. */
4647 static rtx_insn *
4648 bb_first_real_insn (basic_block bb)
4650 rtx_insn *insn;
4652 /* Find first real insn in BB. */
4653 FOR_BB_INSNS (bb, insn)
4654 if (INSN_P (insn))
4655 return insn;
4657 return 0;
4659 #endif
4661 /* Return true if INSN needs neutering. */
4663 static bool
4664 needs_neutering_p (rtx_insn *insn)
4666 if (!INSN_P (insn))
4667 return false;
4669 switch (recog_memoized (insn))
4671 case CODE_FOR_nvptx_fork:
4672 case CODE_FOR_nvptx_forked:
4673 case CODE_FOR_nvptx_joining:
4674 case CODE_FOR_nvptx_join:
4675 case CODE_FOR_nvptx_barsync:
4676 return false;
4677 default:
4678 return true;
4682 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4684 static bool
4685 verify_neutering_jumps (basic_block from,
4686 rtx_insn *vector_jump, rtx_insn *worker_jump,
4687 rtx_insn *vector_label, rtx_insn *worker_label)
4689 basic_block bb = from;
4690 rtx_insn *insn = BB_HEAD (bb);
4691 bool seen_worker_jump = false;
4692 bool seen_vector_jump = false;
4693 bool seen_worker_label = false;
4694 bool seen_vector_label = false;
4695 bool worker_neutered = false;
4696 bool vector_neutered = false;
4697 while (true)
4699 if (insn == worker_jump)
4701 seen_worker_jump = true;
4702 worker_neutered = true;
4703 gcc_assert (!vector_neutered);
4705 else if (insn == vector_jump)
4707 seen_vector_jump = true;
4708 vector_neutered = true;
4710 else if (insn == worker_label)
4712 seen_worker_label = true;
4713 gcc_assert (worker_neutered);
4714 worker_neutered = false;
4716 else if (insn == vector_label)
4718 seen_vector_label = true;
4719 gcc_assert (vector_neutered);
4720 vector_neutered = false;
4722 else if (INSN_P (insn))
4723 switch (recog_memoized (insn))
4725 case CODE_FOR_nvptx_barsync:
4726 gcc_assert (!vector_neutered && !worker_neutered);
4727 break;
4728 default:
4729 break;
4732 if (insn != BB_END (bb))
4733 insn = NEXT_INSN (insn);
4734 else if (JUMP_P (insn) && single_succ_p (bb)
4735 && !seen_vector_jump && !seen_worker_jump)
4737 bb = single_succ (bb);
4738 insn = BB_HEAD (bb);
4740 else
4741 break;
4744 gcc_assert (!(vector_jump && !seen_vector_jump));
4745 gcc_assert (!(worker_jump && !seen_worker_jump));
4747 if (seen_vector_label || seen_worker_label)
4749 gcc_assert (!(vector_label && !seen_vector_label));
4750 gcc_assert (!(worker_label && !seen_worker_label));
4752 return true;
4755 return false;
4758 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4760 static void
4761 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4762 rtx_insn *worker_label)
4764 basic_block bb = to;
4765 rtx_insn *insn = BB_END (bb);
4766 bool seen_worker_label = false;
4767 bool seen_vector_label = false;
4768 while (true)
4770 if (insn == worker_label)
4772 seen_worker_label = true;
4773 gcc_assert (!seen_vector_label);
4775 else if (insn == vector_label)
4776 seen_vector_label = true;
4777 else if (INSN_P (insn))
4778 switch (recog_memoized (insn))
4780 case CODE_FOR_nvptx_barsync:
4781 gcc_assert (!seen_vector_label && !seen_worker_label);
4782 break;
4785 if (insn != BB_HEAD (bb))
4786 insn = PREV_INSN (insn);
4787 else
4788 break;
4791 gcc_assert (!(vector_label && !seen_vector_label));
4792 gcc_assert (!(worker_label && !seen_worker_label));
4795 /* Single neutering according to MASK. FROM is the incoming block and
4796 TO is the outgoing block. These may be the same block. Insert at
4797 start of FROM:
4799 if (tid.<axis>) goto end.
4801 and insert before ending branch of TO (if there is such an insn):
4803 end:
4804 <possibly-broadcast-cond>
4805 <branch>
4807 We currently only use different FROM and TO when skipping an entire
4808 loop. We could do more if we detected superblocks. */
4810 static void
4811 nvptx_single (unsigned mask, basic_block from, basic_block to)
4813 rtx_insn *head = BB_HEAD (from);
4814 rtx_insn *tail = BB_END (to);
4815 unsigned skip_mask = mask;
4817 while (true)
4819 /* Find first insn of from block. */
4820 while (head != BB_END (from) && !needs_neutering_p (head))
4821 head = NEXT_INSN (head);
4823 if (from == to)
4824 break;
4826 if (!(JUMP_P (head) && single_succ_p (from)))
4827 break;
4829 basic_block jump_target = single_succ (from);
4830 if (!single_pred_p (jump_target))
4831 break;
4833 from = jump_target;
4834 head = BB_HEAD (from);
4837 /* Find last insn of to block. */
4838 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4839 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4840 tail = PREV_INSN (tail);
4842 /* Detect if tail is a branch. */
4843 rtx tail_branch = NULL_RTX;
4844 rtx cond_branch = NULL_RTX;
4845 if (tail && INSN_P (tail))
4847 tail_branch = PATTERN (tail);
4848 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4849 tail_branch = NULL_RTX;
4850 else
4852 cond_branch = SET_SRC (tail_branch);
4853 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4854 cond_branch = NULL_RTX;
4858 if (tail == head)
4860 /* If this is empty, do nothing. */
4861 if (!head || !needs_neutering_p (head))
4862 return;
4864 if (cond_branch)
4866 /* If we're only doing vector single, there's no need to
4867 emit skip code because we'll not insert anything. */
4868 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4869 skip_mask = 0;
4871 else if (tail_branch)
4872 /* Block with only unconditional branch. Nothing to do. */
4873 return;
4876 /* Insert the vector test inside the worker test. */
4877 unsigned mode;
4878 rtx_insn *before = tail;
4879 rtx_insn *neuter_start = NULL;
4880 rtx_insn *worker_label = NULL, *vector_label = NULL;
4881 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4882 rtx_insn *warp_sync = NULL;
4883 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4884 if (GOMP_DIM_MASK (mode) & skip_mask)
4886 rtx_code_label *label = gen_label_rtx ();
4887 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4888 rtx_insn **mode_jump
4889 = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4890 rtx_insn **mode_label
4891 = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4893 if (!pred)
4895 pred = gen_reg_rtx (BImode);
4896 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4899 rtx br;
4900 if (mode == GOMP_DIM_VECTOR)
4901 br = gen_br_true (pred, label);
4902 else
4903 br = gen_br_true_uni (pred, label);
4904 if (neuter_start)
4905 neuter_start = emit_insn_after (br, neuter_start);
4906 else
4907 neuter_start = emit_insn_before (br, head);
4908 *mode_jump = neuter_start;
4910 LABEL_NUSES (label)++;
4911 rtx_insn *label_insn;
4912 if (tail_branch)
4914 label_insn = emit_label_before (label, before);
4915 if (mode == GOMP_DIM_VECTOR)
4917 if (TARGET_PTX_6_0)
4918 warp_sync = emit_insn_after (gen_nvptx_warpsync (),
4919 label_insn);
4920 else
4921 warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
4922 label_insn);
4924 before = label_insn;
4926 else
4928 label_insn = emit_label_after (label, tail);
4929 if (mode == GOMP_DIM_VECTOR)
4931 if (TARGET_PTX_6_0)
4932 warp_sync = emit_insn_after (gen_nvptx_warpsync (),
4933 label_insn);
4934 else
4935 warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
4936 label_insn);
4938 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4939 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4940 emit_insn_after (gen_exit (), label_insn);
4943 *mode_label = label_insn;
4946 /* Now deal with propagating the branch condition. */
4947 if (cond_branch)
4949 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4951 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4952 && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4954 /* Vector mode only, do a shuffle. */
4955 #if WORKAROUND_PTXJIT_BUG
4956 /* The branch condition %rcond is propagated like this:
4959 .reg .u32 %x;
4960 mov.u32 %x,%tid.x;
4961 setp.ne.u32 %rnotvzero,%x,0;
4964 @%rnotvzero bra Lskip;
4965 setp.<op>.<type> %rcond,op1,op2;
4966 Lskip:
4967 selp.u32 %rcondu32,1,0,%rcond;
4968 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4969 setp.ne.u32 %rcond,%rcondu32,0;
4971 There seems to be a bug in the ptx JIT compiler (observed at driver
4972 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4973 unless %rcond is initialized to something before 'bra Lskip'. The
4974 bug is not observed with ptxas from cuda 8.0.61.
4976 It is true that the code is non-trivial: at Lskip, %rcond is
4977 uninitialized in threads 1-31, and after the selp the same holds
4978 for %rcondu32. But shfl propagates the defined value in thread 0
4979 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4980 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4982 There is nothing in the PTX spec to suggest that this is wrong, or
4983 to explain why the extra initialization is needed. So, we classify
4984 it as a JIT bug, and the extra initialization as workaround:
4987 .reg .u32 %x;
4988 mov.u32 %x,%tid.x;
4989 setp.ne.u32 %rnotvzero,%x,0;
4992 +.reg .pred %rcond2;
4993 +setp.eq.u32 %rcond2, 1, 0;
4995 @%rnotvzero bra Lskip;
4996 setp.<op>.<type> %rcond,op1,op2;
4997 +mov.pred %rcond2, %rcond;
4998 Lskip:
4999 +mov.pred %rcond, %rcond2;
5000 selp.u32 %rcondu32,1,0,%rcond;
5001 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
5002 setp.ne.u32 %rcond,%rcondu32,0;
5004 rtx_insn *label = PREV_INSN (tail);
5005 if (label == warp_sync)
5006 label = PREV_INSN (label);
5007 gcc_assert (label && LABEL_P (label));
5008 rtx tmp = gen_reg_rtx (BImode);
5009 emit_insn_before (gen_movbi (tmp, const0_rtx),
5010 bb_first_real_insn (from));
5011 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
5012 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
5013 #endif
5014 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
5016 else
5018 /* Includes worker mode, do spill & fill. By construction
5019 we should never have worker mode only. */
5020 broadcast_data_t data;
5021 unsigned size = GET_MODE_SIZE (SImode);
5022 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
5023 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
5024 rtx barrier = GEN_INT (0);
5025 int threads = 0;
5027 data.base = oacc_bcast_sym;
5028 data.ptr = 0;
5030 bool use_partitioning_p = (vector && !worker
5031 && nvptx_mach_max_workers () > 1
5032 && cfun->machine->bcast_partition);
5033 if (use_partitioning_p)
5035 data.base = cfun->machine->bcast_partition;
5036 barrier = cfun->machine->sync_bar;
5037 threads = nvptx_mach_vector_length ();
5039 gcc_assert (data.base != NULL);
5040 gcc_assert (barrier);
5042 unsigned int psize = ROUND_UP (size, oacc_bcast_align);
5043 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
5044 ? nvptx_mach_max_workers () + 1
5045 : 1);
5047 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
5048 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
5050 data.offset = 0;
5051 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
5052 vector),
5053 before);
5055 /* Barrier so other workers can see the write. */
5056 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
5057 data.offset = 0;
5058 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
5059 vector),
5060 tail);
5061 /* This barrier is needed to avoid worker zero clobbering
5062 the broadcast buffer before all the other workers have
5063 had a chance to read this instance of it. */
5064 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
5067 extract_insn (tail);
5068 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
5069 UNSPEC_BR_UNIFIED);
5070 validate_change (tail, recog_data.operand_loc[0], unsp, false);
5073 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
5074 vector_label, worker_label);
5075 if (!seen_label)
5076 verify_neutering_labels (to, vector_label, worker_label);
5079 /* PAR is a parallel that is being skipped in its entirety according to
5080 MASK. Treat this as skipping a superblock starting at forked
5081 and ending at joining. */
5083 static void
5084 nvptx_skip_par (unsigned mask, parallel *par)
5086 basic_block tail = par->join_block;
5087 gcc_assert (tail->preds->length () == 1);
5089 basic_block pre_tail = (*tail->preds)[0]->src;
5090 gcc_assert (pre_tail->succs->length () == 1);
5092 nvptx_single (mask, par->forked_block, pre_tail);
5095 /* If PAR has a single inner parallel and PAR itself only contains
5096 empty entry and exit blocks, swallow the inner PAR. */
5098 static void
5099 nvptx_optimize_inner (parallel *par)
5101 parallel *inner = par->inner;
5103 /* We mustn't be the outer dummy par. */
5104 if (!par->mask)
5105 return;
5107 /* We must have a single inner par. */
5108 if (!inner || inner->next)
5109 return;
5111 /* We must only contain 2 blocks ourselves -- the head and tail of
5112 the inner par. */
5113 if (par->blocks.length () != 2)
5114 return;
5116 /* The two partitionings must be disjoint. As we only have vector and
5117 worker partitioning, this is sufficient to guarantee the pars
5118 have adjacent partitioning. */
5119 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
5120 /* This indicates malformed code generation. */
5121 return;
5123 /* The outer forked insn should be immediately followed by the inner
5124 fork insn. */
5125 rtx_insn *forked = par->forked_insn;
5126 rtx_insn *fork = BB_END (par->forked_block);
5128 if (NEXT_INSN (forked) != fork)
5129 return;
5130 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
5132 /* The outer joining insn must immediately follow the inner join
5133 insn. */
5134 rtx_insn *joining = par->joining_insn;
5135 rtx_insn *join = inner->join_insn;
5136 if (NEXT_INSN (join) != joining)
5137 return;
5139 /* Preconditions met. Swallow the inner par. */
5140 if (dump_file)
5141 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
5142 inner->mask, inner->forked_block->index,
5143 inner->join_block->index,
5144 par->mask, par->forked_block->index, par->join_block->index);
5146 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
5148 par->blocks.reserve (inner->blocks.length ());
5149 while (inner->blocks.length ())
5150 par->blocks.quick_push (inner->blocks.pop ());
5152 par->inner = inner->inner;
5153 inner->inner = NULL;
5155 delete inner;
5158 /* Process the parallel PAR and all its contained
5159 parallels. We do everything but the neutering. Return mask of
5160 partitioned modes used within this parallel. */
5162 static unsigned
5163 nvptx_process_pars (parallel *par)
5165 if (nvptx_optimize)
5166 nvptx_optimize_inner (par);
5168 unsigned inner_mask = par->mask;
5170 /* Do the inner parallels first. */
5171 if (par->inner)
5173 par->inner_mask = nvptx_process_pars (par->inner);
5174 inner_mask |= par->inner_mask;
5177 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
5178 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
5179 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
5180 && nvptx_mach_vector_length () > PTX_WARP_SIZE);
5182 if (worker || large_vector)
5184 nvptx_shared_propagate (false, is_call, par->forked_block,
5185 par->forked_insn, !worker);
5186 bool no_prop_p
5187 = nvptx_shared_propagate (true, is_call, par->forked_block,
5188 par->fork_insn, !worker);
5189 bool empty_loop_p
5190 = !is_call && (NEXT_INSN (par->forked_insn)
5191 && NEXT_INSN (par->forked_insn) == par->joining_insn);
5192 rtx barrier = GEN_INT (0);
5193 int threads = 0;
5195 if (!worker && cfun->machine->sync_bar)
5197 barrier = cfun->machine->sync_bar;
5198 threads = nvptx_mach_vector_length ();
5201 if (no_prop_p && empty_loop_p)
5203 else if (no_prop_p && is_call)
5205 else
5207 /* Insert begin and end synchronizations. */
5208 emit_insn_before (nvptx_cta_sync (barrier, threads),
5209 par->forked_insn);
5210 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
5213 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
5214 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
5216 /* Now do siblings. */
5217 if (par->next)
5218 inner_mask |= nvptx_process_pars (par->next);
5219 return inner_mask;
5222 /* Neuter the parallel described by PAR. We recurse in depth-first
5223 order. MODES are the partitioning of the execution and OUTER is
5224 the partitioning of the parallels we are contained in. */
5226 static void
5227 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
5229 unsigned me = (par->mask
5230 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
5231 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5232 unsigned skip_mask = 0, neuter_mask = 0;
5234 if (par->inner)
5235 nvptx_neuter_pars (par->inner, modes, outer | me);
5237 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
5239 if ((outer | me) & GOMP_DIM_MASK (mode))
5240 {} /* Mode is partitioned: no neutering. */
5241 else if (!(modes & GOMP_DIM_MASK (mode)))
5242 {} /* Mode is not used: nothing to do. */
5243 else if (par->inner_mask & GOMP_DIM_MASK (mode)
5244 || !par->forked_insn)
5245 /* Partitioned in inner parallels, or we're not partitioned
5246 at all: neuter individual blocks. */
5247 neuter_mask |= GOMP_DIM_MASK (mode);
5248 else if (!par->parent || !par->parent->forked_insn
5249 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
5250 /* Parent isn't a parallel, or already contains this partitioning:
5251 skip parallel at this level. */
5252 skip_mask |= GOMP_DIM_MASK (mode);
5253 else
5254 {} /* Parent will skip this parallel itself. */
5257 if (neuter_mask)
5259 int ix, len;
5261 if (nvptx_optimize)
5263 /* Neuter whole SESE regions. */
5264 bb_pair_vec_t regions;
5266 nvptx_find_sese (par->blocks, regions);
5267 len = regions.length ();
5268 for (ix = 0; ix != len; ix++)
5270 basic_block from = regions[ix].first;
5271 basic_block to = regions[ix].second;
5273 if (from)
5274 nvptx_single (neuter_mask, from, to);
5275 else
5276 gcc_assert (!to);
5279 else
5281 /* Neuter each BB individually. */
5282 len = par->blocks.length ();
5283 for (ix = 0; ix != len; ix++)
5285 basic_block block = par->blocks[ix];
5287 nvptx_single (neuter_mask, block, block);
5292 if (skip_mask)
5293 nvptx_skip_par (skip_mask, par);
5295 if (par->next)
5296 nvptx_neuter_pars (par->next, modes, outer);
5299 static void
5300 populate_offload_attrs (offload_attrs *oa)
5302 tree attr = oacc_get_fn_attrib (current_function_decl);
5303 tree dims = TREE_VALUE (attr);
5304 unsigned ix;
5306 oa->mask = 0;
5308 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
5310 tree t = TREE_VALUE (dims);
5311 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
5312 tree allowed = TREE_PURPOSE (dims);
5314 if (size != 1 && !(allowed && integer_zerop (allowed)))
5315 oa->mask |= GOMP_DIM_MASK (ix);
5317 switch (ix)
5319 case GOMP_DIM_GANG:
5320 oa->num_gangs = size;
5321 break;
5323 case GOMP_DIM_WORKER:
5324 oa->num_workers = size;
5325 break;
5327 case GOMP_DIM_VECTOR:
5328 oa->vector_length = size;
5329 break;
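/* Illustrative note (sizes invented): for an offloaded function whose
   "oacc function" attribute carries the dims list (32, 4, 128), the
   loop above sets num_gangs = 32, num_workers = 4 and vector_length
   = 128, and sets the GOMP_DIM mask bit for every axis whose size is
   not 1 (unless the axis is explicitly forbidden via TREE_PURPOSE). */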
5334 #if WORKAROUND_PTXJIT_BUG_2
5335 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
5336 is needed in the nvptx target because the branches generated for
5337 partitioning are NONJUMP_INSN_P, not JUMP_P. */
5339 static rtx
5340 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
5342 rtx pat;
5343 if ((strict && !JUMP_P (insn))
5344 || (!strict && !INSN_P (insn)))
5345 return NULL_RTX;
5346 pat = PATTERN (insn);
5348 /* The set is allowed to appear either as the insn pattern or
5349 the first set in a PARALLEL. */
5350 if (GET_CODE (pat) == PARALLEL)
5351 pat = XVECEXP (pat, 0, 0);
5352 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
5353 return pat;
5355 return NULL_RTX;
5358 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
5360 static rtx
5361 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
5363 rtx x = nvptx_pc_set (insn, strict);
5365 if (!x)
5366 return NULL_RTX;
5367 x = SET_SRC (x);
5368 if (GET_CODE (x) == LABEL_REF)
5369 return x;
5370 if (GET_CODE (x) != IF_THEN_ELSE)
5371 return NULL_RTX;
5372 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
5373 return XEXP (x, 1);
5374 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
5375 return XEXP (x, 2);
5376 return NULL_RTX;
5379 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
5380 insn in between the branch and the label. This works around a JIT bug
5381 observed at driver version 384.111, at -O0 for sm_50. */
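/* An invented PTX sketch of the situation being patched:

	@%p0 bra $L1;
   $L1:

   The loop below notices that only notes and fork/join markers sit
   between the branch and its target label, and emits a fake nop just
   before the label so the branch no longer jumps around nothing.  */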
5383 static void
5384 prevent_branch_around_nothing (void)
5386 rtx_insn *seen_label = NULL;
5387 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5389 if (INSN_P (insn) && condjump_p (insn))
5391 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
5392 continue;
5395 if (seen_label == NULL)
5396 continue;
5398 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5399 continue;
5401 if (INSN_P (insn))
5402 switch (recog_memoized (insn))
5404 case CODE_FOR_nvptx_fork:
5405 case CODE_FOR_nvptx_forked:
5406 case CODE_FOR_nvptx_joining:
5407 case CODE_FOR_nvptx_join:
5408 case CODE_FOR_nop:
5409 continue;
5410 case -1:
5411 /* Handle asm ("") and similar. */
5412 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
5413 || GET_CODE (PATTERN (insn)) == ASM_OPERANDS
5414 || (GET_CODE (PATTERN (insn)) == PARALLEL
5415 && asm_noperands (PATTERN (insn)) >= 0))
5416 continue;
5417 /* FALLTHROUGH. */
5418 default:
5419 seen_label = NULL;
5420 continue;
5423 if (LABEL_P (insn) && insn == seen_label)
5424 emit_insn_before (gen_fake_nop (), insn);
5426 seen_label = NULL;
5429 #endif
5431 #ifdef WORKAROUND_PTXJIT_BUG_3
5432 /* Insert two membar.cta insns in between two subsequent bar.sync insns. This
5433 works around a hang observed at driver version 390.48 for sm_50. */
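/* An invented PTX sketch of the transformation: two back-to-back

	bar.sync 0;
	bar.sync 0;

   become

	bar.sync 0;
	membar.cta;
	membar.cta;
	bar.sync 0;

   The membars are only inserted when nothing but notes, debug insns
   and fork/join markers separate the two bar.sync insns.  */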
5435 static void
5436 workaround_barsyncs (void)
5438 bool seen_barsync = false;
5439 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5441 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
5443 if (seen_barsync)
5445 emit_insn_before (gen_nvptx_membar_cta (), insn);
5446 emit_insn_before (gen_nvptx_membar_cta (), insn);
5449 seen_barsync = true;
5450 continue;
5453 if (!seen_barsync)
5454 continue;
5456 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5457 continue;
5458 else if (INSN_P (insn))
5459 switch (recog_memoized (insn))
5461 case CODE_FOR_nvptx_fork:
5462 case CODE_FOR_nvptx_forked:
5463 case CODE_FOR_nvptx_joining:
5464 case CODE_FOR_nvptx_join:
5465 continue;
5466 default:
5467 break;
5470 seen_barsync = false;
5473 #endif
5475 static rtx
5476 gen_comment (const char *s)
5478 const char *sep = " ";
5479 size_t len = strlen (ASM_COMMENT_START) + strlen (sep) + strlen (s) + 1;
5480 char *comment = (char *) alloca (len);
5481 snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s);
5482 return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment),
5483 DECL_SOURCE_LOCATION (cfun->decl));
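/* For example, gen_comment ("foo") yields an ASM_INPUT holding the
   string "// foo" (assuming ASM_COMMENT_START is "//" for PTX),
   tagged with the function's source location; the output machinery
   prints it verbatim, so it survives as a comment in the emitted
   assembly.  */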
5486 /* Initialize all declared regs at function entry.
5487 Advantage: Fool-proof.
5488 Disadvantage: Potentially creates a lot of long live ranges and adds a lot
5489 of insns. */
5491 static void
5492 workaround_uninit_method_1 (void)
5494 rtx_insn *first = get_insns ();
5495 rtx_insn *insert_here = NULL;
5497 for (int ix = LAST_VIRTUAL_REGISTER + 1; ix < max_reg_num (); ix++)
5499 rtx reg = regno_reg_rtx[ix];
5501 /* Skip undeclared registers. */
5502 if (reg == const0_rtx)
5503 continue;
5505 gcc_assert (CONST0_RTX (GET_MODE (reg)));
5507 start_sequence ();
5508 if (nvptx_comment && first != NULL)
5509 emit_insn (gen_comment ("Start: Added by -minit-regs=1"));
5510 emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5511 rtx_insn *inits = get_insns ();
5512 end_sequence ();
5514 if (dump_file && (dump_flags & TDF_DETAILS))
5515 for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
5516 fprintf (dump_file, "Default init of reg %u inserted: insn %u\n",
5517 ix, INSN_UID (init));
5519 if (first != NULL)
5521 insert_here = emit_insn_before (inits, first);
5522 first = NULL;
5524 else
5525 insert_here = emit_insn_after (inits, insert_here);
5528 if (nvptx_comment && insert_here != NULL)
5529 emit_insn_after (gen_comment ("End: Added by -minit-regs=1"), insert_here);
5532 /* Find uses of regs that are not defined on all incoming paths, and insert a
5533 corresponding def at function entry.
5534 Advantage: Simple.
5535 Disadvantage: Potentially creates long live ranges.
5536 May not catch all cases. F.i. a clobber cuts a live range in
5537 the compiler and may prevent entry_lr_in from being set for a
5538 reg, but the clobber does not translate to a ptx insn, so in
5539 ptx there still may be an uninitialized ptx reg. See f.i.
5540 gcc.c-torture/compile/20020926-1.c. */
5542 static void
5543 workaround_uninit_method_2 (void)
5545 auto_bitmap entry_pseudo_uninit;
5547 auto_bitmap not_pseudo;
5548 bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);
5550 bitmap entry_lr_in = DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5551 bitmap_and_compl (entry_pseudo_uninit, entry_lr_in, not_pseudo);
5554 rtx_insn *first = get_insns ();
5555 rtx_insn *insert_here = NULL;
5557 bitmap_iterator iterator;
5558 unsigned ix;
5559 EXECUTE_IF_SET_IN_BITMAP (entry_pseudo_uninit, 0, ix, iterator)
5561 rtx reg = regno_reg_rtx[ix];
5562 gcc_assert (CONST0_RTX (GET_MODE (reg)));
5564 start_sequence ();
5565 if (nvptx_comment && first != NULL)
5566 emit_insn (gen_comment ("Start: Added by -minit-regs=2:"));
5567 emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5568 rtx_insn *inits = get_insns ();
5569 end_sequence ();
5571 if (dump_file && (dump_flags & TDF_DETAILS))
5572 for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
5573 fprintf (dump_file, "Missing init of reg %u inserted: insn %u\n",
5574 ix, INSN_UID (init));
5576 if (first != NULL)
5578 insert_here = emit_insn_before (inits, first);
5579 first = NULL;
5581 else
5582 insert_here = emit_insn_after (inits, insert_here);
5585 if (nvptx_comment && insert_here != NULL)
5586 emit_insn_after (gen_comment ("End: Added by -minit-regs=2"), insert_here);
5589 /* Find uses of regs that are not defined on all incoming paths, and insert a
5590 corresponding def on those.
5591 Advantage: Doesn't create long live ranges.
5592 Disadvantage: More complex, and potentially also more defs. */
5594 static void
5595 workaround_uninit_method_3 (void)
5597 auto_bitmap not_pseudo;
5598 bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);
5600 basic_block bb;
5601 FOR_EACH_BB_FN (bb, cfun)
5603 if (single_pred_p (bb))
5604 continue;
5606 auto_bitmap bb_pseudo_uninit;
5607 bitmap_and_compl (bb_pseudo_uninit, DF_LIVE_IN (bb), DF_MIR_IN (bb));
5608 bitmap_and_compl_into (bb_pseudo_uninit, not_pseudo);
5610 bitmap_iterator iterator;
5611 unsigned ix;
5612 EXECUTE_IF_SET_IN_BITMAP (bb_pseudo_uninit, 0, ix, iterator)
5614 bool have_false = false;
5615 bool have_true = false;
5617 edge e;
5618 edge_iterator ei;
5619 FOR_EACH_EDGE (e, ei, bb->preds)
5621 if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
5622 have_true = true;
5623 else
5624 have_false = true;
5626 if (have_false ^ have_true)
5627 continue;
5629 FOR_EACH_EDGE (e, ei, bb->preds)
5631 if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
5632 continue;
5634 rtx reg = regno_reg_rtx[ix];
5635 gcc_assert (CONST0_RTX (GET_MODE (reg)));
5637 start_sequence ();
5638 emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5639 rtx_insn *inits = get_insns ();
5640 end_sequence ();
5642 if (dump_file && (dump_flags & TDF_DETAILS))
5643 for (rtx_insn *init = inits; init != NULL;
5644 init = NEXT_INSN (init))
5645 fprintf (dump_file,
5646 "Missing init of reg %u inserted on edge: %d -> %d:"
5647 " insn %u\n", ix, e->src->index, e->dest->index,
5648 INSN_UID (init));
5650 insert_insn_on_edge (inits, e);
5655 if (nvptx_comment)
5656 FOR_EACH_BB_FN (bb, cfun)
5658 if (single_pred_p (bb))
5659 continue;
5661 edge e;
5662 edge_iterator ei;
5663 FOR_EACH_EDGE (e, ei, bb->preds)
5665 if (e->insns.r == NULL_RTX)
5666 continue;
5667 start_sequence ();
5668 emit_insn (gen_comment ("Start: Added by -minit-regs=3:"));
5669 emit_insn (e->insns.r);
5670 emit_insn (gen_comment ("End: Added by -minit-regs=3:"));
5671 e->insns.r = get_insns ();
5672 end_sequence ();
5676 commit_edge_insertions ();
5679 static void
5680 workaround_uninit (void)
5682 switch (nvptx_init_regs)
5684 case 0:
5685 /* Skip. */
5686 break;
5687 case 1:
5688 workaround_uninit_method_1 ();
5689 break;
5690 case 2:
5691 workaround_uninit_method_2 ();
5692 break;
5693 case 3:
5694 workaround_uninit_method_3 ();
5695 break;
5696 default:
5697 gcc_unreachable ();
5701 /* PTX-specific reorganization
5702 - Split blocks at fork and join instructions
5703 - Compute live registers
5704 - Mark now-unused registers, so function begin doesn't declare
5705 unused registers.
5706 - Insert state propagation when entering partitioned mode
5707 - Insert neutering instructions when in single mode
5708 - Replace subregs with suitable sequences.
5711 static void
5712 nvptx_reorg (void)
5714 /* We are freeing block_for_insn in the toplev to keep compatibility
5715 with old MDEP_REORGS that are not CFG based. Recompute it now. */
5716 compute_bb_for_insn ();
5718 thread_prologue_and_epilogue_insns ();
5720 /* Split blocks and record interesting unspecs. */
5721 bb_insn_map_t bb_insn_map;
5723 nvptx_split_blocks (&bb_insn_map);
5725 /* Compute live regs */
5726 df_clear_flags (DF_LR_RUN_DCE);
5727 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
5728 df_live_add_problem ();
5729 df_live_set_all_dirty ();
5730 if (nvptx_init_regs == 3)
5731 df_mir_add_problem ();
5732 df_analyze ();
5733 regstat_init_n_sets_and_refs ();
5735 if (dump_file)
5736 df_dump (dump_file);
5738 /* Mark unused regs as unused. */
5739 int max_regs = max_reg_num ();
5740 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
5741 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
5742 regno_reg_rtx[i] = const0_rtx;
5744 workaround_uninit ();
5746 /* Determine launch dimensions of the function. If it is not an
5747 offloaded function (i.e. this is a regular compiler), the
5748 function has no neutering. */
5749 tree attr = oacc_get_fn_attrib (current_function_decl);
5750 if (attr)
5752 /* If we determined this mask before RTL expansion, we could
5753 elide emission of some levels of forks and joins. */
5754 offload_attrs oa;
5756 populate_offload_attrs (&oa);
5758 /* If there is worker neutering, there must be vector
5759 neutering. Otherwise the hardware will fail. */
5760 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5761 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5763 /* Discover & process partitioned regions. */
5764 parallel *pars = nvptx_discover_pars (&bb_insn_map);
5765 nvptx_process_pars (pars);
5766 nvptx_neuter_pars (pars, oa.mask, 0);
5767 delete pars;
5770 /* Replace subregs. */
5771 nvptx_reorg_subreg ();
5773 if (TARGET_UNIFORM_SIMT)
5774 nvptx_reorg_uniform_simt ();
5776 #if WORKAROUND_PTXJIT_BUG_2
5777 prevent_branch_around_nothing ();
5778 #endif
5780 #ifdef WORKAROUND_PTXJIT_BUG_3
5781 workaround_barsyncs ();
5782 #endif
5784 regstat_free_n_sets_and_refs ();
5786 df_finish_pass (true);
5789 /* Handle a "kernel" attribute; arguments as in
5790 struct attribute_spec.handler. */
5792 static tree
5793 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5794 int ARG_UNUSED (flags), bool *no_add_attrs)
5796 tree decl = *node;
5798 if (TREE_CODE (decl) != FUNCTION_DECL)
5800 error ("%qE attribute only applies to functions", name);
5801 *no_add_attrs = true;
5803 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5805 error ("%qE attribute requires a void return type", name);
5806 *no_add_attrs = true;
5809 return NULL_TREE;
5812 /* Handle a "shared" attribute; arguments as in
5813 struct attribute_spec.handler. */
5815 static tree
5816 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5817 int ARG_UNUSED (flags), bool *no_add_attrs)
5819 tree decl = *node;
5821 if (TREE_CODE (decl) != VAR_DECL)
5823 error ("%qE attribute only applies to variables", name);
5824 *no_add_attrs = true;
5826 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5828 error ("%qE attribute not allowed with auto storage class", name);
5829 *no_add_attrs = true;
5832 return NULL_TREE;
5835 /* Table of valid machine attributes. */
5836 static const struct attribute_spec nvptx_attribute_table[] =
5838 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5839 affects_type_identity, handler, exclude } */
5840 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5841 NULL },
5842 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5843 NULL },
5844 { NULL, 0, 0, false, false, false, false, NULL, NULL }
5847 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
5849 static HOST_WIDE_INT
5850 nvptx_vector_alignment (const_tree type)
5852 unsigned HOST_WIDE_INT align;
5853 tree size = TYPE_SIZE (type);
5855 /* Ensure align is not bigger than BIGGEST_ALIGNMENT. */
5856 if (tree_fits_uhwi_p (size))
5858 align = tree_to_uhwi (size);
5859 align = MIN (align, BIGGEST_ALIGNMENT);
5861 else
5862 align = BIGGEST_ALIGNMENT;
5864 /* Ensure align is not smaller than mode alignment. */
5865 align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type)));
5867 return align;
5870 /* Indicate that INSN cannot be duplicated. */
5872 static bool
5873 nvptx_cannot_copy_insn_p (rtx_insn *insn)
5875 switch (recog_memoized (insn))
5877 case CODE_FOR_nvptx_shufflesi:
5878 case CODE_FOR_nvptx_shufflesf:
5879 case CODE_FOR_nvptx_barsync:
5880 case CODE_FOR_nvptx_fork:
5881 case CODE_FOR_nvptx_forked:
5882 case CODE_FOR_nvptx_joining:
5883 case CODE_FOR_nvptx_join:
5884 return true;
5885 default:
5886 return false;
5890 /* Section anchors do not work. Initialization for flag_section_anchor
5891 probes the existence of the anchoring target hooks and prevents
5892 anchoring if they don't exist. However, we may be being used with
5893 a host-side compiler that does support anchoring, and hence see
5894 the anchor flag set (as it's not recalculated). So provide an
5895 implementation denying anchoring. */
5897 static bool
5898 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5900 return false;
5903 /* Record a symbol for mkoffload to enter into the mapping table. */
5905 static void
5906 nvptx_record_offload_symbol (tree decl)
5908 switch (TREE_CODE (decl))
5910 case VAR_DECL:
5911 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5912 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5913 break;
5915 case FUNCTION_DECL:
5917 tree attr = oacc_get_fn_attrib (decl);
5918 /* OpenMP offloading does not set this attribute. */
5919 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5921 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5922 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5924 for (; dims; dims = TREE_CHAIN (dims))
5926 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5928 gcc_assert (!TREE_PURPOSE (dims));
5929 fprintf (asm_out_file, ", %#x", size);
5932 fprintf (asm_out_file, "\n");
5934 break;
5936 default:
5937 gcc_unreachable ();
5941 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5942 at the start of a file. */
5944 static void
5945 nvptx_file_start (void)
5947 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5949 fputs ("\t.version\t", asm_out_file);
5950 fputs (ptx_version_to_string ((enum ptx_version)ptx_version_option),
5951 asm_out_file);
5952 fputs ("\n", asm_out_file);
5954 fputs ("\t.target\tsm_", asm_out_file);
5955 fputs (sm_version_to_string ((enum ptx_isa)ptx_isa_option),
5956 asm_out_file);
5957 fputs ("\n", asm_out_file);
5959 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5961 fputs ("// END PREAMBLE\n", asm_out_file);
5964 /* Emit a declaration for a worker and vector-level buffer in .shared
5965 memory. */
5967 static void
5968 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5970 const char *name = XSTR (sym, 0);
5972 write_var_marker (file, true, false, name);
5973 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5974 align, name, size);
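/* For example, a 256-byte buffer with 8-byte alignment comes out as
   (symbol name illustrative):

	.shared .align 8 .u8 __oacc_bcast[256];  */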
5977 /* Write out the function declarations we've collected and declare storage
5978 for the broadcast buffer. */
5980 static void
5981 nvptx_file_end (void)
5983 hash_table<tree_hasher>::iterator iter;
5984 tree decl;
5985 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5986 nvptx_record_fndecl (decl);
5987 fputs (func_decls.str().c_str(), asm_out_file);
5989 if (oacc_bcast_size)
5990 write_shared_buffer (asm_out_file, oacc_bcast_sym,
5991 oacc_bcast_align, oacc_bcast_size);
5993 if (worker_red_size)
5994 write_shared_buffer (asm_out_file, worker_red_sym,
5995 worker_red_align, worker_red_size);
5997 if (vector_red_size)
5998 write_shared_buffer (asm_out_file, vector_red_sym,
5999 vector_red_align, vector_red_size);
6001 if (gang_private_shared_size)
6002 write_shared_buffer (asm_out_file, gang_private_shared_sym,
6003 gang_private_shared_align, gang_private_shared_size);
6005 if (need_softstack_decl)
6007 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
6008 /* 32 is the maximum number of warps in a block. Even though it's an
6009 external declaration, emit the array size explicitly; otherwise, it
6010 may fail at PTX JIT time if the definition is later in link order. */
6011 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
6012 POINTER_SIZE);
6014 if (need_unisimt_decl)
6016 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
6017 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
6021 /* Expander for the shuffle builtins. */
6023 static rtx
6024 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
6026 if (ignore)
6027 return target;
6029 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
6030 NULL_RTX, mode, EXPAND_NORMAL);
6031 if (!REG_P (src))
6032 src = copy_to_mode_reg (mode, src);
6034 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
6035 NULL_RTX, SImode, EXPAND_NORMAL);
6036 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
6037 NULL_RTX, SImode, EXPAND_NORMAL);
6039 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
6040 idx = copy_to_mode_reg (SImode, idx);
6042 rtx pat = nvptx_gen_shuffle (target, src, idx,
6043 (nvptx_shuffle_kind) INTVAL (op));
6044 if (pat)
6045 emit_insn (pat);
6047 return target;
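/* A hedged usage sketch of the builtin this expands (see
   nvptx_init_builtins below for the signature): in C,

       unsigned y = __builtin_nvptx_shuffle (x, 1, kind);

   where KIND is the numeric value of an nvptx_shuffle_kind enumerator;
   with the "down" kind this reads X from the lane one above the
   current lane.  X, Y and KIND are illustrative names.  */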
6050 const char *
6051 nvptx_output_red_partition (rtx dst, rtx offset)
6053 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
6054 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
6056 if (offset == const0_rtx)
6057 fprintf (asm_out_file, zero_offset, REGNO (dst),
6058 REGNO (cfun->machine->red_partition));
6059 else
6060 fprintf (asm_out_file, with_offset, REGNO (dst),
6061 REGNO (cfun->machine->red_partition), UINTVAL (offset));
6063 return "";
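/* E.g., for destination register %r36, partition base %r35 and byte
   offset 16 this prints (register numbers illustrative):

               add.u64 %r36, %r35, 16; // vred buffer

   while a zero offset emits a plain mov.u64 instead.  */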
6066 /* Shared-memory reduction address expander. */
6068 static rtx
6069 nvptx_expand_shared_addr (tree exp, rtx target,
6070 machine_mode ARG_UNUSED (mode), int ignore,
6071 int vector)
6073 if (ignore)
6074 return target;
6076 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
6077 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
6078 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
6079 rtx addr = worker_red_sym;
6081 if (vector)
6083 offload_attrs oa;
6085 populate_offload_attrs (&oa);
6087 unsigned int psize = ROUND_UP (size + offset, align);
6088 unsigned int pnum = nvptx_mach_max_workers ();
6089 vector_red_partition = MAX (vector_red_partition, psize);
6090 vector_red_size = MAX (vector_red_size, psize * pnum);
6091 vector_red_align = MAX (vector_red_align, align);
6093 if (cfun->machine->red_partition == NULL)
6094 cfun->machine->red_partition = gen_reg_rtx (Pmode);
6096 addr = gen_reg_rtx (Pmode);
6097 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
6099 else
6101 worker_red_align = MAX (worker_red_align, align);
6102 worker_red_size = MAX (worker_red_size, size + offset);
6104 if (offset)
6106 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
6107 addr = gen_rtx_CONST (Pmode, addr);
6111 emit_move_insn (target, addr);
6112 return target;
6115 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
6116 not require taking the address of any object, other than the memory
6117 cell being operated on. */
6119 static rtx
6120 nvptx_expand_cmp_swap (tree exp, rtx target,
6121 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
6123 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
6125 if (!target)
6126 target = gen_reg_rtx (mode);
6128 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
6129 NULL_RTX, Pmode, EXPAND_NORMAL);
6130 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
6131 NULL_RTX, mode, EXPAND_NORMAL);
6132 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
6133 NULL_RTX, mode, EXPAND_NORMAL);
6134 rtx pat;
6136 mem = gen_rtx_MEM (mode, mem);
6137 if (!REG_P (cmp))
6138 cmp = copy_to_mode_reg (mode, cmp);
6139 if (!REG_P (src))
6140 src = copy_to_mode_reg (mode, src);
6142 if (mode == SImode)
6143 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
6144 else
6145 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
6147 emit_insn (pat);
6149 return target;
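/* A hedged usage sketch (names illustrative; see nvptx_init_builtins
   below for the actual signature): in C,

       unsigned old = __builtin_nvptx_cmp_swap (ptr, expected, desired);

   stores DESIRED into *PTR only if *PTR equalled EXPECTED, and in
   either case returns the previous contents of *PTR.  */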
6153 /* Codes for all the NVPTX builtins. */
6154 enum nvptx_builtins
6156 NVPTX_BUILTIN_SHUFFLE,
6157 NVPTX_BUILTIN_SHUFFLELL,
6158 NVPTX_BUILTIN_WORKER_ADDR,
6159 NVPTX_BUILTIN_VECTOR_ADDR,
6160 NVPTX_BUILTIN_CMP_SWAP,
6161 NVPTX_BUILTIN_CMP_SWAPLL,
6162 NVPTX_BUILTIN_MEMBAR_GL,
6163 NVPTX_BUILTIN_MEMBAR_CTA,
6164 NVPTX_BUILTIN_BAR_RED_AND,
6165 NVPTX_BUILTIN_BAR_RED_OR,
6166 NVPTX_BUILTIN_BAR_RED_POPC,
6167 NVPTX_BUILTIN_MAX
6170 /* Expander for 'bar.red' instruction builtins. */
6172 static rtx
6173 nvptx_expand_bar_red (tree exp, rtx target,
6174 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
6176 int code = DECL_MD_FUNCTION_CODE (TREE_OPERAND (CALL_EXPR_FN (exp), 0));
6177 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
6179 if (!target)
6180 target = gen_reg_rtx (mode);
6182 rtx pred, dst;
6183 rtx bar = expand_expr (CALL_EXPR_ARG (exp, 0),
6184 NULL_RTX, SImode, EXPAND_NORMAL);
6185 rtx nthr = expand_expr (CALL_EXPR_ARG (exp, 1),
6186 NULL_RTX, SImode, EXPAND_NORMAL);
6187 rtx cpl = expand_expr (CALL_EXPR_ARG (exp, 2),
6188 NULL_RTX, SImode, EXPAND_NORMAL);
6189 rtx redop = expand_expr (CALL_EXPR_ARG (exp, 3),
6190 NULL_RTX, SImode, EXPAND_NORMAL);
6191 if (CONST_INT_P (bar))
6193 if (INTVAL (bar) < 0 || INTVAL (bar) > 15)
6195 error_at (EXPR_LOCATION (exp),
6196 "barrier value must be within [0,15]");
6197 return const0_rtx;
6200 else if (!REG_P (bar))
6201 bar = copy_to_mode_reg (SImode, bar);
6203 if (!CONST_INT_P (nthr) && !REG_P (nthr))
6204 nthr = copy_to_mode_reg (SImode, nthr);
6206 if (!CONST_INT_P (cpl))
6208 error_at (EXPR_LOCATION (exp),
6209 "complement argument must be constant");
6210 return const0_rtx;
6213 pred = gen_reg_rtx (BImode);
6214 if (!REG_P (redop))
6215 redop = copy_to_mode_reg (SImode, redop);
6216 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, redop, GEN_INT (0))));
6217 redop = pred;
6219 rtx pat;
6220 switch (code)
6222 case NVPTX_BUILTIN_BAR_RED_AND:
6223 dst = gen_reg_rtx (BImode);
6224 pat = gen_nvptx_barred_and (dst, bar, nthr, cpl, redop);
6225 break;
6226 case NVPTX_BUILTIN_BAR_RED_OR:
6227 dst = gen_reg_rtx (BImode);
6228 pat = gen_nvptx_barred_or (dst, bar, nthr, cpl, redop);
6229 break;
6230 case NVPTX_BUILTIN_BAR_RED_POPC:
6231 dst = gen_reg_rtx (SImode);
6232 pat = gen_nvptx_barred_popc (dst, bar, nthr, cpl, redop);
6233 break;
6234 default:
6235 gcc_unreachable ();
6237 emit_insn (pat);
6238 if (GET_MODE (dst) == BImode)
6240 rtx tmp = gen_reg_rtx (mode);
6241 emit_insn (gen_rtx_SET (tmp, gen_rtx_NE (mode, dst, GEN_INT (0))));
6242 dst = tmp;
6244 emit_move_insn (target, dst);
6245 return target;
6248 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
6250 /* Return the NVPTX builtin for CODE. */
6252 static tree
6253 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
6255 if (code >= NVPTX_BUILTIN_MAX)
6256 return error_mark_node;
6258 return nvptx_builtin_decls[code];
6261 /* Set up all builtin functions for this target. */
6263 static void
6264 nvptx_init_builtins (void)
6266 #define DEF(ID, NAME, T) \
6267 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
6268 = add_builtin_function ("__builtin_nvptx_" NAME, \
6269 build_function_type_list T, \
6270 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
6271 #define ST sizetype
6272 #define UINT unsigned_type_node
6273 #define LLUINT long_long_unsigned_type_node
6274 #define PTRVOID ptr_type_node
6275 #define VOID void_type_node
6277 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
6278 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
6279 DEF (WORKER_ADDR, "worker_addr",
6280 (PTRVOID, ST, UINT, UINT, NULL_TREE));
6281 DEF (VECTOR_ADDR, "vector_addr",
6282 (PTRVOID, ST, UINT, UINT, NULL_TREE));
6283 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
6284 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
6285 DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE));
6286 DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE));
6288 DEF (BAR_RED_AND, "bar_red_and",
6289 (UINT, UINT, UINT, UINT, UINT, NULL_TREE));
6290 DEF (BAR_RED_OR, "bar_red_or",
6291 (UINT, UINT, UINT, UINT, UINT, NULL_TREE));
6292 DEF (BAR_RED_POPC, "bar_red_popc",
6293 (UINT, UINT, UINT, UINT, UINT, NULL_TREE));
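  /* In rough C terms, the definitions above provide (a sketch, with
     sizetype rendered as size_t):

         unsigned __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);
         unsigned long long __builtin_nvptx_shufflell
           (unsigned long long, unsigned, unsigned);
         void *__builtin_nvptx_worker_addr (size_t, unsigned, unsigned);
         void *__builtin_nvptx_vector_addr (size_t, unsigned, unsigned);
         unsigned __builtin_nvptx_cmp_swap (void *, unsigned, unsigned);
         unsigned long long __builtin_nvptx_cmp_swapll
           (void *, unsigned long long, unsigned long long);
         void __builtin_nvptx_membar_gl (void);
         void __builtin_nvptx_membar_cta (void);
         unsigned __builtin_nvptx_bar_red_and (unsigned, unsigned,
                                               unsigned, unsigned);

     with bar_red_or and bar_red_popc analogous to bar_red_and.  */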
6295 #undef DEF
6296 #undef ST
6297 #undef UINT
6298 #undef LLUINT
6299 #undef PTRVOID
6302 /* Expand an expression EXP that calls a built-in function,
6303 with result going to TARGET if that's convenient
6304 (and in mode MODE if that's convenient).
6305 SUBTARGET may be used as the target for computing one of EXP's operands.
6306 IGNORE is nonzero if the value is to be ignored. */
6308 static rtx
6309 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
6310 machine_mode mode, int ignore)
6312 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
6313 switch (DECL_MD_FUNCTION_CODE (fndecl))
6315 case NVPTX_BUILTIN_SHUFFLE:
6316 case NVPTX_BUILTIN_SHUFFLELL:
6317 return nvptx_expand_shuffle (exp, target, mode, ignore);
6319 case NVPTX_BUILTIN_WORKER_ADDR:
6320 return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
6322 case NVPTX_BUILTIN_VECTOR_ADDR:
6323 return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
6325 case NVPTX_BUILTIN_CMP_SWAP:
6326 case NVPTX_BUILTIN_CMP_SWAPLL:
6327 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
6329 case NVPTX_BUILTIN_MEMBAR_GL:
6330 emit_insn (gen_nvptx_membar_gl ());
6331 return NULL_RTX;
6333 case NVPTX_BUILTIN_MEMBAR_CTA:
6334 emit_insn (gen_nvptx_membar_cta ());
6335 return NULL_RTX;
6337 case NVPTX_BUILTIN_BAR_RED_AND:
6338 case NVPTX_BUILTIN_BAR_RED_OR:
6339 case NVPTX_BUILTIN_BAR_RED_POPC:
6340 return nvptx_expand_bar_red (exp, target, mode, ignore);
6342 default: gcc_unreachable ();
6346 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
6348 static int
6349 nvptx_simt_vf ()
6351 return PTX_WARP_SIZE;
6354 /* Return 1 if TRAIT NAME is present in the OpenMP context's
6355 device trait set, return 0 if not present in any OpenMP context in the
6356 whole translation unit, or -1 if not present in the current OpenMP context
6357 but might be present in another OpenMP context in the same TU. */
6360 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
6361 const char *name)
6363 switch (trait)
6365 case omp_device_kind:
6366 return strcmp (name, "gpu") == 0;
6367 case omp_device_arch:
6368 return strcmp (name, "nvptx") == 0;
6369 case omp_device_isa:
6370 #define NVPTX_SM(XX, SEP) \
6372 if (strcmp (name, "sm_" #XX) == 0) \
6373 return ptx_isa_option == PTX_ISA_SM ## XX; \
6375 #include "nvptx-sm.def"
6376 #undef NVPTX_SM
6377 return 0;
6378 default:
6379 gcc_unreachable ();
6383 static bool
6384 nvptx_welformed_vector_length_p (int l)
6386 gcc_assert (l > 0);
6387 return l % PTX_WARP_SIZE == 0;
6390 static void
6391 nvptx_apply_dim_limits (int dims[])
6393 /* Check that the vector_length is not too large. */
6394 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
6395 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
6397 /* Check that the number of workers is not too large. */
6398 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
6399 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
6401 /* Ensure that num_worker * vector_length <= cta size. */
6402 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
6403 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
6404 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6406 /* If we need a per-worker barrier (the vector length exceeds the warp size) ...  */
6407 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
6408 && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
6409 /* Don't use more barriers than available. */
6410 dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
6411 PTX_NUM_PER_WORKER_BARRIERS);
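/* Worked example (illustrative): with num_workers=32 and
   vector_length=64, the product 32*64=2048 exceeds PTX_CTA_SIZE (1024),
   so the vector length is reduced to PTX_WARP_SIZE (32).  Had the
   vector length remained above the warp size, per-worker barriers
   would be required, and the worker count would additionally be
   clamped to PTX_NUM_PER_WORKER_BARRIERS (15).  */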
6414 /* Return true if FNDECL contains calls to vector-partitionable routines. */
6416 static bool
6417 has_vector_partitionable_routine_calls_p (tree fndecl)
6419 if (!fndecl)
6420 return false;
6422 basic_block bb;
6423 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
6424 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
6425 gsi_next_nondebug (&i))
6427 gimple *stmt = gsi_stmt (i);
6428 if (gimple_code (stmt) != GIMPLE_CALL)
6429 continue;
6431 tree callee = gimple_call_fndecl (stmt);
6432 if (!callee)
6433 continue;
6435 tree attrs = oacc_get_fn_attrib (callee);
6436 if (attrs == NULL_TREE)
6437 return false;
6439 int partition_level = oacc_fn_attrib_level (attrs);
6440 bool seq_routine_p = partition_level == GOMP_DIM_MAX;
6441 if (!seq_routine_p)
6442 return true;
6445 return false;
6448 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
6449 DIMS has changed. */
6451 static void
6452 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
6454 bool oacc_default_dims_p = false;
6455 bool oacc_min_dims_p = false;
6456 bool offload_region_p = false;
6457 bool routine_p = false;
6458 bool routine_seq_p = false;
6459 int default_vector_length = -1;
6461 if (decl == NULL_TREE)
6463 if (fn_level == -1)
6464 oacc_default_dims_p = true;
6465 else if (fn_level == -2)
6466 oacc_min_dims_p = true;
6467 else
6468 gcc_unreachable ();
6470 else if (fn_level == -1)
6471 offload_region_p = true;
6472 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
6474 routine_p = true;
6475 routine_seq_p = fn_level == GOMP_DIM_MAX;
6477 else
6478 gcc_unreachable ();
6480 if (oacc_min_dims_p)
6482 gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
6483 gcc_assert (dims[GOMP_DIM_WORKER] == 1);
6484 gcc_assert (dims[GOMP_DIM_GANG] == 1);
6486 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6487 return;
6490 if (routine_p)
6492 if (!routine_seq_p)
6493 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6495 return;
6498 if (oacc_default_dims_p)
6500 /* -1 : not set
6501 0 : set at runtime, e.g. -fopenacc-dims=-
6502 >= 1: set at compile time, e.g. -fopenacc-dims=1. */
6503 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
6504 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
6505 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
6507 /* But -fopenacc-dims=- is not yet supported on trunk. */
6508 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
6509 gcc_assert (dims[GOMP_DIM_WORKER] != 0);
6510 gcc_assert (dims[GOMP_DIM_GANG] != 0);
6513 if (offload_region_p)
6515 /* -1 : not set
6516 0 : set using a variable, e.g. num_gangs (n)
6517 >= 1: set using a constant, e.g. num_gangs (1). */
6518 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
6519 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
6520 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
6523 if (offload_region_p)
6524 default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
6525 else
6526 /* oacc_default_dims_p. */
6527 default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
6529 int old_dims[GOMP_DIM_MAX];
6530 unsigned int i;
6531 for (i = 0; i < GOMP_DIM_MAX; ++i)
6532 old_dims[i] = dims[i];
6534 const char *vector_reason = NULL;
6535 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
6537 default_vector_length = PTX_WARP_SIZE;
6539 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
6541 vector_reason = G_("using %<vector_length (%d)%> due to call to"
6542 " vector-partitionable routine, ignoring %d");
6543 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6547 if (dims[GOMP_DIM_VECTOR] == 0)
6549 vector_reason = G_("using %<vector_length (%d)%>, ignoring runtime setting");
6550 dims[GOMP_DIM_VECTOR] = default_vector_length;
6553 if (dims[GOMP_DIM_VECTOR] > 0
6554 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
6555 dims[GOMP_DIM_VECTOR] = default_vector_length;
6557 nvptx_apply_dim_limits (dims);
6559 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
6560 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
6561 vector_reason != NULL
6562 ? vector_reason
6563 : G_("using %<vector_length (%d)%>, ignoring %d"),
6564 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
6566 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
6567 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
6568 G_("using %<num_workers (%d)%>, ignoring %d"),
6569 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
6571 if (oacc_default_dims_p)
6573 if (dims[GOMP_DIM_VECTOR] < 0)
6574 dims[GOMP_DIM_VECTOR] = default_vector_length;
6575 if (dims[GOMP_DIM_WORKER] < 0)
6576 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
6577 if (dims[GOMP_DIM_GANG] < 0)
6578 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
6579 nvptx_apply_dim_limits (dims);
6582 if (offload_region_p)
6584 for (i = 0; i < GOMP_DIM_MAX; i++)
6586 if (!(dims[i] < 0))
6587 continue;
6589 if ((used & GOMP_DIM_MASK (i)) == 0)
6590 /* Function oacc_validate_dims will apply the minimal dimension. */
6591 continue;
6593 dims[i] = (i == GOMP_DIM_VECTOR
6594 ? default_vector_length
6595 : oacc_get_default_dim (i));
6598 nvptx_apply_dim_limits (dims);
6602 /* Validate compute dimensions of an OpenACC offload or routine, fill
6603 in non-unity defaults. FN_LEVEL indicates the level at which a
6604 routine might spawn a loop. It is negative for non-routines. If
6605 DECL is null, we are validating the default dimensions. */
6607 static bool
6608 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
6610 int old_dims[GOMP_DIM_MAX];
6611 unsigned int i;
6613 for (i = 0; i < GOMP_DIM_MAX; ++i)
6614 old_dims[i] = dims[i];
6616 nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
6618 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
6619 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
6620 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
6622 for (i = 0; i < GOMP_DIM_MAX; ++i)
6623 if (old_dims[i] != dims[i])
6624 return true;
6626 return false;
6629 /* Return maximum dimension size, or zero for unbounded. */
6631 static int
6632 nvptx_dim_limit (int axis)
6634 switch (axis)
6636 case GOMP_DIM_VECTOR:
6637 return PTX_MAX_VECTOR_LENGTH;
6639 default:
6640 break;
6642 return 0;
6645 /* Determine whether fork & join operations are needed. */
6647 static bool
6648 nvptx_goacc_fork_join (gcall *call, const int dims[],
6649 bool ARG_UNUSED (is_fork))
6651 tree arg = gimple_call_arg (call, 2);
6652 unsigned axis = TREE_INT_CST_LOW (arg);
6654 /* We only care about worker and vector partitioning. */
6655 if (axis < GOMP_DIM_WORKER)
6656 return false;
6658 /* If the size is 1, there's no partitioning. */
6659 if (dims[axis] == 1)
6660 return false;
6662 return true;
6665 /* Generate a PTX builtin function call that returns the address in
6666 the worker reduction buffer at OFFSET. TYPE is the type of the
6667 data at that location. */
6669 static tree
6670 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
6672 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
6673 if (vector)
6674 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
6675 machine_mode mode = TYPE_MODE (type);
6676 tree fndecl = nvptx_builtin_decl (addr_dim, true);
6677 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
6678 tree align = build_int_cst (unsigned_type_node,
6679 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
6680 tree call = build_call_expr (fndecl, 3, offset, size, align);
6682 return fold_convert (build_pointer_type (type), call);
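/* E.g. (a sketch), for a 'double' at byte offset 8 in the worker
   buffer this builds roughly

       (double *) __builtin_nvptx_worker_addr (8, 8, 8)

   passing OFFSET, then the mode size and alignment in bytes.  */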
6685 /* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR. This function
6686 will cast the variable if necessary. */
6688 static void
6689 nvptx_generate_vector_shuffle (location_t loc,
6690 tree dest_var, tree var, unsigned shift,
6691 gimple_seq *seq)
6693 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
6694 tree_code code = NOP_EXPR;
6695 tree arg_type = unsigned_type_node;
6696 tree var_type = TREE_TYPE (var);
6697 tree dest_type = var_type;
6699 if (TREE_CODE (var_type) == COMPLEX_TYPE)
6700 var_type = TREE_TYPE (var_type);
6702 if (TREE_CODE (var_type) == REAL_TYPE)
6703 code = VIEW_CONVERT_EXPR;
6705 if (TYPE_SIZE (var_type)
6706 == TYPE_SIZE (long_long_unsigned_type_node))
6708 fn = NVPTX_BUILTIN_SHUFFLELL;
6709 arg_type = long_long_unsigned_type_node;
6712 tree call = nvptx_builtin_decl (fn, true);
6713 tree bits = build_int_cst (unsigned_type_node, shift);
6714 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
6715 tree expr;
6717 if (var_type != dest_type)
6719 /* Do real and imaginary parts separately. */
6720 tree real = fold_build1 (REALPART_EXPR, var_type, var);
6721 real = fold_build1 (code, arg_type, real);
6722 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
6723 real = fold_build1 (code, var_type, real);
6725 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
6726 imag = fold_build1 (code, arg_type, imag);
6727 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
6728 imag = fold_build1 (code, var_type, imag);
6730 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
6732 else
6734 expr = fold_build1 (code, arg_type, var);
6735 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
6736 expr = fold_build1 (code, dest_type, expr);
6739 gimplify_assign (dest_var, expr, seq);
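/* E.g., for a 'double' VAR this expands (sketch) to

       dest = VIEW_CONVERT (double,
                            __builtin_nvptx_shufflell
                              (VIEW_CONVERT (unsigned long long, var),
                               shift, SHUFFLE_DOWN));

   whereas complex values shuffle their real and imaginary parts
   separately.  */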
6742 /* Lazily generate the global lock var decl and return its address. */
6744 static tree
6745 nvptx_global_lock_addr ()
6747 tree v = global_lock_var;
6749 if (!v)
6751 tree name = get_identifier ("__reduction_lock");
6752 tree type = build_qualified_type (unsigned_type_node,
6753 TYPE_QUAL_VOLATILE);
6754 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
6755 global_lock_var = v;
6756 DECL_ARTIFICIAL (v) = 1;
6757 DECL_EXTERNAL (v) = 1;
6758 TREE_STATIC (v) = 1;
6759 TREE_PUBLIC (v) = 1;
6760 TREE_USED (v) = 1;
6761 mark_addressable (v);
6762 mark_decl_referenced (v);
6765 return build_fold_addr_expr (v);
6768 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
6769 GSI. We use a lockless scheme for nearly all cases, which looks
6770 like:
6771 actual = initval(OP);
6772 do {
6773 guess = actual;
6774 write = guess OP myval;
6775 actual = cmp&swap (ptr, guess, write)
6776 } while (actual bit-different-to guess);
6777 return write;
6779 This relies on a cmp&swap instruction, which is available for 32-
6780 and 64-bit types. Larger types must use a locking scheme. */
6782 static tree
6783 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
6784 tree ptr, tree var, tree_code op)
6786 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
6787 tree_code code = NOP_EXPR;
6788 tree arg_type = unsigned_type_node;
6789 tree var_type = TREE_TYPE (var);
6791 if (TREE_CODE (var_type) == COMPLEX_TYPE
6792 || TREE_CODE (var_type) == REAL_TYPE)
6793 code = VIEW_CONVERT_EXPR;
6795 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
6797 arg_type = long_long_unsigned_type_node;
6798 fn = NVPTX_BUILTIN_CMP_SWAPLL;
6801 tree swap_fn = nvptx_builtin_decl (fn, true);
6803 gimple_seq init_seq = NULL;
6804 tree init_var = make_ssa_name (arg_type);
6805 tree init_expr = omp_reduction_init_op (loc, op, var_type);
6806 init_expr = fold_build1 (code, arg_type, init_expr);
6807 gimplify_assign (init_var, init_expr, &init_seq);
6808 gimple *init_end = gimple_seq_last (init_seq);
6810 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
6812 /* Split the block just after the init stmts. */
6813 basic_block pre_bb = gsi_bb (*gsi);
6814 edge pre_edge = split_block (pre_bb, init_end);
6815 basic_block loop_bb = pre_edge->dest;
6816 pre_bb = pre_edge->src;
6817 /* Reset the iterator. */
6818 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6820 tree expect_var = make_ssa_name (arg_type);
6821 tree actual_var = make_ssa_name (arg_type);
6822 tree write_var = make_ssa_name (arg_type);
6824 /* Build and insert the reduction calculation. */
6825 gimple_seq red_seq = NULL;
6826 tree write_expr = fold_build1 (code, var_type, expect_var);
6827 write_expr = fold_build2 (op, var_type, write_expr, var);
6828 write_expr = fold_build1 (code, arg_type, write_expr);
6829 gimplify_assign (write_var, write_expr, &red_seq);
6831 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6833 /* Build & insert the cmp&swap sequence. */
6834 gimple_seq latch_seq = NULL;
6835 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
6836 ptr, expect_var, write_var);
6837 gimplify_assign (actual_var, swap_expr, &latch_seq);
6839 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
6840 NULL_TREE, NULL_TREE);
6841 gimple_seq_add_stmt (&latch_seq, cond);
6843 gimple *latch_end = gimple_seq_last (latch_seq);
6844 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
6846 /* Split the block just after the latch stmts. */
6847 edge post_edge = split_block (loop_bb, latch_end);
6848 basic_block post_bb = post_edge->dest;
6849 loop_bb = post_edge->src;
6850 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6852 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6853 post_edge->probability = profile_probability::even ();
6854 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
6855 loop_edge->probability = profile_probability::even ();
6856 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
6857 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
6859 gphi *phi = create_phi_node (expect_var, loop_bb);
6860 add_phi_arg (phi, init_var, pre_edge, loc);
6861 add_phi_arg (phi, actual_var, loop_edge, loc);
6863 loop *loop = alloc_loop ();
6864 loop->header = loop_bb;
6865 loop->latch = loop_bb;
6866 add_loop (loop, loop_bb->loop_father);
6868 return fold_build1 (code, var_type, write_var);
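/* A concrete sketch for a 64-bit floating-point PLUS reduction
   (an illustrative C rendering of the GIMPLE built above; "ull" and
   "initval" are shorthand):

       unsigned long long expect, write;
       unsigned long long actual = VIEW_CONVERT (ull, initval (PLUS));
       do
         {
           expect = actual;
           write = VIEW_CONVERT (ull,
                                 VIEW_CONVERT (double, expect) + var);
           actual = __builtin_nvptx_cmp_swapll (ptr, expect, write);
         }
       while (actual != expect);
       result = VIEW_CONVERT (double, write);
*/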
6871 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
6872 GSI. This is necessary for types larger than 64 bits, where there
6873 is no cmp&swap instruction to implement a lockless scheme. We use
6874 a lock variable in global memory.
6876 while (cmp&swap (&lock_var, 0, 1))
6877 continue;
6878 T accum = *ptr;
6879 accum = accum OP var;
6880 *ptr = accum;
6881 cmp&swap (&lock_var, 1, 0);
6882 return accum;
6884 A lock in global memory is necessary to force execution engine
6885 descheduling and avoid resource starvation that can occur if the
6886 lock is in .shared memory. */
6888 static tree
6889 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6890 tree ptr, tree var, tree_code op, int level)
6892 tree var_type = TREE_TYPE (var);
6893 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6894 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6895 tree uns_locked = build_int_cst (unsigned_type_node, 1);
6897 /* Split the block just before the gsi. Insert a gimple nop to make
6898 this easier. */
6899 gimple *nop = gimple_build_nop ();
6900 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6901 basic_block entry_bb = gsi_bb (*gsi);
6902 edge entry_edge = split_block (entry_bb, nop);
6903 basic_block lock_bb = entry_edge->dest;
6904 /* Reset the iterator. */
6905 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6907 /* Build and insert the locking sequence. */
6908 gimple_seq lock_seq = NULL;
6909 tree lock_var = make_ssa_name (unsigned_type_node);
6910 tree lock_expr = nvptx_global_lock_addr ();
6911 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6912 uns_unlocked, uns_locked);
6913 gimplify_assign (lock_var, lock_expr, &lock_seq);
6914 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6915 NULL_TREE, NULL_TREE);
6916 gimple_seq_add_stmt (&lock_seq, cond);
6917 gimple *lock_end = gimple_seq_last (lock_seq);
6918 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6920 /* Split the block just after the lock sequence. */
6921 edge locked_edge = split_block (lock_bb, lock_end);
6922 basic_block update_bb = locked_edge->dest;
6923 lock_bb = locked_edge->src;
6924 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6926 /* Create the lock loop ... */
6927 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6928 locked_edge->probability = profile_probability::even ();
6929 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6930 loop_edge->probability = profile_probability::even ();
6931 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6932 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6934 /* ... and the loop structure. */
6935 loop *lock_loop = alloc_loop ();
6936 lock_loop->header = lock_bb;
6937 lock_loop->latch = lock_bb;
6938 lock_loop->nb_iterations_estimate = 1;
6939 lock_loop->any_estimate = true;
6940 add_loop (lock_loop, entry_bb->loop_father);
6942 /* Build the pre-barrier. */
6943 gimple_seq red_seq = NULL;
6944 enum nvptx_builtins barrier_builtin
6945 = (level == GOMP_DIM_GANG
6946 ? NVPTX_BUILTIN_MEMBAR_GL
6947 : NVPTX_BUILTIN_MEMBAR_CTA);
6948 tree barrier_fn = nvptx_builtin_decl (barrier_builtin, true);
6949 tree barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
6950 gimplify_stmt (&barrier_expr, &red_seq);
6952 /* Build the reduction calculation. */
6953 tree acc_in = make_ssa_name (var_type);
6954 tree ref_in = build_simple_mem_ref (ptr);
6955 TREE_THIS_VOLATILE (ref_in) = 1;
6956 gimplify_assign (acc_in, ref_in, &red_seq);
6958 tree acc_out = make_ssa_name (var_type);
6959 tree update_expr = fold_build2 (op, var_type, ref_in, var);
6960 gimplify_assign (acc_out, update_expr, &red_seq);
6962 tree ref_out = build_simple_mem_ref (ptr);
6963 TREE_THIS_VOLATILE (ref_out) = 1;
6964 gimplify_assign (ref_out, acc_out, &red_seq);
6966 /* Build the post-barrier. */
6967 barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
6968 gimplify_stmt (&barrier_expr, &red_seq);
6970 /* Insert the reduction calculation. */
6971 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6973 /* Build & insert the unlock sequence. */
6974 gimple_seq unlock_seq = NULL;
6975 tree unlock_expr = nvptx_global_lock_addr ();
6976 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6977 uns_locked, uns_unlocked);
6978 gimplify_and_add (unlock_expr, &unlock_seq);
6979 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6981 return acc_out;
6984 /* Emit a sequence to update a reduction accumulator at *PTR with the
6985 value held in VAR using operator OP. Return the updated value.
6987 TODO: optimize for atomic ops and independent complex ops. */
6989 static tree
6990 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6991 tree ptr, tree var, tree_code op, int level)
6993 tree type = TREE_TYPE (var);
6994 tree size = TYPE_SIZE (type);
6996 if (size == TYPE_SIZE (unsigned_type_node)
6997 || size == TYPE_SIZE (long_long_unsigned_type_node))
6998 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6999 else
7000 return nvptx_lockfull_update (loc, gsi, ptr, var, op, level);
7003 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
7005 static void
7006 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
7008 gimple_stmt_iterator gsi = gsi_for_stmt (call);
7009 tree lhs = gimple_call_lhs (call);
7010 tree var = gimple_call_arg (call, 2);
7011 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7012 gimple_seq seq = NULL;
7014 push_gimplify_context (true);
7016 if (level != GOMP_DIM_GANG)
7018 /* Copy the receiver object. */
7019 tree ref_to_res = gimple_call_arg (call, 1);
7021 if (!integer_zerop (ref_to_res))
7022 var = build_simple_mem_ref (ref_to_res);
7025 if (level == GOMP_DIM_WORKER
7026 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
7028 /* Store incoming value to worker reduction buffer. */
7029 tree offset = gimple_call_arg (call, 5);
7030 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
7031 level == GOMP_DIM_VECTOR);
7032 tree ptr = make_ssa_name (TREE_TYPE (call));
7034 gimplify_assign (ptr, call, &seq);
7035 tree ref = build_simple_mem_ref (ptr);
7036 TREE_THIS_VOLATILE (ref) = 1;
7037 gimplify_assign (ref, var, &seq);
7040 if (lhs)
7041 gimplify_assign (lhs, var, &seq);
7043 pop_gimplify_context (NULL);
7044 gsi_replace_with_seq (&gsi, seq, true);
7047 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
7049 static void
7050 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
7052 gimple_stmt_iterator gsi = gsi_for_stmt (call);
7053 tree lhs = gimple_call_lhs (call);
7054 tree var = gimple_call_arg (call, 2);
7055 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7056 enum tree_code rcode
7057 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
7058 tree init = omp_reduction_init_op (gimple_location (call), rcode,
7059 TREE_TYPE (var));
7060 gimple_seq seq = NULL;
7062 push_gimplify_context (true);
7064 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
7066 /* Initialize the non-zero vector lanes to INIT_VAL (OP). */
7067 tree tid = make_ssa_name (integer_type_node);
7068 tree dim_vector = gimple_call_arg (call, 3);
7069 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
7070 dim_vector);
7071 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
7072 NULL_TREE, NULL_TREE);
7074 gimple_call_set_lhs (tid_call, tid);
7075 gimple_seq_add_stmt (&seq, tid_call);
7076 gimple_seq_add_stmt (&seq, cond_stmt);
7078 /* Split the block just after the call. */
7079 edge init_edge = split_block (gsi_bb (gsi), call);
7080 basic_block init_bb = init_edge->dest;
7081 basic_block call_bb = init_edge->src;
7083 /* Fixup flags from call_bb to init_bb. */
7084 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
7085 init_edge->probability = profile_probability::even ();
7087 /* Set the initialization stmts. */
7088 gimple_seq init_seq = NULL;
7089 tree init_var = make_ssa_name (TREE_TYPE (var));
7090 gimplify_assign (init_var, init, &init_seq);
7091 gsi = gsi_start_bb (init_bb);
7092 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
7094 /* Split block just after the init stmt. */
7095 gsi_prev (&gsi);
7096 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
7097 basic_block dst_bb = inited_edge->dest;
7099 /* Create false edge from call_bb to dst_bb. */
7100 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
7101 nop_edge->probability = profile_probability::even ();
7103 /* Create phi node in dst block. */
7104 gphi *phi = create_phi_node (lhs, dst_bb);
7105 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
7106 add_phi_arg (phi, var, nop_edge, gimple_location (call));
7108 /* Reset dominator of dst bb. */
7109 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
7111 /* Reset the gsi. */
7112 gsi = gsi_for_stmt (call);
7114 else
7116 if (level == GOMP_DIM_GANG)
7118 /* If there's no receiver object, propagate the incoming VAR. */
7119 tree ref_to_res = gimple_call_arg (call, 1);
7120 if (integer_zerop (ref_to_res))
7121 init = var;
7124 if (lhs != NULL_TREE)
7125 gimplify_assign (lhs, init, &seq);
7128 pop_gimplify_context (NULL);
7129 gsi_replace_with_seq (&gsi, seq, true);
7132 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
7134 static void
7135 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
7137 gimple_stmt_iterator gsi = gsi_for_stmt (call);
7138 tree lhs = gimple_call_lhs (call);
7139 tree ref_to_res = gimple_call_arg (call, 1);
7140 tree var = gimple_call_arg (call, 2);
7141 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7142 enum tree_code op
7143 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
7144 gimple_seq seq = NULL;
7145 tree r = NULL_TREE;
7147 push_gimplify_context (true);
7149 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
7151 /* Emit a binary shuffle tree. TODO: emit this as an actual loop,
7152 but that requires a method of emitting a unified jump at the
7153 gimple level. */
7154 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
7156 tree other_var = make_ssa_name (TREE_TYPE (var));
7157 nvptx_generate_vector_shuffle (gimple_location (call),
7158 other_var, var, shfl, &seq);
7160 r = make_ssa_name (TREE_TYPE (var));
7161 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
7162 var, other_var), &seq);
7163 var = r;
7166 else
7168 tree accum = NULL_TREE;
7170 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
7172 /* Get reduction buffer address. */
7173 tree offset = gimple_call_arg (call, 5);
7174 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
7175 level == GOMP_DIM_VECTOR);
7176 tree ptr = make_ssa_name (TREE_TYPE (call));
7178 gimplify_assign (ptr, call, &seq);
7179 accum = ptr;
7181 else if (integer_zerop (ref_to_res))
7182 r = var;
7183 else
7184 accum = ref_to_res;
7186 if (accum)
7188 /* UPDATE the accumulator. */
7189 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
7190 seq = NULL;
7191 r = nvptx_reduction_update (gimple_location (call), &gsi,
7192 accum, var, op, level);
7196 if (lhs)
7197 gimplify_assign (lhs, r, &seq);
7198 pop_gimplify_context (NULL);
7200 gsi_replace_with_seq (&gsi, seq, true);
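/* For the warp-sized vector case the shuffle loop above unrolls, for
   PTX_WARP_SIZE == 32 and a PLUS reduction, into the sketch

       var += shuffle_down (var, 16);
       var += shuffle_down (var, 8);
       var += shuffle_down (var, 4);
       var += shuffle_down (var, 2);
       var += shuffle_down (var, 1);

   after which lane 0 holds the reduction over the whole warp;
   'shuffle_down' is illustrative shorthand for the builtin call
   emitted by nvptx_generate_vector_shuffle.  */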
7203 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
7205 static void
7206 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
7208 gimple_stmt_iterator gsi = gsi_for_stmt (call);
7209 tree lhs = gimple_call_lhs (call);
7210 tree var = gimple_call_arg (call, 2);
7211 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7212 gimple_seq seq = NULL;
7214 push_gimplify_context (true);
7215 if (level == GOMP_DIM_WORKER
7216 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
7218 /* Read the worker reduction buffer. */
7219 tree offset = gimple_call_arg (call, 5);
7220 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
7221 level == GOMP_DIM_VECTOR);
7222 tree ptr = make_ssa_name (TREE_TYPE (call));
7224 gimplify_assign (ptr, call, &seq);
7225 var = build_simple_mem_ref (ptr);
7226 TREE_THIS_VOLATILE (var) = 1;
7229 if (level != GOMP_DIM_GANG)
7231 /* Write to the receiver object. */
7232 tree ref_to_res = gimple_call_arg (call, 1);
7234 if (!integer_zerop (ref_to_res))
7235 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
7238 if (lhs)
7239 gimplify_assign (lhs, var, &seq);
7241 pop_gimplify_context (NULL);
7243 gsi_replace_with_seq (&gsi, seq, true);
7246 /* NVPTX reduction expander. */
7248 static void
7249 nvptx_goacc_reduction (gcall *call)
7251 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
7252 offload_attrs oa;
7254 populate_offload_attrs (&oa);
7256 switch (code)
7258 case IFN_GOACC_REDUCTION_SETUP:
7259 nvptx_goacc_reduction_setup (call, &oa);
7260 break;
7262 case IFN_GOACC_REDUCTION_INIT:
7263 nvptx_goacc_reduction_init (call, &oa);
7264 break;
7266 case IFN_GOACC_REDUCTION_FINI:
7267 nvptx_goacc_reduction_fini (call, &oa);
7268 break;
7270 case IFN_GOACC_REDUCTION_TEARDOWN:
7271 nvptx_goacc_reduction_teardown (call, &oa);
7272 break;
7274 default:
7275 gcc_unreachable ();
7279 static bool
7280 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
7281 rtx x ATTRIBUTE_UNUSED)
7283 return true;
7286 static bool
7287 nvptx_scalar_mode_supported_p (scalar_mode mode)
7289 if (nvptx_experimental && mode == HFmode && TARGET_SM53)
7290 return true;
7292 return default_scalar_mode_supported_p (mode);
7295 static bool
7296 nvptx_libgcc_floating_mode_supported_p (scalar_float_mode mode)
7298 if (nvptx_experimental && mode == HFmode && TARGET_SM53)
7299 return true;
7301 return default_libgcc_floating_mode_supported_p (mode);
7304 static bool
7305 nvptx_vector_mode_supported (machine_mode mode)
7307 return (mode == V2SImode
7308 || mode == V2DImode);
7311 /* Return the preferred mode for vectorizing scalar MODE. */
7313 static machine_mode
7314 nvptx_preferred_simd_mode (scalar_mode mode)
7316 switch (mode)
7318 case E_DImode:
7319 return V2DImode;
7320 case E_SImode:
7321 return V2SImode;
7323 default:
7324 return default_preferred_simd_mode (mode);
7328 unsigned int
7329 nvptx_data_alignment (const_tree type, unsigned int basic_align)
7331 if (TREE_CODE (type) == INTEGER_TYPE)
7333 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
7334 if (size == GET_MODE_SIZE (TImode))
7335 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
7338 return basic_align;
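/* For example, a 128-bit integer object, whose TImode access is split
   into two DImode halves on this target, is given the 64-bit alignment
   of the split mode rather than 128-bit alignment.  */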
7341 /* Implement TARGET_MODES_TIEABLE_P. */
7343 static bool
7344 nvptx_modes_tieable_p (machine_mode, machine_mode)
7346 return false;
7349 /* Implement TARGET_HARD_REGNO_NREGS. */
7351 static unsigned int
7352 nvptx_hard_regno_nregs (unsigned int, machine_mode)
7354 return 1;
7357 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
7359 static bool
7360 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
7362 return false;
7365 /* Implement TARGET_TRULY_NOOP_TRUNCATION. */
7367 static bool
7368 nvptx_truly_noop_truncation (poly_uint64, poly_uint64)
7370 return false;
7373 /* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL. */
7375 static tree
7376 nvptx_goacc_adjust_private_decl (location_t loc, tree decl, int level)
7378 gcc_checking_assert (!lookup_attribute ("oacc gang-private",
7379 DECL_ATTRIBUTES (decl)));
7381 /* Set "oacc gang-private" attribute for gang-private variable
7382 declarations. */
7383 if (level == GOMP_DIM_GANG)
7385 tree id = get_identifier ("oacc gang-private");
7386 /* For later diagnostic purposes, pass LOC as VALUE (wrapped as a
7387 TREE). */
7388 tree loc_tree = build_empty_stmt (loc);
7389 DECL_ATTRIBUTES (decl)
7390 = tree_cons (id, loc_tree, DECL_ATTRIBUTES (decl));
7393 return decl;
7396 /* Implement TARGET_GOACC_EXPAND_VAR_DECL. */
7398 static rtx
7399 nvptx_goacc_expand_var_decl (tree var)
7401 /* Place "oacc gang-private" variables in shared memory. */
7402 if (tree attr = lookup_attribute ("oacc gang-private",
7403 DECL_ATTRIBUTES (var)))
7405 gcc_checking_assert (VAR_P (var));
7407 unsigned int offset, *poffset;
7408 poffset = gang_private_shared_hmap.get (var);
7409 if (poffset)
7410 offset = *poffset;
7411 else
7413 unsigned HOST_WIDE_INT align = DECL_ALIGN (var);
7414 gang_private_shared_size
7415 = (gang_private_shared_size + align - 1) & ~(align - 1);
7416 if (gang_private_shared_align < align)
7417 gang_private_shared_align = align;
7419 offset = gang_private_shared_size;
7420 bool existed = gang_private_shared_hmap.put (var, offset);
7421 gcc_checking_assert (!existed);
7422 gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var));
7424 location_t loc = EXPR_LOCATION (TREE_VALUE (attr));
7425 #if 0 /* For some reason, this doesn't work. */
7426 if (dump_enabled_p ())
7428 dump_flags_t l_dump_flags
7429 = get_openacc_privatization_dump_flags ();
7431 const dump_user_location_t d_u_loc
7432 = dump_user_location_t::from_location_t (loc);
7433 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
7434 #if __GNUC__ >= 10
7435 # pragma GCC diagnostic push
7436 # pragma GCC diagnostic ignored "-Wformat"
7437 #endif
7438 dump_printf_loc (l_dump_flags, d_u_loc,
7439 "variable %<%T%> adjusted for OpenACC"
7440 " privatization level: %qs\n",
7441 var, "gang");
7442 #if __GNUC__ >= 10
7443 # pragma GCC diagnostic pop
7444 #endif
7446 #else /* ..., thus emulate that, good enough for testsuite usage. */
7447 if (param_openacc_privatization != OPENACC_PRIVATIZATION_QUIET)
7448 inform (loc,
7449 "variable %qD adjusted for OpenACC privatization level:"
7450 " %qs",
7451 var, "gang");
7452 if (dump_file && (dump_flags & TDF_DETAILS))
7454 /* 'dumpfile.cc:dump_loc' */
7455 fprintf (dump_file, "%s:%d:%d: ", LOCATION_FILE (loc),
7456 LOCATION_LINE (loc), LOCATION_COLUMN (loc));
7457 fprintf (dump_file, "%s: ", "note");
7459 fprintf (dump_file,
7460 "variable '");
7461 print_generic_expr (dump_file, var, TDF_SLIM);
7462 fprintf (dump_file,
7463 "' adjusted for OpenACC privatization level: '%s'\n",
7464 "gang");
7466 #endif
7468 rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset);
7469 return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr);
7472 return NULL_RTX;
7475 static GTY(()) tree nvptx_previous_fndecl;
7477 static void
7478 nvptx_set_current_function (tree fndecl)
7480 if (!fndecl || fndecl == nvptx_previous_fndecl)
7481 return;
7483 gang_private_shared_hmap.empty ();
7484 nvptx_previous_fndecl = fndecl;
7485 vector_red_partition = 0;
7486 oacc_bcast_partition = 0;
7489 /* Implement TARGET_LIBC_HAS_FUNCTION. */
7491 bool
7492 nvptx_libc_has_function (enum function_class fn_class, tree type)
7494 if (fn_class == function_sincos)
7496 if (type != NULL_TREE)
7497 /* Currently, newlib does not support sincosl. */
7498 return type == float_type_node || type == double_type_node;
7499 else
7500 return true;
7503 return default_libc_has_function (fn_class, type);
7506 bool
7507 nvptx_mem_local_p (rtx mem)
7509 gcc_assert (GET_CODE (mem) == MEM);
7511 struct address_info info;
7512 decompose_mem_address (&info, mem);
7514 if (info.base != NULL && REG_P (*info.base)
7515 && REGNO_PTR_FRAME_P (REGNO (*info.base)))
7517 if (TARGET_SOFT_STACK)
7519 /* Frame-related doesn't mean local. */
7521 else
7522 return true;
7525 return false;
7528 /* Define locally, for use in NVPTX_ASM_OUTPUT_DEF. */
7529 #define SET_ASM_OP ".alias "
7531 /* Define locally, for use in nvptx_asm_output_def_from_decls. Add NVPTX_
7532 prefix to avoid clash with ASM_OUTPUT_DEF from nvptx.h.
7533 Copy of ASM_OUTPUT_DEF from defaults.h, with added terminating
7534 semicolon. */
7535 #define NVPTX_ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2) \
7536 do \
7538 fprintf ((FILE), "%s", SET_ASM_OP); \
7539 assemble_name (FILE, LABEL1); \
7540 fprintf (FILE, ","); \
7541 assemble_name (FILE, LABEL2); \
7542 fprintf (FILE, ";\n"); \
7544 while (0)
7546 void
7547 nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value)
7549 if (nvptx_alias == 0 || !TARGET_PTX_6_3)
7551 /* Copied from assemble_alias. */
7552 error_at (DECL_SOURCE_LOCATION (name),
7553 "alias definitions not supported in this configuration");
7554 TREE_ASM_WRITTEN (name) = 1;
7555 return;
7558 if (lookup_attribute ("weak", DECL_ATTRIBUTES (name)))
7560 /* Prevent execution FAILs for gcc.dg/globalalias.c and
7561 gcc.dg/pr77587.c. */
7562 error_at (DECL_SOURCE_LOCATION (name),
7563 "weak alias definitions not supported in this configuration");
7564 TREE_ASM_WRITTEN (name) = 1;
7565 return;
7568 /* PTX also doesn't support VALUE having weak linkage, but we can't detect
7569 that here, so we'll end up with:
7570 "error: Function test with .weak scope cannot be aliased".
7571 See gcc.dg/localalias.c. */
7573 if (TREE_CODE (name) != FUNCTION_DECL)
7575 error_at (DECL_SOURCE_LOCATION (name),
7576 "non-function alias definitions not supported"
7577 " in this configuration");
7578 TREE_ASM_WRITTEN (name) = 1;
7579 return;
7582 if (!cgraph_node::get (name)->referred_to_p ())
7583 /* Prevent "Internal error: reference to deleted section". */
7584 return;
7586 std::stringstream s;
7587 write_fn_proto (s, false, get_fnname_from_decl (name), name);
7588 fputs (s.str ().c_str (), stream);
7590 tree id = DECL_ASSEMBLER_NAME (name);
7591 NVPTX_ASM_OUTPUT_DEF (stream, IDENTIFIER_POINTER (id),
7592 IDENTIFIER_POINTER (value));
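/* E.g., for 'int foo (void) __attribute__ ((alias ("bar")));' this
   emits, after the prototype for foo, a line of the form (sketch):

       .alias foo,bar;
*/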
7595 #undef NVPTX_ASM_OUTPUT_DEF
7596 #undef SET_ASM_OP
7598 #undef TARGET_OPTION_OVERRIDE
7599 #define TARGET_OPTION_OVERRIDE nvptx_option_override
7601 #undef TARGET_ATTRIBUTE_TABLE
7602 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
7604 #undef TARGET_LRA_P
7605 #define TARGET_LRA_P hook_bool_void_false
7607 #undef TARGET_LEGITIMATE_ADDRESS_P
7608 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
7610 #undef TARGET_PROMOTE_FUNCTION_MODE
7611 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
7613 #undef TARGET_FUNCTION_ARG
7614 #define TARGET_FUNCTION_ARG nvptx_function_arg
7615 #undef TARGET_FUNCTION_INCOMING_ARG
7616 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
7617 #undef TARGET_FUNCTION_ARG_ADVANCE
7618 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
7619 #undef TARGET_FUNCTION_ARG_BOUNDARY
7620 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
7621 #undef TARGET_PASS_BY_REFERENCE
7622 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
7623 #undef TARGET_FUNCTION_VALUE_REGNO_P
7624 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
7625 #undef TARGET_FUNCTION_VALUE
7626 #define TARGET_FUNCTION_VALUE nvptx_function_value
7627 #undef TARGET_LIBCALL_VALUE
7628 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
7629 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
7630 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
7631 #undef TARGET_GET_DRAP_RTX
7632 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
7633 #undef TARGET_SPLIT_COMPLEX_ARG
7634 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
7635 #undef TARGET_RETURN_IN_MEMORY
7636 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
7637 #undef TARGET_OMIT_STRUCT_RETURN_REG
7638 #define TARGET_OMIT_STRUCT_RETURN_REG true
7639 #undef TARGET_STRICT_ARGUMENT_NAMING
7640 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
7641 #undef TARGET_CALL_ARGS
7642 #define TARGET_CALL_ARGS nvptx_call_args
7643 #undef TARGET_END_CALL_ARGS
7644 #define TARGET_END_CALL_ARGS nvptx_end_call_args
7646 #undef TARGET_ASM_FILE_START
7647 #define TARGET_ASM_FILE_START nvptx_file_start
7648 #undef TARGET_ASM_FILE_END
7649 #define TARGET_ASM_FILE_END nvptx_file_end
7650 #undef TARGET_ASM_GLOBALIZE_LABEL
7651 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
7652 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
7653 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
7654 #undef TARGET_PRINT_OPERAND
7655 #define TARGET_PRINT_OPERAND nvptx_print_operand
7656 #undef TARGET_PRINT_OPERAND_ADDRESS
7657 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
7658 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
7659 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
7660 #undef TARGET_ASM_INTEGER
7661 #define TARGET_ASM_INTEGER nvptx_assemble_integer
7662 #undef TARGET_ASM_DECL_END
7663 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
7664 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
7665 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
7666 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
7667 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
7668 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
7669 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
7671 #undef TARGET_MACHINE_DEPENDENT_REORG
7672 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
7673 #undef TARGET_NO_REGISTER_ALLOCATION
7674 #define TARGET_NO_REGISTER_ALLOCATION true
7676 #undef TARGET_ENCODE_SECTION_INFO
7677 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
7678 #undef TARGET_RECORD_OFFLOAD_SYMBOL
7679 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
7681 #undef TARGET_VECTOR_ALIGNMENT
7682 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
7684 #undef TARGET_CANNOT_COPY_INSN_P
7685 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
7687 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
7688 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
7690 #undef TARGET_INIT_BUILTINS
7691 #define TARGET_INIT_BUILTINS nvptx_init_builtins
7692 #undef TARGET_EXPAND_BUILTIN
7693 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
7694 #undef TARGET_BUILTIN_DECL
7695 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
7697 #undef TARGET_SIMT_VF
7698 #define TARGET_SIMT_VF nvptx_simt_vf
7700 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
7701 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
7703 #undef TARGET_GOACC_VALIDATE_DIMS
7704 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
7706 #undef TARGET_GOACC_DIM_LIMIT
7707 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
7709 #undef TARGET_GOACC_FORK_JOIN
7710 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
7712 #undef TARGET_GOACC_REDUCTION
7713 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
7715 #undef TARGET_CANNOT_FORCE_CONST_MEM
7716 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
7718 #undef TARGET_SCALAR_MODE_SUPPORTED_P
7719 #define TARGET_SCALAR_MODE_SUPPORTED_P nvptx_scalar_mode_supported_p
7721 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
7722 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
7723 nvptx_libgcc_floating_mode_supported_p
7725 #undef TARGET_VECTOR_MODE_SUPPORTED_P
7726 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
7728 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
7729 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
7730 nvptx_preferred_simd_mode
7732 #undef TARGET_MODES_TIEABLE_P
7733 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
7735 #undef TARGET_HARD_REGNO_NREGS
7736 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
7738 #undef TARGET_CAN_CHANGE_MODE_CLASS
7739 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
7741 #undef TARGET_TRULY_NOOP_TRUNCATION
7742 #define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation
7744 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
7745 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
7747 #undef TARGET_GOACC_ADJUST_PRIVATE_DECL
7748 #define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl
7750 #undef TARGET_GOACC_EXPAND_VAR_DECL
7751 #define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl
7753 #undef TARGET_SET_CURRENT_FUNCTION
7754 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
7756 #undef TARGET_LIBC_HAS_FUNCTION
7757 #define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function
7759 struct gcc_target targetm = TARGET_INITIALIZER;
7761 #include "gt-nvptx.h"