gcc/config/nvptx/nvptx.c
/* Target code for NVPTX.
   Copyright (C) 2014-2018 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1
/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
                    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
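
/* Illustrative sketch (not part of the original source): the area enum
   fits in the three machine-dependent flag bits masked off above, e.g.

     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);

   DATA_AREA_MAX is 6, so the '& 7' mask is wide enough.  */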
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;
/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with -fopenacc", optname);
}
/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  if (function_entry_patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
  SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
        return ".u32";
      else
        return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}
/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
        area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
        {
          if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
            {
              area = DATA_AREA_SHARED;
              if (DECL_INITIAL (decl))
                error ("static initialization of variable %q+D in %<.shared%>"
                       " memory is not supported", decl);
            }
          else
            area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
        }

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}
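
/* Illustrative examples (not part of the original source):
   maybe_split_mode (DCmode) == DFmode    -- complex double: two doubles,
   maybe_split_mode (TImode) == DImode    -- 128-bit int: two 64-bit halves,
   maybe_split_mode (SImode) == VOIDmode  -- fits in a single register.  */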
/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}
/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
            int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
        fprintf (file, "%s", reg_names[regno]);
      else
        fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
        fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
        fprintf (file, "}");
    }
}
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
         it creates a block with a single successor before entering a
         partitioned region.  That is a good candidate for the end of
         an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
         predecessor for the block the join insn ends up in.  This is
         needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}
/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
        return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
        return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}
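
/* Illustrative examples (not part of the original source), assuming a
   64-bit ABI where UNITS_PER_WORD is 8:
   - int, double:         registers (integer/float mode, fits a word),
   - _Complex double:     registers, as two DFmode args (not for return),
   - struct or array:     memory (aggregate),
   - __int128 (TImode):   memory (wider than a word).  */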
/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
                    const_tree, bool named)
{
  if (mode == VOIDmode || !named)
    return NULL_RTX;

  return gen_reg_rtx (mode);
}
/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
                             const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (mode == VOIDmode || !named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
                         gen_rtvec (1, GEN_INT (cum->count)),
                         UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
                            machine_mode ARG_UNUSED (mode),
                            const_tree ARG_UNUSED (type),
                            bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}
/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}
/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
                      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
                         machine_mode mode, const_tree type,
                         bool ARG_UNUSED (named))
{
  return pass_in_memory (mode, type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
                             int *ARG_UNUSED (punsignedp),
                             const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}
/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
                machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
        s << reg_names[for_reg];
      else
        s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
        {
          s << "\tld.param" << ptx_type << " ";
          if (for_reg)
            s << reg_names[for_reg];
          else
            s << "%ar" << argno;
          s << ", [%in_ar" << argno << "];\n";
        }
    }
  return argno + 1;
}

/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
                tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
        {
          /* Complex types are sent as two separate args.  */
          type = TREE_TYPE (type);
          mode = TYPE_MODE (type);
          prototyped = true;
        }

      mode = promote_arg (mode, prototyped);
      if (split)
        argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}
/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}

/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
        return return_in_mem;

      /* Named return values can cause us to return a pointer as well
         as expect an argument for the return location.  This is
         optimization-level specific, so no caller can make use of
         this data, but more importantly for us, we must ensure it
         doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
        return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
         retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}
/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
          || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
              && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
                 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
           globalize ? " GLOBAL" : "",
           is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}
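
/* Illustrative only (not part of the original source): for a public
   function definition 'foo' and a local variable declaration 'bar' the
   markers written above come out as

     // BEGIN GLOBAL FUNCTION DEF: foo
     // BEGIN VAR DECL: bar

   which is the helper-comment format the nvptx ld scans for.  */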
/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
                const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
        name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
        /* These atomics skip the 'weak' parm in an actual library
           call.  We must skip it in the prototype too.  */
        not_atomic_weak_arg = 3;
        break;

      default:
        break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
        argno = write_arg_type (s, -1, argno, type, prototyped);
      else
        gcc_assert (type == boolean_type_node);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
                          rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
         as that was already done when generating the call
         sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
/* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
        *slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}
/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
             align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
           POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
                  : "\tmov.u%d %s, 0;\n"),
           POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
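
/* Illustrative only (not part of the original source), assuming a
   64-bit target and that the frame register prints as %frame: a
   16-byte frame with 8-byte alignment would come out as

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   i.e. a .local array plus a generic-address pointer to it.  */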
/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
           bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
           bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
           bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
             bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
           bits, reg_stack, reg_frame, size);

  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
             bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
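
/* Illustrative only (not part of the original source): for the "y"
   axis and predicate register 5 the above emits

     {
             .reg.u32        %y;
             mov.u32         %y, %tid.y;
             setp.ne.u32     %r5, %y, 0;
     }

   so %r5 becomes true for every thread that is not lane zero on y.  */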
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
           bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}
/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
                 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
                            true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
                    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
                    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
        init_frame (file, STACK_POINTER_REGNUM,
                    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
         DImode-compatible.  */
      if (need_frameptr)
        init_frame (file, FRAME_POINTER_REGNUM, alignment,
                    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
           || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
        simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
        simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
        simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
        simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
        fprintf (file, "\t.local .align 8 .b8 %%simtstack_ar["
                 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }
  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
        {
          machine_mode mode = PSEUDO_REGNO_MODE (i);
          machine_mode split = maybe_split_mode (mode);

          if (split_mode_p (mode))
            mode = split;
          fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
          output_reg (file, i, split, -2);
          fprintf (file, ";\n");
        }
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
                               REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
                               REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
}
/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving a non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
               master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}
/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
                               rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
               HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
               cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
        fprintf (file, HOST_WIDE_INT_PRINT_DEC,
                 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
        output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
        fprintf (file,
                 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
                 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
        {
          fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
                   bits, regno, bits / 8, reg_stack);
          fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
                   bits, reg_stack, regno, bits / 8);
        }
      else
        {
          fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
                   bits, reg_stack, regno, bits / 8);
        }
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}
/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  */

const char *
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
  return "";
}

/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.  */

const char *
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
  return "";
}

/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
               POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}
/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
             nvptx_ptx_type_from_mode (mode, false),
             reg_names[NVPTX_RETURN_REGNUM],
             reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}
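
/* Illustrative only (not part of the original source), assuming the
   return register prints as %value: a function returning an SImode
   value ends with

     st.param.u32    [%value_out], %value;
     ret;

   while a void function emits just the "ret;".  */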
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}
/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  if (TARGET_SOFT_STACK && stack_realign_drap)
    return arg_pointer_rtx;
  return NULL_RTX;
}
/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
        {
          cfun->machine->is_varadic = true;
          cfun->machine->has_varadic = true;
          cfun->machine->num_args++;
        }
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
                                                  cfun->machine->call_args);
    }
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}
/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  rtx callee = XEXP (address, 0);
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
        {
          if (DECL_STATIC_CHAIN (decl))
            cfun->machine->has_chain = true;

          tree attr = oacc_get_fn_attrib (decl);
          if (attr)
            {
              tree dims = TREE_VALUE (attr);

              parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
              for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
                {
                  if (TREE_PURPOSE (dims)
                      && !integer_zerop (TREE_PURPOSE (dims)))
                    break;
                  /* Not on this axis.  */
                  parallel ^= GOMP_DIM_MASK (ix);
                  dims = TREE_CHAIN (dims);
                }
            }
        }
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
        tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}
/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
                            XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}
/* Generate instruction(s) to unpack a 64-bit object into two 32-bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case E_DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case E_DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack two 32-bit objects into a 64-bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case E_DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case E_SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case E_DImode:
    case E_DFmode:
      {
        rtx tmp0 = gen_reg_rtx (SImode);
        rtx tmp1 = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
        emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
        emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
        emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
        res = get_insns ();
        end_sequence ();
      }
      break;
    case E_BImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
        emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
        emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
        res = get_insns ();
        end_sequence ();
      }
      break;
    case E_QImode:
    case E_HImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
        emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
        emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
                                                    tmp)));
        res = get_insns ();
        end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case E_BImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        if (pm & PM_read)
          emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
        emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
        if (pm & PM_write)
          emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
        res = get_insns ();
        end_sequence ();
      }
      break;

    default:
      {
        rtx addr = data->ptr;

        if (!addr)
          {
            unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

            if (align > worker_bcast_align)
              worker_bcast_align = align;
            data->offset = (data->offset + align - 1) & ~(align - 1);
            addr = data->base;
            if (data->offset)
              addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
          }

        addr = gen_rtx_MEM (mode, addr);
        if (pm == PM_read)
          res = gen_rtx_SET (addr, reg);
        else if (pm == PM_write)
          res = gen_rtx_SET (reg, addr);
        else
          gcc_unreachable ();

        if (data->ptr)
          {
            /* We're using a ptr, increment it.  */
            start_sequence ();

            emit_insn (res);
            emit_insn (gen_adddi3 (data->ptr, data->ptr,
                                   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
            res = get_insns ();
            end_sequence ();
          }
        else
          rep = 1;
        data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
        return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}
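
/* Illustrative examples (not part of the original source):
   accepted:  (reg %r23), (plus (reg %r23) (const_int 16)), (symbol_ref "x")
   rejected:  (plus (reg %r23) (reg %r24)) and any other form -- PTX
   addressing here is register, register plus constant, or symbol.  */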
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
                                       out.  */
  unsigned size; /* Fragment size to accumulate.  */
  unsigned offset; /* Offset within current fragment.  */
  bool started; /* Whether we've output any initializer.  */
} init_frag;
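
/* Illustrative walk-through (not part of the original source): with a
   fragment size of 4, appending the 16-bit values 0x1122 then 0x3344
   accumulates 0x33441122 in init_frag.val (little-endian placement via
   the offset shift in nvptx_assemble_value below), and the fragment is
   flushed as a single .u32 initializer element once offset reaches
   size.  */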
/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  */

static void
output_init_frag (rtx sym)
{
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      bool function = (SYMBOL_REF_DECL (sym)
                       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
      if (!function)
        fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      if (!function)
        fprintf (asm_out_file, ")");
      if (val)
        fprintf (asm_out_file, " + ");
    }

  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}
/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;

  for (unsigned part = 0; size; size -= part)
    {
      val >>= part * BITS_PER_UNIT;
      part = init_frag.size - init_frag.offset;
      if (part > size)
        part = size;

      unsigned HOST_WIDE_INT partial
        = val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
        output_init_frag (NULL);
    }
}
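
/* Illustrative walk-through (not part of the original source): with
   init_frag.size == 4 and init_frag.offset == 3, a call with
   val == 0xAABB and size == 2 splits the value at the fragment
   boundary: the low byte 0xBB lands in the top byte of the current
   fragment (which is then flushed), and the remaining byte 0xAA starts
   the next fragment at offset 0.  */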
/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
         CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH  */

    case SYMBOL_REF:
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
        sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}
1982 /* Output SIZE zero bytes. We ignore the FILE argument since the
1983 functions we're calling to perform the output just use
1984 asm_out_file. */
1986 void
1987 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1989 /* Finish the current fragment, if it's started. */
1990 if (init_frag.offset)
1992 unsigned part = init_frag.size - init_frag.offset;
1993 if (part > size)
1994 part = (unsigned) size;
1995 size -= part;
1996 nvptx_assemble_value (0, part);
1999 /* If this skip doesn't terminate the initializer, write as many
2000 remaining pieces as possible directly. */
2001 if (size < init_frag.remaining * init_frag.size)
2003 while (size >= init_frag.size)
2005 size -= init_frag.size;
2006 output_init_frag (NULL_RTX);
2008 if (size)
2009 nvptx_assemble_value (0, size);
2013 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2014 ignore the FILE arg. */
2016 void
2017 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2019 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2020 nvptx_assemble_value (str[i], 1);
2023 /* Return true if TYPE is a record type where the last field is an array without
2024 a given dimension. */
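/* As an illustrative example, this matches a C struct such as
     struct s { int len; char data[]; };
   whose trailing ARRAY_TYPE has no TYPE_DOMAIN (or no maximum value),
   i.e. a flexible array member. */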
2026 static bool
2027 flexible_array_member_type_p (const_tree type)
2029 if (TREE_CODE (type) != RECORD_TYPE)
2030 return false;
2032 const_tree last_field = NULL_TREE;
2033 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2034 last_field = f;
2036 if (!last_field)
2037 return false;
2039 const_tree last_field_type = TREE_TYPE (last_field);
2040 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2041 return false;
2043 return (! TYPE_DOMAIN (last_field_type)
2044 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2047 /* Emit a PTX variable decl and prepare for emission of its
2048 initializer. NAME is the symbol name and SECTION the PTX data
2049 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2050 The caller has already emitted any indentation and linkage
2051 specifier. It is responsible for any initializer, terminating ;
2052 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2053 this is the opposite way round from what PTX wants! */
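/* As an illustrative sketch, a 16-byte object with 8-byte alignment
   in the global data area might begin as
     .global .align 8 .u64 foo[2]
   where the element type reflects the fragment size chosen below;
   the caller later appends any initializer and the terminating ;. */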
2055 static void
2056 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2057 const_tree type, HOST_WIDE_INT size, unsigned align,
2058 bool undefined = false)
2060 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2061 && (TYPE_DOMAIN (type) == NULL_TREE);
2063 if (undefined && flexible_array_member_type_p (type))
2065 size = 0;
2066 atype = true;
2069 while (TREE_CODE (type) == ARRAY_TYPE)
2070 type = TREE_TYPE (type);
2072 if (TREE_CODE (type) == VECTOR_TYPE
2073 || TREE_CODE (type) == COMPLEX_TYPE)
2074 /* Neither vector nor complex types can contain the other. */
2075 type = TREE_TYPE (type);
2077 unsigned elt_size = int_size_in_bytes (type);
2079 /* Largest mode we're prepared to accept. For BLKmode types we
2080 don't know whether they'll contain pointer constants, so we have
2081 to choose pointer size; otherwise we can choose DImode. */
2082 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2084 elt_size |= GET_MODE_SIZE (elt_mode);
2085 elt_size &= -elt_size; /* Extract LSB set. */
2087 init_frag.size = elt_size;
2088 /* Avoid undefined shift behavior by using '2'. */
2089 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2090 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2091 init_frag.val = 0;
2092 init_frag.offset = 0;
2093 init_frag.started = false;
2094 /* Size might not be a multiple of elt size, if there's an
2095 initialized trailing struct array with smaller type than
2096 elt_size. */
2097 init_frag.remaining = (size + elt_size - 1) / elt_size;
2099 fprintf (file, "%s .align %d .u%d ",
2100 section, align / BITS_PER_UNIT,
2101 elt_size * BITS_PER_UNIT);
2102 assemble_name (file, name);
2104 if (size)
2105 /* We make everything an array, to simplify any initialization
2106 emission. */
2107 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2108 else if (atype)
2109 fprintf (file, "[]");
2112 /* Called when the initializer for a decl has been completely output through
2113 combinations of the three functions above. */
2115 static void
2116 nvptx_assemble_decl_end (void)
2118 if (init_frag.offset)
2119 /* This can happen with a packed struct with trailing array member. */
2120 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2121 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2124 /* Output an uninitialized common or file-scope variable. */
2126 void
2127 nvptx_output_aligned_decl (FILE *file, const char *name,
2128 const_tree decl, HOST_WIDE_INT size, unsigned align)
2130 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2132 /* If this is public, it is common. The nearest thing we have to
2133 common is weak. */
2134 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2136 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2137 TREE_TYPE (decl), size, align);
2138 nvptx_assemble_decl_end ();
2141 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2142 writing a constant variable EXP with NAME and SIZE and its
2143 initializer to FILE. */
2145 static void
2146 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2147 const_tree exp, HOST_WIDE_INT obj_size)
2149 write_var_marker (file, true, false, name);
2151 fprintf (file, "\t");
2153 tree type = TREE_TYPE (exp);
2154 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2155 TYPE_ALIGN (type));
2158 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2159 a variable DECL with NAME to FILE. */
2161 void
2162 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2164 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2166 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2167 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2169 tree type = TREE_TYPE (decl);
2170 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2171 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2172 type, obj_size, DECL_ALIGN (decl));
2175 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2177 static void
2178 nvptx_globalize_label (FILE *, const char *)
2182 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2183 declaration only for variable DECL with NAME to FILE. */
2185 static void
2186 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2188 /* The middle end can place constant pool decls into the varpool as
2189 undefined. Until that is fixed, catch the problem here. */
2190 if (DECL_IN_CONSTANT_POOL (decl))
2191 return;
2193 /* We support weak definitions, and hence have the right
2194 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2195 if (DECL_WEAK (decl))
2196 error_at (DECL_SOURCE_LOCATION (decl),
2197 "PTX does not support weak declarations"
2198 " (only weak definitions)");
2199 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2201 fprintf (file, "\t.extern ");
2202 tree size = DECL_SIZE_UNIT (decl);
2203 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2204 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2205 DECL_ALIGN (decl), true);
2206 nvptx_assemble_decl_end ();
2209 /* Output a pattern for a move instruction. */
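/* As an illustrative sketch (register names made up), a same-mode
   register move prints as
     mov.u32 %r2, %r3;
   while taking the generic address of a symbol placed in the .shared
   data area uses the cvta form:
     cvta.shared.u64 %r1, sym;  */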
2211 const char *
2212 nvptx_output_mov_insn (rtx dst, rtx src)
2214 machine_mode dst_mode = GET_MODE (dst);
2215 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2216 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2217 machine_mode src_inner = (GET_CODE (src) == SUBREG
2218 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2220 rtx sym = src;
2221 if (GET_CODE (sym) == CONST)
2222 sym = XEXP (XEXP (sym, 0), 0);
2223 if (SYMBOL_REF_P (sym))
2225 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2226 return "%.\tcvta%D1%t0\t%0, %1;";
2227 nvptx_maybe_record_fnsym (sym);
2230 if (src_inner == dst_inner)
2231 return "%.\tmov%t0\t%0, %1;";
2233 if (CONSTANT_P (src))
2234 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2235 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2236 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2238 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2240 if (GET_MODE_BITSIZE (dst_mode) == 128
2241 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2243 /* mov.b128 is not supported. */
2244 if (dst_inner == V2DImode && src_inner == TImode)
2245 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2246 else if (dst_inner == TImode && src_inner == V2DImode)
2247 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2249 gcc_unreachable ();
2251 return "%.\tmov.b%T0\t%0, %1;";
2254 return "%.\tcvt%t0%t1\t%0, %1;";
2257 static void nvptx_print_operand (FILE *, rtx, int);
2259 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2260 involves writing .param declarations and in/out copies into them. For
2261 indirect calls, also write the .callprototype. */
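/* As an illustrative sketch (made-up names), a direct call with one
   argument and a return value is emitted roughly as
     {
       .param.u32 %value_in;
       .param.u32 %out_arg1;
       st.param.u32 [%out_arg1], %r1;
       call (%value_in), foo, (%out_arg1);
       ld.param.u32 %r2, [%value_in];
     }
   with a ".callprototype" label emitted beforehand for indirect
   calls. */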
2263 const char *
2264 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2266 char buf[16];
2267 static int labelno;
2268 bool needs_tgt = register_operand (callee, Pmode);
2269 rtx pat = PATTERN (insn);
2270 if (GET_CODE (pat) == COND_EXEC)
2271 pat = COND_EXEC_CODE (pat);
2272 int arg_end = XVECLEN (pat, 0);
2273 tree decl = NULL_TREE;
2275 fprintf (asm_out_file, "\t{\n");
2276 if (result != NULL)
2277 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2278 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2279 reg_names[NVPTX_RETURN_REGNUM]);
2281 /* Ensure we have a ptx declaration in the output if necessary. */
2282 if (GET_CODE (callee) == SYMBOL_REF)
2284 decl = SYMBOL_REF_DECL (callee);
2285 if (!decl
2286 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2287 nvptx_record_libfunc (callee, result, pat);
2288 else if (DECL_EXTERNAL (decl))
2289 nvptx_record_fndecl (decl);
2292 if (needs_tgt)
2294 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2295 labelno++;
2296 ASM_OUTPUT_LABEL (asm_out_file, buf);
2297 std::stringstream s;
2298 write_fn_proto_from_insn (s, NULL, result, pat);
2299 fputs (s.str().c_str(), asm_out_file);
2302 for (int argno = 1; argno < arg_end; argno++)
2304 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2305 machine_mode mode = GET_MODE (t);
2306 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2308 /* Mode splitting has already been done. */
2309 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2310 "\t\tst.param%s [%%out_arg%d], ",
2311 ptx_type, argno, ptx_type, argno);
2312 output_reg (asm_out_file, REGNO (t), VOIDmode);
2313 fprintf (asm_out_file, ";\n");
2316 /* The '.' stands for the call's predicate, if any. */
2317 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2318 fprintf (asm_out_file, "\t\tcall ");
2319 if (result != NULL_RTX)
2320 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2322 if (decl)
2324 const char *name = get_fnname_from_decl (decl);
2325 name = nvptx_name_replacement (name);
2326 assemble_name (asm_out_file, name);
2328 else
2329 output_address (VOIDmode, callee);
2331 const char *open = "(";
2332 for (int argno = 1; argno < arg_end; argno++)
2334 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2335 open = "";
2337 if (decl && DECL_STATIC_CHAIN (decl))
2339 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2340 open = "";
2342 if (!open[0])
2343 fprintf (asm_out_file, ")");
2345 if (needs_tgt)
2347 fprintf (asm_out_file, ", ");
2348 assemble_name (asm_out_file, buf);
2350 fprintf (asm_out_file, ";\n");
2352 if (find_reg_note (insn, REG_NORETURN, NULL))
2354 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2355 the flow control barrier they imply. It can seg fault if it
2356 encounters what looks like an unexitable loop. Emit a trailing
2357 trap and exit, which it does grok. */
2358 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2359 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2362 if (result)
2364 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2366 if (!rval[0])
2367 /* We must escape the '%' that starts RETURN_REGNUM. */
2368 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2369 reg_names[NVPTX_RETURN_REGNUM]);
2370 return rval;
2373 return "}";
2376 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2378 static bool
2379 nvptx_print_operand_punct_valid_p (unsigned char c)
2381 return c == '.' || c == '#';
2384 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2386 static void
2387 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2389 rtx off;
2390 if (GET_CODE (x) == CONST)
2391 x = XEXP (x, 0);
2392 switch (GET_CODE (x))
2394 case PLUS:
2395 off = XEXP (x, 1);
2396 output_address (VOIDmode, XEXP (x, 0));
2397 fprintf (file, "+");
2398 output_address (VOIDmode, off);
2399 break;
2401 case SYMBOL_REF:
2402 case LABEL_REF:
2403 output_addr_const (file, x);
2404 break;
2406 default:
2407 gcc_assert (GET_CODE (x) != MEM);
2408 nvptx_print_operand (file, x, 0);
2409 break;
2413 /* Write assembly language output for the address ADDR to FILE. */
2415 static void
2416 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2418 nvptx_print_address_operand (file, addr, mode);
2421 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2423 Meaning of CODE:
2424 . -- print the predicate for the instruction or an empty string for an
2425 unconditional one.
2426 # -- print a rounding mode for the instruction
2428 A -- print a data area for a MEM
2429 c -- print an opcode suffix for a comparison operator, including a type code
2430 D -- print a data area for a MEM operand
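H -- print the high part of a mode-split register pair
j -- print "@" followed by the operand (a branch predicate prefix)
J -- print "@!" followed by the operand (a negated predicate prefix)
L -- print the low part of a mode-split register pair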
2431 S -- print a shuffle kind specified by CONST_INT
2432 t -- print a type opcode suffix, promoting QImode to 32 bits
2433 T -- print a type size in bits
2434 u -- print a type opcode suffix without promotions. */
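/* As an illustrative sketch (register numbers made up), the template
     "%.\tadd%t0\t%0, %1, %2;"
   expands to
     @%r23 add.u32 %r2, %r3, %r4;
   under a predicate, and to the same text without the "@%r23" prefix
   when the insn is unconditional. */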
2436 static void
2437 nvptx_print_operand (FILE *file, rtx x, int code)
2439 if (code == '.')
2441 x = current_insn_predicate;
2442 if (x)
2444 fputs ("@", file);
2445 if (GET_CODE (x) == EQ)
2446 fputs ("!", file);
2447 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2449 return;
2451 else if (code == '#')
2453 fputs (".rn", file);
2454 return;
2457 enum rtx_code x_code = GET_CODE (x);
2458 machine_mode mode = GET_MODE (x);
2460 switch (code)
2462 case 'A':
2463 x = XEXP (x, 0);
2464 /* FALLTHROUGH. */
2466 case 'D':
2467 if (GET_CODE (x) == CONST)
2468 x = XEXP (x, 0);
2469 if (GET_CODE (x) == PLUS)
2470 x = XEXP (x, 0);
2472 if (GET_CODE (x) == SYMBOL_REF)
2473 fputs (section_for_sym (x), file);
2474 break;
2476 case 't':
2477 case 'u':
2478 if (x_code == SUBREG)
2480 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2481 if (VECTOR_MODE_P (inner_mode)
2482 && (GET_MODE_SIZE (mode)
2483 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2484 mode = GET_MODE_INNER (inner_mode);
2485 else if (split_mode_p (inner_mode))
2486 mode = maybe_split_mode (inner_mode);
2487 else
2488 mode = inner_mode;
2490 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2491 break;
2493 case 'H':
2494 case 'L':
2496 rtx inner_x = SUBREG_REG (x);
2497 machine_mode inner_mode = GET_MODE (inner_x);
2498 machine_mode split = maybe_split_mode (inner_mode);
2500 output_reg (file, REGNO (inner_x), split,
2501 (code == 'H'
2502 ? GET_MODE_SIZE (inner_mode) / 2
2503 : 0));
2505 break;
2507 case 'S':
2509 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2510 /* Same order as nvptx_shuffle_kind. */
2511 static const char *const kinds[] =
2512 {".up", ".down", ".bfly", ".idx"};
2513 fputs (kinds[kind], file);
2515 break;
2517 case 'T':
2518 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2519 break;
2521 case 'j':
2522 fprintf (file, "@");
2523 goto common;
2525 case 'J':
2526 fprintf (file, "@!");
2527 goto common;
2529 case 'c':
2530 mode = GET_MODE (XEXP (x, 0));
2531 switch (x_code)
2533 case EQ:
2534 fputs (".eq", file);
2535 break;
2536 case NE:
2537 if (FLOAT_MODE_P (mode))
2538 fputs (".neu", file);
2539 else
2540 fputs (".ne", file);
2541 break;
2542 case LE:
2543 case LEU:
2544 fputs (".le", file);
2545 break;
2546 case GE:
2547 case GEU:
2548 fputs (".ge", file);
2549 break;
2550 case LT:
2551 case LTU:
2552 fputs (".lt", file);
2553 break;
2554 case GT:
2555 case GTU:
2556 fputs (".gt", file);
2557 break;
2558 case LTGT:
2559 fputs (".ne", file);
2560 break;
2561 case UNEQ:
2562 fputs (".equ", file);
2563 break;
2564 case UNLE:
2565 fputs (".leu", file);
2566 break;
2567 case UNGE:
2568 fputs (".geu", file);
2569 break;
2570 case UNLT:
2571 fputs (".ltu", file);
2572 break;
2573 case UNGT:
2574 fputs (".gtu", file);
2575 break;
2576 case UNORDERED:
2577 fputs (".nan", file);
2578 break;
2579 case ORDERED:
2580 fputs (".num", file);
2581 break;
2582 default:
2583 gcc_unreachable ();
2585 if (FLOAT_MODE_P (mode)
2586 || x_code == EQ || x_code == NE
2587 || x_code == GEU || x_code == GTU
2588 || x_code == LEU || x_code == LTU)
2589 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2590 else
2591 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2592 break;
2593 default:
2594 common:
2595 switch (x_code)
2597 case SUBREG:
2599 rtx inner_x = SUBREG_REG (x);
2600 machine_mode inner_mode = GET_MODE (inner_x);
2601 machine_mode split = maybe_split_mode (inner_mode);
2603 if (VECTOR_MODE_P (inner_mode)
2604 && (GET_MODE_SIZE (mode)
2605 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2607 output_reg (file, REGNO (inner_x), VOIDmode);
2608 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2610 else if (split_mode_p (inner_mode)
2611 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2612 output_reg (file, REGNO (inner_x), split);
2613 else
2614 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2616 break;
2618 case REG:
2619 output_reg (file, REGNO (x), maybe_split_mode (mode));
2620 break;
2622 case MEM:
2623 fputc ('[', file);
2624 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2625 fputc (']', file);
2626 break;
2628 case CONST_INT:
2629 output_addr_const (file, x);
2630 break;
2632 case CONST:
2633 case SYMBOL_REF:
2634 case LABEL_REF:
2635 /* We could use output_addr_const, but that can print things like
2636 "x-8", which breaks ptxas. Need to ensure it is output as
2637 "x+-8". */
2638 nvptx_print_address_operand (file, x, VOIDmode);
2639 break;
2641 case CONST_DOUBLE:
2642 long vals[2];
2643 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2644 vals[0] &= 0xffffffff;
2645 vals[1] &= 0xffffffff;
2646 if (mode == SFmode)
2647 fprintf (file, "0f%08lx", vals[0]);
2648 else
2649 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2650 break;
2652 case CONST_VECTOR:
2654 unsigned n = CONST_VECTOR_NUNITS (x);
2655 fprintf (file, "{ ");
2656 for (unsigned i = 0; i < n; ++i)
2658 if (i != 0)
2659 fprintf (file, ", ");
2661 rtx elem = CONST_VECTOR_ELT (x, i);
2662 output_addr_const (file, elem);
2664 fprintf (file, " }");
2666 break;
2668 default:
2669 output_addr_const (file, x);
2674 /* Record replacement regs used to deal with subreg operands. */
2675 struct reg_replace
2677 rtx replacement[MAX_RECOG_OPERANDS];
2678 machine_mode mode;
2679 int n_allocated;
2680 int n_in_use;
2683 /* Allocate or reuse a replacement in R and return the rtx. */
2685 static rtx
2686 get_replacement (struct reg_replace *r)
2688 if (r->n_allocated == r->n_in_use)
2689 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2690 return r->replacement[r->n_in_use++];
2693 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2694 the presence of subregs would break the rules for most instructions.
2695 Replace them with a suitable new register of the right size, plus
2696 conversion copyin/copyout instructions. */
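/* As an illustrative sketch, an input operand such as
     (subreg:SI (reg:QI 22) 0)
   is replaced by a fresh SImode register, with a copy-in
     (set (reg:SI 40) (zero_extend:SI (reg:QI 22)))
   emitted before the insn; for an output operand the copy runs the
   other way, truncating the new register back into the inner one
   after the insn. */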
2698 static void
2699 nvptx_reorg_subreg (void)
2701 struct reg_replace qiregs, hiregs, siregs, diregs;
2702 rtx_insn *insn, *next;
2704 qiregs.n_allocated = 0;
2705 hiregs.n_allocated = 0;
2706 siregs.n_allocated = 0;
2707 diregs.n_allocated = 0;
2708 qiregs.mode = QImode;
2709 hiregs.mode = HImode;
2710 siregs.mode = SImode;
2711 diregs.mode = DImode;
2713 for (insn = get_insns (); insn; insn = next)
2715 next = NEXT_INSN (insn);
2716 if (!NONDEBUG_INSN_P (insn)
2717 || asm_noperands (PATTERN (insn)) >= 0
2718 || GET_CODE (PATTERN (insn)) == USE
2719 || GET_CODE (PATTERN (insn)) == CLOBBER)
2720 continue;
2722 qiregs.n_in_use = 0;
2723 hiregs.n_in_use = 0;
2724 siregs.n_in_use = 0;
2725 diregs.n_in_use = 0;
2726 extract_insn (insn);
2727 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2729 for (int i = 0; i < recog_data.n_operands; i++)
2731 rtx op = recog_data.operand[i];
2732 if (GET_CODE (op) != SUBREG)
2733 continue;
2735 rtx inner = SUBREG_REG (op);
2737 machine_mode outer_mode = GET_MODE (op);
2738 machine_mode inner_mode = GET_MODE (inner);
2739 gcc_assert (s_ok);
2740 if (s_ok
2741 && (GET_MODE_PRECISION (inner_mode)
2742 >= GET_MODE_PRECISION (outer_mode)))
2743 continue;
2744 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2745 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2746 : outer_mode == HImode ? &hiregs
2747 : outer_mode == SImode ? &siregs
2748 : &diregs);
2749 rtx new_reg = get_replacement (r);
2751 if (recog_data.operand_type[i] != OP_OUT)
2753 enum rtx_code code;
2754 if (GET_MODE_PRECISION (inner_mode)
2755 < GET_MODE_PRECISION (outer_mode))
2756 code = ZERO_EXTEND;
2757 else
2758 code = TRUNCATE;
2760 rtx pat = gen_rtx_SET (new_reg,
2761 gen_rtx_fmt_e (code, outer_mode, inner));
2762 emit_insn_before (pat, insn);
2765 if (recog_data.operand_type[i] != OP_IN)
2767 enum rtx_code code;
2768 if (GET_MODE_PRECISION (inner_mode)
2769 < GET_MODE_PRECISION (outer_mode))
2770 code = TRUNCATE;
2771 else
2772 code = ZERO_EXTEND;
2774 rtx pat = gen_rtx_SET (inner,
2775 gen_rtx_fmt_e (code, inner_mode, new_reg));
2776 emit_insn_after (pat, insn);
2778 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2783 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2784 first use. */
2786 static rtx
2787 nvptx_get_unisimt_master ()
2789 rtx &master = cfun->machine->unisimt_master;
2790 return master ? master : master = gen_reg_rtx (SImode);
2793 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2795 static rtx
2796 nvptx_get_unisimt_predicate ()
2798 rtx &pred = cfun->machine->unisimt_predicate;
2799 return pred ? pred : pred = gen_reg_rtx (BImode);
2802 /* Return true if given call insn references one of the functions provided by
2803 the CUDA runtime: malloc, free, vprintf. */
2805 static bool
2806 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2808 rtx pat = PATTERN (insn);
2809 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2810 pat = XVECEXP (pat, 0, 0);
2811 if (GET_CODE (pat) == SET)
2812 pat = SET_SRC (pat);
2813 gcc_checking_assert (GET_CODE (pat) == CALL
2814 && GET_CODE (XEXP (pat, 0)) == MEM);
2815 rtx addr = XEXP (XEXP (pat, 0), 0);
2816 if (GET_CODE (addr) != SYMBOL_REF)
2817 return false;
2818 const char *name = XSTR (addr, 0);
2819 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2820 references with forced assembler name refer to PTX syscalls. For vprintf,
2821 accept both normal and forced-assembler-name references. */
2822 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2823 || !strcmp (name, "*malloc")
2824 || !strcmp (name, "*free"));
2827 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2828 propagate its value from lane MASTER to current lane. */
2830 static void
2831 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2833 rtx reg;
2834 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2835 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2838 /* Adjust code for uniform-simt code generation variant by making atomics and
2839 "syscalls" conditionally executed, and inserting shuffle-based propagation
2840 for registers being set. */
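/* As an illustrative sketch (made-up operands), an atomic add
     atom.global.add.u32 %r1, [%rd1], 1;
   is rewritten to execute only in the master lane,
     @%r99 atom.global.add.u32 %r1, [%rd1], 1;
   followed by a shuffle broadcasting the result to the other lanes:
     shfl.idx.b32 %r1, %r1, %r98, 31;
   where %r99 holds the uniform-simt predicate and %r98 the master
   lane index. */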
2842 static void
2843 nvptx_reorg_uniform_simt ()
2845 rtx_insn *insn, *next;
2847 for (insn = get_insns (); insn; insn = next)
2849 next = NEXT_INSN (insn);
2850 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2851 && !(NONJUMP_INSN_P (insn)
2852 && GET_CODE (PATTERN (insn)) == PARALLEL
2853 && get_attr_atomic (insn)))
2854 continue;
2855 rtx pat = PATTERN (insn);
2856 rtx master = nvptx_get_unisimt_master ();
2857 for (int i = 0; i < XVECLEN (pat, 0); i++)
2858 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2859 rtx pred = nvptx_get_unisimt_predicate ();
2860 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2861 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2862 validate_change (insn, &PATTERN (insn), pat, false);
2866 /* Loop structure of the function. The entire function is described as
2867 a NULL loop. */
2869 struct parallel
2871 /* Parent parallel. */
2872 parallel *parent;
2874 /* Next sibling parallel. */
2875 parallel *next;
2877 /* First child parallel. */
2878 parallel *inner;
2880 /* Partitioning mask of the parallel. */
2881 unsigned mask;
2883 /* Partitioning used within inner parallels. */
2884 unsigned inner_mask;
2886 /* Location of parallel forked and join. The forked is the first
2887 block in the parallel and the join is the first block after
2888 the partition. */
2889 basic_block forked_block;
2890 basic_block join_block;
2892 rtx_insn *forked_insn;
2893 rtx_insn *join_insn;
2895 rtx_insn *fork_insn;
2896 rtx_insn *joining_insn;
2898 /* Basic blocks in this parallel, but not in child parallels. The
2899 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2900 blocks are not. */
2901 auto_vec<basic_block> blocks;
2903 public:
2904 parallel (parallel *parent, unsigned mode);
2905 ~parallel ();
2908 /* Constructor links the new parallel into its parent's chain of
2909 children. */
2911 parallel::parallel (parallel *parent_, unsigned mask_)
2912 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2914 forked_block = join_block = 0;
2915 forked_insn = join_insn = 0;
2916 fork_insn = joining_insn = 0;
2918 if (parent)
2920 next = parent->inner;
2921 parent->inner = this;
2925 parallel::~parallel ()
2927 delete inner;
2928 delete next;
2931 /* Map of basic blocks to insns. */
2932 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2934 /* A tuple of an insn of interest and the BB in which it resides. */
2935 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2936 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2938 /* Split basic blocks so that each forked and join unspec is at
2939 the start of its basic block. Thus afterwards each block will
2940 have a single partitioning mode. We also do the same for return
2941 insns, as they are executed by every thread. Populate MAP with
2943 head and tail blocks. We also clear the BB visited flag, which is
2944 used when finding partitions. */
2946 static void
2947 nvptx_split_blocks (bb_insn_map_t *map)
2949 insn_bb_vec_t worklist;
2950 basic_block block;
2951 rtx_insn *insn;
2953 /* Locate all the reorg instructions of interest. */
2954 FOR_ALL_BB_FN (block, cfun)
2956 bool seen_insn = false;
2958 /* Clear visited flag, for use by the parallel locator. */
2959 block->flags &= ~BB_VISITED;
2961 FOR_BB_INSNS (block, insn)
2963 if (!INSN_P (insn))
2964 continue;
2965 switch (recog_memoized (insn))
2967 default:
2968 seen_insn = true;
2969 continue;
2970 case CODE_FOR_nvptx_forked:
2971 case CODE_FOR_nvptx_join:
2972 break;
2974 case CODE_FOR_return:
2975 /* We also need to split just before return insns, as
2976 that insn needs executing by all threads, but the
2977 block it is in probably does not. */
2978 break;
2981 if (seen_insn)
2982 /* We've found an instruction that must be at the start of
2983 a block, but isn't. Add it to the worklist. */
2984 worklist.safe_push (insn_bb_t (insn, block));
2985 else
2986 /* It was already the first instruction. Just add it to
2987 the map. */
2988 map->get_or_insert (block) = insn;
2989 seen_insn = true;
2993 /* Split blocks on the worklist. */
2994 unsigned ix;
2995 insn_bb_t *elt;
2996 basic_block remap = 0;
2997 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2999 if (remap != elt->second)
3001 block = elt->second;
3002 remap = block;
3005 /* Split block before insn. The insn is in the new block. */
3006 edge e = split_block (block, PREV_INSN (elt->first));
3008 block = e->dest;
3009 map->get_or_insert (block) = elt->first;
3013 /* BLOCK is a basic block containing a head or tail instruction.
3014 Locate the associated prehead or pretail instruction, which must be
3015 in the single predecessor block. */
3017 static rtx_insn *
3018 nvptx_discover_pre (basic_block block, int expected)
3020 gcc_assert (block->preds->length () == 1);
3021 basic_block pre_block = (*block->preds)[0]->src;
3022 rtx_insn *pre_insn;
3024 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3025 pre_insn = PREV_INSN (pre_insn))
3026 gcc_assert (pre_insn != BB_HEAD (pre_block));
3028 gcc_assert (recog_memoized (pre_insn) == expected);
3029 return pre_insn;
3032 /* Dump this parallel and all its inner parallels. */
3034 static void
3035 nvptx_dump_pars (parallel *par, unsigned depth)
3037 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3038 depth, par->mask,
3039 par->forked_block ? par->forked_block->index : -1,
3040 par->join_block ? par->join_block->index : -1);
3042 fprintf (dump_file, " blocks:");
3044 basic_block block;
3045 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3046 fprintf (dump_file, " %d", block->index);
3047 fprintf (dump_file, "\n");
3048 if (par->inner)
3049 nvptx_dump_pars (par->inner, depth + 1);
3051 if (par->next)
3052 nvptx_dump_pars (par->next, depth);
3055 /* If BLOCK contains a fork/join marker, process it to create or
3056 terminate a loop structure. Add this block to the current loop,
3057 and then walk successor blocks. */
3059 static parallel *
3060 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3062 if (block->flags & BB_VISITED)
3063 return par;
3064 block->flags |= BB_VISITED;
3066 if (rtx_insn **endp = map->get (block))
3068 rtx_insn *end = *endp;
3070 /* This is a block head or tail, or return instruction. */
3071 switch (recog_memoized (end))
3073 case CODE_FOR_return:
3074 /* Return instructions are in their own block, and we
3075 don't need to do anything more. */
3076 return par;
3078 case CODE_FOR_nvptx_forked:
3079 /* Loop head, create a new inner loop and add it into
3080 our parent's child list. */
3082 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3084 gcc_assert (mask);
3085 par = new parallel (par, mask);
3086 par->forked_block = block;
3087 par->forked_insn = end;
3088 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3089 par->fork_insn
3090 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3092 break;
3094 case CODE_FOR_nvptx_join:
3095 /* A loop tail. Finish the current loop and return to
3096 parent. */
3098 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3100 gcc_assert (par->mask == mask);
3101 par->join_block = block;
3102 par->join_insn = end;
3103 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3104 par->joining_insn
3105 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3106 par = par->parent;
3108 break;
3110 default:
3111 gcc_unreachable ();
3115 if (par)
3116 /* Add this block onto the current loop's list of blocks. */
3117 par->blocks.safe_push (block);
3118 else
3119 /* This must be the entry block. Create a NULL parallel. */
3120 par = new parallel (0, 0);
3122 /* Walk successor blocks. */
3123 edge e;
3124 edge_iterator ei;
3126 FOR_EACH_EDGE (e, ei, block->succs)
3127 nvptx_find_par (map, par, e->dest);
3129 return par;
3132 /* DFS walk the CFG looking for fork & join markers. Construct
3133 loop structures as we go. MAP is a mapping of basic blocks
3134 to head & tail markers, discovered when splitting blocks. This
3135 speeds up the discovery. We rely on the BB visited flag having
3136 been cleared when splitting blocks. */
3138 static parallel *
3139 nvptx_discover_pars (bb_insn_map_t *map)
3141 basic_block block;
3143 /* Mark exit blocks as visited. */
3144 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3145 block->flags |= BB_VISITED;
3147 /* And entry block as not. */
3148 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3149 block->flags &= ~BB_VISITED;
3151 parallel *par = nvptx_find_par (map, 0, block);
3153 if (dump_file)
3155 fprintf (dump_file, "\nLoops\n");
3156 nvptx_dump_pars (par, 0);
3157 fprintf (dump_file, "\n");
3160 return par;
3163 /* Analyse a group of BBs within a partitioned region and create N
3164 Single-Entry-Single-Exit regions. Some of those regions will be
3165 trivial ones consisting of a single BB. The blocks of a
3166 partitioned region might form a set of disjoint graphs -- because
3167 the region encloses a differently partitioned subregion.
3169 We use the linear time algorithm described in 'Finding Regions Fast:
3170 Single Entry Single Exit and Control Regions in Linear Time'
3171 Johnson, Pearson & Pingali. That algorithm deals with complete
3172 CFGs, where a back edge is inserted from END to START, and thus the
3173 problem becomes one of finding equivalent loops.
3175 In this case we have a partial CFG. We complete it by redirecting
3176 any incoming edge to the graph to be from an arbitrary external BB,
3177 and similarly redirecting any outgoing edge to be to that BB.
3178 Thus we end up with a closed graph.
3180 The algorithm works by building a spanning tree of an undirected
3181 graph and keeping track of back edges from nodes further from the
3182 root in the tree to nodes nearer to the root in the tree. In the
3183 description below, the root is up and the tree grows downwards.
3185 We avoid having to deal with degenerate back-edges to the same
3186 block, by splitting each BB into 3 -- one for input edges, one for
3187 the node itself and one for the output edges. Such back edges are
3188 referred to as 'Brackets'. Cycle equivalent nodes will have the
3189 same set of brackets.
3191 Determining bracket equivalency is done by maintaining a list of
3192 brackets in such a manner that the list length and final bracket
3193 uniquely identify the set.
3195 We use coloring to mark all BBs with cycle equivalency with the
3196 same color. This is the output of the 'Finding Regions Fast'
3197 algorithm. Notice it doesn't actually find the set of nodes within
3198 a particular region, just unordered sets of nodes that are the
3199 entries and exits of SESE regions.
3201 After determining cycle equivalency, we need to find the minimal
3202 set of SESE regions. Do this with a DFS coloring walk of the
3203 complete graph. We're either 'looking' or 'coloring'. When
3204 looking, and we're in the subgraph, we start coloring the color of
3205 the current node, and remember that node as the start of the
3206 current color's SESE region. Every time we go to a new node, we
3207 decrement the count of nodes with that color. If it reaches zero,
3208 we remember that node as the end of the current color's SESE region
3209 and return to 'looking'. Otherwise we color the node the current
3210 color.
3212 This way we end up with coloring the inside of non-trivial SESE
3213 regions with the color of that region. */
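/* As an illustrative sketch, consider the diamond
     A -> B -> D
     A -> C -> D
   completed with a back edge from exit to entry. A and D then carry
   the same bracket set and receive the same color, so the coloring
   walk discovers {A,D} as the entry and exit of one SESE region whose
   interior (B and C) gets colored with that region's color. */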
3215 /* A pair of BBs. We use this to represent SESE regions. */
3216 typedef std::pair<basic_block, basic_block> bb_pair_t;
3217 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3219 /* A node in the undirected CFG. The discriminator SECOND indicates just
3220 above or just below the BB indicated by FIRST. */
3221 typedef std::pair<basic_block, int> pseudo_node_t;
3223 /* A bracket indicates an edge towards the root of the spanning tree of the
3224 undirected graph. Each bracket has a color, determined
3225 from the current set of brackets. */
3226 struct bracket
3228 pseudo_node_t back; /* Back target */
3230 /* Current color and size of set. */
3231 unsigned color;
3232 unsigned size;
3234 bracket (pseudo_node_t back_)
3235 : back (back_), color (~0u), size (~0u)
3239 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3241 if (length != size)
3243 size = length;
3244 color = color_counts.length ();
3245 color_counts.quick_push (0);
3247 color_counts[color]++;
3248 return color;
3252 typedef auto_vec<bracket> bracket_vec_t;
3254 /* Basic block info for finding SESE regions. */
3256 struct bb_sese
3258 int node; /* Node number in spanning tree. */
3259 int parent; /* Parent node number. */
3261 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3262 edges arrive at pseudo-node Ai and the outgoing edges leave at
3263 pseudo-node Ao. We have to remember which way we arrived at a
3264 particular node when generating the spanning tree. dir > 0 means
3265 we arrived at Ai, dir < 0 means we arrived at Ao. */
3266 int dir;
3268 /* Lowest numbered pseudo-node reached via a backedge from this
3269 node, or any descendant. */
3270 pseudo_node_t high;
3272 int color; /* Cycle-equivalence color */
3274 /* Stack of brackets for this node. */
3275 bracket_vec_t brackets;
3277 bb_sese (unsigned node_, unsigned p, int dir_)
3278 :node (node_), parent (p), dir (dir_)
3281 ~bb_sese ();
3283 /* Push a bracket ending at BACK. */
3284 void push (const pseudo_node_t &back)
3286 if (dump_file)
3287 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3288 back.first ? back.first->index : 0, back.second);
3289 brackets.safe_push (bracket (back));
3292 void append (bb_sese *child);
3293 void remove (const pseudo_node_t &);
3295 /* Set node's color. */
3296 void set_color (auto_vec<unsigned> &color_counts)
3298 color = brackets.last ().get_color (color_counts, brackets.length ());
3302 bb_sese::~bb_sese ()
3306 /* Destructively append CHILD's brackets. */
3308 void
3309 bb_sese::append (bb_sese *child)
3311 if (int len = child->brackets.length ())
3313 int ix;
3315 if (dump_file)
3317 for (ix = 0; ix < len; ix++)
3319 const pseudo_node_t &pseudo = child->brackets[ix].back;
3320 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3321 child->node, pseudo.first ? pseudo.first->index : 0,
3322 pseudo.second);
3325 if (!brackets.length ())
3326 std::swap (brackets, child->brackets);
3327 else
3329 brackets.reserve (len);
3330 for (ix = 0; ix < len; ix++)
3331 brackets.quick_push (child->brackets[ix]);
3336 /* Remove brackets that terminate at PSEUDO. */
3338 void
3339 bb_sese::remove (const pseudo_node_t &pseudo)
3341 unsigned removed = 0;
3342 int len = brackets.length ();
3344 for (int ix = 0; ix < len; ix++)
3346 if (brackets[ix].back == pseudo)
3348 if (dump_file)
3349 fprintf (dump_file, "Removing backedge %d:%+d\n",
3350 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3351 removed++;
3353 else if (removed)
3354 brackets[ix-removed] = brackets[ix];
3356 while (removed--)
3357 brackets.pop ();
3360 /* Accessors for BB's aux pointer. */
3361 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3362 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3364 /* DFS walk creating SESE data structures. Only cover nodes with
3365 BB_VISITED set. Append discovered blocks to LIST. We number in
3366 increments of 3 so that the above and below pseudo nodes can be
3367 implicitly numbered too. */
3369 static int
3370 nvptx_sese_number (int n, int p, int dir, basic_block b,
3371 auto_vec<basic_block> *list)
3373 if (BB_GET_SESE (b))
3374 return n;
3376 if (dump_file)
3377 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3378 b->index, n, p, dir);
3380 BB_SET_SESE (b, new bb_sese (n, p, dir));
3381 p = n;
3383 n += 3;
3384 list->quick_push (b);
3386 /* First walk the nodes on the 'other side' of this node, then walk
3387 the nodes on the same side. */
3388 for (unsigned ix = 2; ix; ix--)
3390 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3391 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3392 : offsetof (edge_def, src));
3393 edge e;
3394 edge_iterator ei;
3396 FOR_EACH_EDGE (e, ei, edges)
3398 basic_block target = *(basic_block *)((char *)e + offset);
3400 if (target->flags & BB_VISITED)
3401 n = nvptx_sese_number (n, p, dir, target, list);
3403 dir = -dir;
3405 return n;
3408 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3409 EDGES are the outgoing edges and OFFSET is the offset to the src
3410 or dst block on the edges. */
3412 static void
3413 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3414 vec<edge, va_gc> *edges, size_t offset)
3416 edge e;
3417 edge_iterator ei;
3418 int hi_back = depth;
3419 pseudo_node_t node_back (0, depth);
3420 int hi_child = depth;
3421 pseudo_node_t node_child (0, depth);
3422 basic_block child = NULL;
3423 unsigned num_children = 0;
3424 int usd = -dir * sese->dir;
3426 if (dump_file)
3427 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3428 me->index, sese->node, dir);
3430 if (dir < 0)
3432 /* This is the above pseudo-child. It has the BB itself as an
3433 additional child node. */
3434 node_child = sese->high;
3435 hi_child = node_child.second;
3436 if (node_child.first)
3437 hi_child += BB_GET_SESE (node_child.first)->node;
3438 num_children++;
3441 /* Examine each edge.
3442 - if it is a child (a) append its bracket list and (b) record
3443 whether it is the child with the highest reaching bracket.
3444 - if it is an edge to an ancestor, record whether it's the highest
3445 reaching backlink. */
3446 FOR_EACH_EDGE (e, ei, edges)
3448 basic_block target = *(basic_block *)((char *)e + offset);
3450 if (bb_sese *t_sese = BB_GET_SESE (target))
3452 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3454 /* Child node. Append its bracket list. */
3455 num_children++;
3456 sese->append (t_sese);
3458 /* Compare its hi value. */
3459 int t_hi = t_sese->high.second;
3461 if (basic_block child_hi_block = t_sese->high.first)
3462 t_hi += BB_GET_SESE (child_hi_block)->node;
3464 if (hi_child > t_hi)
3466 hi_child = t_hi;
3467 node_child = t_sese->high;
3468 child = target;
3471 else if (t_sese->node < sese->node + dir
3472 && !(dir < 0 && sese->parent == t_sese->node))
3474 /* Non-parental ancestor node -- a backlink. */
3475 int d = usd * t_sese->dir;
3476 int back = t_sese->node + d;
3478 if (hi_back > back)
3480 hi_back = back;
3481 node_back = pseudo_node_t (target, d);
3485 else
3486 { /* Fallen off graph, backlink to entry node. */
3487 hi_back = 0;
3488 node_back = pseudo_node_t (0, 0);
3492 /* Remove any brackets that terminate at this pseudo node. */
3493 sese->remove (pseudo_node_t (me, dir));
3495 /* Now push any backlinks from this pseudo node. */
3496 FOR_EACH_EDGE (e, ei, edges)
3498 basic_block target = *(basic_block *)((char *)e + offset);
3499 if (bb_sese *t_sese = BB_GET_SESE (target))
3501 if (t_sese->node < sese->node + dir
3502 && !(dir < 0 && sese->parent == t_sese->node))
3503 /* Non-parental ancestor node - backedge from me. */
3504 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3506 else
3508 /* back edge to entry node */
3509 sese->push (pseudo_node_t (0, 0));
3513 /* If this node leads directly or indirectly to a no-return region of
3514 the graph, then fake a backedge to entry node. */
3515 if (!sese->brackets.length () || !edges || !edges->length ())
3517 hi_back = 0;
3518 node_back = pseudo_node_t (0, 0);
3519 sese->push (node_back);
3522 /* Record the highest reaching backedge from us or a descendant. */
3523 sese->high = hi_back < hi_child ? node_back : node_child;
3525 if (num_children > 1)
3527 /* There is more than one child -- this is a Y shaped piece of
3528 spanning tree. We have to insert a fake backedge from this
3529 node to the highest ancestor reached by not-the-highest
3530 reaching child. Note that there may be multiple children
3531 with backedges to the same highest node. That's ok and we
3532 insert the edge to that highest node. */
3533 hi_child = depth;
3534 if (dir < 0 && child)
3536 node_child = sese->high;
3537 hi_child = node_child.second;
3538 if (node_child.first)
3539 hi_child += BB_GET_SESE (node_child.first)->node;
3542 FOR_EACH_EDGE (e, ei, edges)
3544 basic_block target = *(basic_block *)((char *)e + offset);
3546 if (target == child)
3547 /* Ignore the highest child. */
3548 continue;
3550 bb_sese *t_sese = BB_GET_SESE (target);
3551 if (!t_sese)
3552 continue;
3553 if (t_sese->parent != sese->node)
3554 /* Not a child. */
3555 continue;
3557 /* Compare its hi value. */
3558 int t_hi = t_sese->high.second;
3560 if (basic_block child_hi_block = t_sese->high.first)
3561 t_hi += BB_GET_SESE (child_hi_block)->node;
3563 if (hi_child > t_hi)
3565 hi_child = t_hi;
3566 node_child = t_sese->high;
3570 sese->push (node_child);
3575 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3576 proceed to successors. Set SESE entry and exit nodes of
3577 REGIONS. */
3579 static void
3580 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3581 basic_block block, int coloring)
3583 bb_sese *sese = BB_GET_SESE (block);
3585 if (block->flags & BB_VISITED)
3587 /* If we've already encountered this block, either we must not
3588 be coloring, or it must have been colored the current color. */
3589 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3590 return;
3593 block->flags |= BB_VISITED;
3595 if (sese)
3597 if (coloring < 0)
3599 /* Start coloring a region. */
3600 regions[sese->color].first = block;
3601 coloring = sese->color;
3604 if (!--color_counts[sese->color] && sese->color == coloring)
3606 /* Found final block of SESE region. */
3607 regions[sese->color].second = block;
3608 coloring = -1;
3610 else
3611 /* Color the node, so we can assert on revisiting the node
3612 that the graph is indeed SESE. */
3613 sese->color = coloring;
3615 else
3616 /* Fallen off the subgraph, we cannot be coloring. */
3617 gcc_assert (coloring < 0);
3619 /* Walk each successor block. */
3620 if (block->succs && block->succs->length ())
3622 edge e;
3623 edge_iterator ei;
3625 FOR_EACH_EDGE (e, ei, block->succs)
3626 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3628 else
3629 gcc_assert (coloring < 0);
3632 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3633 end up with NULL entries in it. */
3635 static void
3636 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3638 basic_block block;
3639 int ix;
3641 /* First clear each BB of the whole function. */
3642 FOR_ALL_BB_FN (block, cfun)
3644 block->flags &= ~BB_VISITED;
3645 BB_SET_SESE (block, 0);
3648 /* Mark blocks in the function that are in this graph. */
3649 for (ix = 0; blocks.iterate (ix, &block); ix++)
3650 block->flags |= BB_VISITED;
3652 /* Counts of nodes assigned to each color. There cannot be more
3653 colors than blocks (and hopefully there will be fewer). */
3654 auto_vec<unsigned> color_counts;
3655 color_counts.reserve (blocks.length ());
3657 /* Worklist of nodes in the spanning tree. Again, there cannot be
3658 more nodes in the tree than blocks (there will be fewer if the
3659 CFG of blocks is disjoint). */
3660 auto_vec<basic_block> spanlist;
3661 spanlist.reserve (blocks.length ());
3663 /* Make sure every block has its cycle class determined. */
3664 for (ix = 0; blocks.iterate (ix, &block); ix++)
3666 if (BB_GET_SESE (block))
3667 /* We already met this block in an earlier graph solve. */
3668 continue;
3670 if (dump_file)
3671 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3673 /* Number the nodes reachable from block in initial DFS order. */
3674 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3676 /* Now walk in reverse DFS order to find cycle equivalents. */
3677 while (spanlist.length ())
3679 block = spanlist.pop ();
3680 bb_sese *sese = BB_GET_SESE (block);
3682 /* Do the pseudo node below. */
3683 nvptx_sese_pseudo (block, sese, depth, +1,
3684 sese->dir > 0 ? block->succs : block->preds,
3685 (sese->dir > 0 ? offsetof (edge_def, dest)
3686 : offsetof (edge_def, src)));
3687 sese->set_color (color_counts);
3688 /* Do the pseudo node above. */
3689 nvptx_sese_pseudo (block, sese, depth, -1,
3690 sese->dir < 0 ? block->succs : block->preds,
3691 (sese->dir < 0 ? offsetof (edge_def, dest)
3692 : offsetof (edge_def, src)));
3694 if (dump_file)
3695 fprintf (dump_file, "\n");
3698 if (dump_file)
3700 unsigned count;
3701 const char *comma = "";
3703 fprintf (dump_file, "Found %d cycle equivalents\n",
3704 color_counts.length ());
3705 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3707 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3709 comma = "";
3710 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3711 if (BB_GET_SESE (block)->color == ix)
3713 block->flags |= BB_VISITED;
3714 fprintf (dump_file, "%s%d", comma, block->index);
3715 comma=",";
3717 fprintf (dump_file, "}");
3718 comma = ", ";
3720 fprintf (dump_file, "\n");
3723 /* Now we've colored every block in the subgraph. We now need to
3724 determine the minimal set of SESE regions that cover that
3725 subgraph. Do this with a DFS walk of the complete function.
3726 During the walk we're either 'looking' or 'coloring'. When we
3727 reach the last node of a particular color, we stop coloring and
3728 return to looking. */
3730 /* There cannot be more SESE regions than colors. */
3731 regions.reserve (color_counts.length ());
3732 for (ix = color_counts.length (); ix--;)
3733 regions.quick_push (bb_pair_t (0, 0));
3735 for (ix = 0; blocks.iterate (ix, &block); ix++)
3736 block->flags &= ~BB_VISITED;
3738 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3740 if (dump_file)
3742 const char *comma = "";
3743 int len = regions.length ();
3745 fprintf (dump_file, "SESE regions:");
3746 for (ix = 0; ix != len; ix++)
3748 basic_block from = regions[ix].first;
3749 basic_block to = regions[ix].second;
3751 if (from)
3753 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3754 if (to != from)
3755 fprintf (dump_file, "->%d", to->index);
3757 int color = BB_GET_SESE (from)->color;
3759 /* Print the blocks within the region (excluding ends). */
3760 FOR_EACH_BB_FN (block, cfun)
3762 bb_sese *sese = BB_GET_SESE (block);
3764 if (sese && sese->color == color
3765 && block != from && block != to)
3766 fprintf (dump_file, ".%d", block->index);
3768 fprintf (dump_file, "}");
3770 comma = ",";
3772 fprintf (dump_file, "\n\n");
3775 for (ix = 0; blocks.iterate (ix, &block); ix++)
3776 delete BB_GET_SESE (block);
3779 #undef BB_SET_SESE
3780 #undef BB_GET_SESE
3782 /* Propagate live state at the start of a partitioned region. IS_CALL
3783 indicates whether the propagation is for a (partitioned) call
3784 instruction. BLOCK provides the live register information, and
3785 might not contain INSN. Propagation is inserted just after INSN. RW
3786 indicates whether we are reading and/or writing state. This
3787 separation is needed for worker-level propagation where we
3788 essentially do a spill & fill. FN is the underlying worker
3789 function to generate the propagation instructions for a single
3790 register. DATA is user data.
3792 Returns true if we didn't emit any instructions.
3794 We propagate the live register set for non-calls, and the entire
3795 frame for both calls and non-calls. We could do better by (a)
3796 propagating just the live set that is used within the partitioned
3797 regions and (b) only propagating stack entries that are used. The
3798 latter might be quite hard to determine. */
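/* As an illustrative sketch, for a frame of FS DImode slots the
   emitted sequence is roughly
     ptr = frame_pointer; idx = FS;
   loop:
     idx -= 1;
     tmp = *ptr;        // when PM_read
     ... fn (tmp) ...   // shuffle / spill / fill step
     *ptr = tmp;        // when PM_write
     ptr += 8;
     if (idx != 0) goto loop;
   followed by one fn-generated copy per live pseudo register. */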
3800 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3802 static bool
3803 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3804 propagate_mask rw, propagator_fn fn, void *data)
3806 bitmap live = DF_LIVE_IN (block);
3807 bitmap_iterator iterator;
3808 unsigned ix;
3809 bool empty = true;
3811 /* Copy the frame array. */
3812 HOST_WIDE_INT fs = get_frame_size ();
3813 if (fs)
3815 rtx tmp = gen_reg_rtx (DImode);
3816 rtx idx = NULL_RTX;
3817 rtx ptr = gen_reg_rtx (Pmode);
3818 rtx pred = NULL_RTX;
3819 rtx_code_label *label = NULL;
3821 empty = false;
3822 /* The frame size might not be DImode compatible, but the frame
3823 array's declaration will be. So it's ok to round up here. */
3824 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3825 /* Detect single iteration loop. */
3826 if (fs == 1)
3827 fs = 0;
3829 start_sequence ();
3830 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3831 if (fs)
3833 idx = gen_reg_rtx (SImode);
3834 pred = gen_reg_rtx (BImode);
3835 label = gen_label_rtx ();
3837 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3838 /* Allow worker function to initialize anything needed. */
3839 rtx init = fn (tmp, PM_loop_begin, fs, data);
3840 if (init)
3841 emit_insn (init);
3842 emit_label (label);
3843 LABEL_NUSES (label)++;
3844 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3846 if (rw & PM_read)
3847 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3848 emit_insn (fn (tmp, rw, fs, data));
3849 if (rw & PM_write)
3850 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3851 if (fs)
3853 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3854 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3855 emit_insn (gen_br_true_uni (pred, label));
3856 rtx fini = fn (tmp, PM_loop_end, fs, data);
3857 if (fini)
3858 emit_insn (fini);
3859 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3861 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3862 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3863 rtx cpy = get_insns ();
3864 end_sequence ();
3865 insn = emit_insn_after (cpy, insn);
3868 if (!is_call)
3869 /* Copy live registers. */
3870 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3872 rtx reg = regno_reg_rtx[ix];
3874 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3876 rtx bcast = fn (reg, rw, 0, data);
3878 insn = emit_insn_after (bcast, insn);
3879 empty = false;
3882 return empty;
3885 /* Worker for nvptx_vpropagate. */
3887 static rtx
3888 vprop_gen (rtx reg, propagate_mask pm,
3889 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3891 if (!(pm & PM_read_write))
3892 return 0;
3894 return nvptx_gen_vcast (reg);
3897 /* Propagate state that is live at start of BLOCK across the vectors
3898 of a single warp. Propagation is inserted just after INSN.
3899 IS_CALL and return as for nvptx_propagate. */
3901 static bool
3902 nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
3904 return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
3907 /* Worker for nvptx_wpropagate. */
3909 static rtx
3910 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3912 wcast_data_t *data = (wcast_data_t *)data_;
3914 if (pm & PM_loop_begin)
3916 /* Starting a loop, initialize pointer. */
3917 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3919 if (align > worker_bcast_align)
3920 worker_bcast_align = align;
3921 data->offset = (data->offset + align - 1) & ~(align - 1);
3923 data->ptr = gen_reg_rtx (Pmode);
3925 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3927 else if (pm & PM_loop_end)
3929 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3930 data->ptr = NULL_RTX;
3931 return clobber;
3933 else
3934 return nvptx_gen_wcast (reg, pm, rep, data);
3937 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3938 indicates if this is just before partitioned mode (do spill), or
3939 just after it starts (do fill). Sequence is inserted just after
3940 INSN. IS_CALL and return as for nvptx_propagate. */
3942 static bool
3943 nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
3945 wcast_data_t data;
3947 data.base = gen_reg_rtx (Pmode);
3948 data.offset = 0;
3949 data.ptr = NULL_RTX;
3951 bool empty = nvptx_propagate (is_call, block, insn,
3952 pre_p ? PM_read : PM_write, wprop_gen, &data);
3953 gcc_assert (empty == !data.offset);
3954 if (data.offset)
3956 /* Stuff was emitted, initialize the base pointer now. */
3957 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3958 emit_insn_after (init, insn);
3960 if (worker_bcast_size < data.offset)
3961 worker_bcast_size = data.offset;
3963 return empty;
3966 /* Emit a worker-level synchronization barrier. We use different
3967 markers for before and after synchronizations. */
3969 static rtx
3970 nvptx_wsync (bool after)
3972 return gen_nvptx_barsync (GEN_INT (after));
3975 #if WORKAROUND_PTXJIT_BUG
3976 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3977 real insns. */
3979 static rtx_insn *
3980 bb_first_real_insn (basic_block bb)
3982 rtx_insn *insn;
3984 /* Find the first real insn in BB. */
3985 FOR_BB_INSNS (bb, insn)
3986 if (INSN_P (insn))
3987 return insn;
3989 return 0;
3991 #endif
3993 /* Return true if INSN needs neutering. */
3995 static bool
3996 needs_neutering_p (rtx_insn *insn)
3998 if (!INSN_P (insn))
3999 return false;
4001 switch (recog_memoized (insn))
4003 case CODE_FOR_nvptx_fork:
4004 case CODE_FOR_nvptx_forked:
4005 case CODE_FOR_nvptx_joining:
4006 case CODE_FOR_nvptx_join:
4007 case CODE_FOR_nvptx_barsync:
4008 return false;
4009 default:
4010 return true;
4014 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4016 static bool
4017 verify_neutering_jumps (basic_block from,
4018 rtx_insn *vector_jump, rtx_insn *worker_jump,
4019 rtx_insn *vector_label, rtx_insn *worker_label)
4021 basic_block bb = from;
4022 rtx_insn *insn = BB_HEAD (bb);
4023 bool seen_worker_jump = false;
4024 bool seen_vector_jump = false;
4025 bool seen_worker_label = false;
4026 bool seen_vector_label = false;
4027 bool worker_neutered = false;
4028 bool vector_neutered = false;
4029 while (true)
4031 if (insn == worker_jump)
4033 seen_worker_jump = true;
4034 worker_neutered = true;
4035 gcc_assert (!vector_neutered);
4037 else if (insn == vector_jump)
4039 seen_vector_jump = true;
4040 vector_neutered = true;
4042 else if (insn == worker_label)
4044 seen_worker_label = true;
4045 gcc_assert (worker_neutered);
4046 worker_neutered = false;
4048 else if (insn == vector_label)
4050 seen_vector_label = true;
4051 gcc_assert (vector_neutered);
4052 vector_neutered = false;
4054 else if (INSN_P (insn))
4055 switch (recog_memoized (insn))
4057 case CODE_FOR_nvptx_barsync:
4058 gcc_assert (!vector_neutered && !worker_neutered);
4059 break;
4060 default:
4061 break;
4064 if (insn != BB_END (bb))
4065 insn = NEXT_INSN (insn);
4066 else if (JUMP_P (insn) && single_succ_p (bb)
4067 && !seen_vector_jump && !seen_worker_jump)
4069 bb = single_succ (bb);
4070 insn = BB_HEAD (bb);
4072 else
4073 break;
4076 gcc_assert (!(vector_jump && !seen_vector_jump));
4077 gcc_assert (!(worker_jump && !seen_worker_jump));
4079 if (seen_vector_label || seen_worker_label)
4081 gcc_assert (!(vector_label && !seen_vector_label));
4082 gcc_assert (!(worker_label && !seen_worker_label));
4084 return true;
4087 return false;
4090 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4092 static void
4093 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4094 rtx_insn *worker_label)
4096 basic_block bb = to;
4097 rtx_insn *insn = BB_END (bb);
4098 bool seen_worker_label = false;
4099 bool seen_vector_label = false;
4100 while (true)
4102 if (insn == worker_label)
4104 seen_worker_label = true;
4105 gcc_assert (!seen_vector_label);
4107 else if (insn == vector_label)
4108 seen_vector_label = true;
4109 else if (INSN_P (insn))
4110 switch (recog_memoized (insn))
4112 case CODE_FOR_nvptx_barsync:
4113 gcc_assert (!seen_vector_label && !seen_worker_label);
4114 break;
4117 if (insn != BB_HEAD (bb))
4118 insn = PREV_INSN (insn);
4119 else
4120 break;
4123 gcc_assert (!(vector_label && !seen_vector_label));
4124 gcc_assert (!(worker_label && !seen_worker_label));
4127 /* Single neutering according to MASK. FROM is the incoming block and
4128 TO is the outgoing block. These may be the same block. Insert at
4129 start of FROM:
4131 if (tid.<axis>) goto end.
4133 and insert before ending branch of TO (if there is such an insn):
4135 end:
4136 <possibly-broadcast-cond>
4137 <branch>
4139 We currently only use different FROM and TO when skipping an entire
4140 loop. We could do more if we detected superblocks. */
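/* For instance, worker-single neutering of a block emits, in outline
   (illustrative PTX; the predicate register is assumed precomputed):
     @%rworker bra.uni $Lskip;	// non-zero worker ids skip ahead
     ...			// code now executed by worker 0 only
    $Lskip:
   and, if the block ends in a conditional branch, the branch
   condition is broadcast to the neutered threads first so that all
   of them take the same exit.  */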
4142 static void
4143 nvptx_single (unsigned mask, basic_block from, basic_block to)
4145 rtx_insn *head = BB_HEAD (from);
4146 rtx_insn *tail = BB_END (to);
4147 unsigned skip_mask = mask;
4149 while (true)
4151 /* Find first insn of from block. */
4152 while (head != BB_END (from) && !needs_neutering_p (head))
4153 head = NEXT_INSN (head);
4155 if (from == to)
4156 break;
4158 if (!(JUMP_P (head) && single_succ_p (from)))
4159 break;
4161 basic_block jump_target = single_succ (from);
4162 if (!single_pred_p (jump_target))
4163 break;
4165 from = jump_target;
4166 head = BB_HEAD (from);
4169 /* Find the last insn of the TO block.  */
4170 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4171 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4172 tail = PREV_INSN (tail);
4174 /* Detect if tail is a branch. */
4175 rtx tail_branch = NULL_RTX;
4176 rtx cond_branch = NULL_RTX;
4177 if (tail && INSN_P (tail))
4179 tail_branch = PATTERN (tail);
4180 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4181 tail_branch = NULL_RTX;
4182 else
4184 cond_branch = SET_SRC (tail_branch);
4185 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4186 cond_branch = NULL_RTX;
4190 if (tail == head)
4192 /* If this is empty, do nothing. */
4193 if (!head || !needs_neutering_p (head))
4194 return;
4196 if (cond_branch)
4198 /* If we're only doing vector single, there's no need to
4199 emit skip code because we'll not insert anything. */
4200 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4201 skip_mask = 0;
4203 else if (tail_branch)
4204 /* Block with only unconditional branch. Nothing to do. */
4205 return;
4208 /* Insert the vector test inside the worker test. */
4209 unsigned mode;
4210 rtx_insn *before = tail;
4211 rtx_insn *neuter_start = NULL;
4212 rtx_insn *worker_label = NULL, *vector_label = NULL;
4213 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4214 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4215 if (GOMP_DIM_MASK (mode) & skip_mask)
4217 rtx_code_label *label = gen_label_rtx ();
4218 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4219 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4220 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4222 if (!pred)
4224 pred = gen_reg_rtx (BImode);
4225 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4228 rtx br;
4229 if (mode == GOMP_DIM_VECTOR)
4230 br = gen_br_true (pred, label);
4231 else
4232 br = gen_br_true_uni (pred, label);
4233 if (neuter_start)
4234 neuter_start = emit_insn_after (br, neuter_start);
4235 else
4236 neuter_start = emit_insn_before (br, head);
4237 *mode_jump = neuter_start;
4239 LABEL_NUSES (label)++;
4240 rtx_insn *label_insn;
4241 if (tail_branch)
4243 label_insn = emit_label_before (label, before);
4244 before = label_insn;
4246 else
4248 label_insn = emit_label_after (label, tail);
4249 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4250 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4251 emit_insn_after (gen_exit (), label_insn);
4254 if (mode == GOMP_DIM_VECTOR)
4255 vector_label = label_insn;
4256 else
4257 worker_label = label_insn;
4260 /* Now deal with propagating the branch condition. */
4261 if (cond_branch)
4263 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4265 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4267 /* Vector mode only, do a shuffle. */
4268 #if WORKAROUND_PTXJIT_BUG
4269 /* The branch condition %rcond is propagated like this:
4272 .reg .u32 %x;
4273 mov.u32 %x,%tid.x;
4274 setp.ne.u32 %rnotvzero,%x,0;
4277 @%rnotvzero bra Lskip;
4278 setp.<op>.<type> %rcond,op1,op2;
4279 Lskip:
4280 selp.u32 %rcondu32,1,0,%rcond;
4281 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4282 setp.ne.u32 %rcond,%rcondu32,0;
4284 There seems to be a bug in the ptx JIT compiler (observed at driver
4285 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4286 unless %rcond is initialized to something before 'bra Lskip'. The
4287 bug is not observed with ptxas from cuda 8.0.61.
4289 It is true that the code is non-trivial: at Lskip, %rcond is
4290 uninitialized in threads 1-31, and after the selp the same holds
4291 for %rcondu32. But shfl propagates the defined value in thread 0
4292 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4293 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4295 There is nothing in the PTX spec to suggest that this is wrong, or
4296 to explain why the extra initialization is needed. So, we classify
4297 it as a JIT bug, and the extra initialization as workaround:
4300 .reg .u32 %x;
4301 mov.u32 %x,%tid.x;
4302 setp.ne.u32 %rnotvzero,%x,0;
4305 +.reg .pred %rcond2;
4306 +setp.eq.u32 %rcond2, 1, 0;
4308 @%rnotvzero bra Lskip;
4309 setp.<op>.<type> %rcond,op1,op2;
4310 +mov.pred %rcond2, %rcond;
4311 Lskip:
4312 +mov.pred %rcond, %rcond2;
4313 selp.u32 %rcondu32,1,0,%rcond;
4314 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4315 setp.ne.u32 %rcond,%rcondu32,0;
4317 rtx_insn *label = PREV_INSN (tail);
4318 gcc_assert (label && LABEL_P (label));
4319 rtx tmp = gen_reg_rtx (BImode);
4320 emit_insn_before (gen_movbi (tmp, const0_rtx),
4321 bb_first_real_insn (from));
4322 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4323 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4324 #endif
4325 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4327 else
4329 /* Includes worker mode, do spill & fill. By construction
4330 we should never have worker mode only. */
4331 wcast_data_t data;
4333 data.base = worker_bcast_sym;
4334 data.ptr = 0;
4336 if (worker_bcast_size < GET_MODE_SIZE (SImode))
4337 worker_bcast_size = GET_MODE_SIZE (SImode);
4339 data.offset = 0;
4340 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4341 before);
4342 /* Barrier so other workers can see the write. */
4343 emit_insn_before (nvptx_wsync (false), tail);
4344 data.offset = 0;
4345 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4346 /* This barrier is needed to avoid worker zero clobbering
4347 the broadcast buffer before all the other workers have
4348 had a chance to read this instance of it. */
4349 emit_insn_before (nvptx_wsync (true), tail);
4352 extract_insn (tail);
4353 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4354 UNSPEC_BR_UNIFIED);
4355 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4358 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4359 vector_label, worker_label);
4360 if (!seen_label)
4361 verify_neutering_labels (to, vector_label, worker_label);
4364 /* PAR is a parallel that is being skipped in its entirety according to
4365 MASK. Treat this as skipping a superblock starting at forked
4366 and ending at joining. */
4368 static void
4369 nvptx_skip_par (unsigned mask, parallel *par)
4371 basic_block tail = par->join_block;
4372 gcc_assert (tail->preds->length () == 1);
4374 basic_block pre_tail = (*tail->preds)[0]->src;
4375 gcc_assert (pre_tail->succs->length () == 1);
4377 nvptx_single (mask, par->forked_block, pre_tail);
4380 /* If PAR has a single inner parallel and PAR itself only contains
4381 empty entry and exit blocks, swallow the inner PAR. */
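/* E.g. an outer worker-partitioned par whose two blocks merely wrap a
   single inner vector-partitioned par is merged into one
   worker+vector par, saving a fork/join level.  */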
4383 static void
4384 nvptx_optimize_inner (parallel *par)
4386 parallel *inner = par->inner;
4388 /* We mustn't be the outer dummy par. */
4389 if (!par->mask)
4390 return;
4392 /* We must have a single inner par. */
4393 if (!inner || inner->next)
4394 return;
4396 /* We must only contain 2 blocks ourselves -- the head and tail of
4397 the inner par. */
4398 if (par->blocks.length () != 2)
4399 return;
4401 /* The two partitionings must be disjoint.  As we only have vector and
4402 worker partitioning, this is sufficient to guarantee the pars
4403 have adjacent partitioning. */
4404 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4405 /* This indicates malformed code generation. */
4406 return;
4408 /* The outer forked insn should be immediately followed by the inner
4409 fork insn. */
4410 rtx_insn *forked = par->forked_insn;
4411 rtx_insn *fork = BB_END (par->forked_block);
4413 if (NEXT_INSN (forked) != fork)
4414 return;
4415 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4417 /* The outer joining insn must immediately follow the inner join
4418 insn. */
4419 rtx_insn *joining = par->joining_insn;
4420 rtx_insn *join = inner->join_insn;
4421 if (NEXT_INSN (join) != joining)
4422 return;
4424 /* Preconditions met. Swallow the inner par. */
4425 if (dump_file)
4426 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4427 inner->mask, inner->forked_block->index,
4428 inner->join_block->index,
4429 par->mask, par->forked_block->index, par->join_block->index);
4431 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4433 par->blocks.reserve (inner->blocks.length ());
4434 while (inner->blocks.length ())
4435 par->blocks.quick_push (inner->blocks.pop ());
4437 par->inner = inner->inner;
4438 inner->inner = NULL;
4440 delete inner;
4443 /* Process the parallel PAR and all its contained
4444 parallels. We do everything but the neutering. Return mask of
4445 partitioned modes used within this parallel. */
4447 static unsigned
4448 nvptx_process_pars (parallel *par)
4450 if (nvptx_optimize)
4451 nvptx_optimize_inner (par);
4453 unsigned inner_mask = par->mask;
4455 /* Do the inner parallels first. */
4456 if (par->inner)
4458 par->inner_mask = nvptx_process_pars (par->inner);
4459 inner_mask |= par->inner_mask;
4462 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4464 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4466 nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
4467 bool empty = nvptx_wpropagate (true, is_call,
4468 par->forked_block, par->fork_insn);
4470 if (!empty || !is_call)
4472 /* Insert begin and end synchronizations. */
4473 emit_insn_before (nvptx_wsync (false), par->forked_insn);
4474 emit_insn_before (nvptx_wsync (true), par->join_insn);
4477 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4478 nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
4480 /* Now do siblings. */
4481 if (par->next)
4482 inner_mask |= nvptx_process_pars (par->next);
4483 return inner_mask;
4486 /* Neuter the parallel described by PAR. We recurse in depth-first
4487 order. MODES are the partitioning of the execution and OUTER is
4488 the partitioning of the parallels we are contained in. */
4490 static void
4491 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4493 unsigned me = (par->mask
4494 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4495 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4496 unsigned skip_mask = 0, neuter_mask = 0;
4498 if (par->inner)
4499 nvptx_neuter_pars (par->inner, modes, outer | me);
4501 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4503 if ((outer | me) & GOMP_DIM_MASK (mode))
4504 {} /* Mode is partitioned: no neutering. */
4505 else if (!(modes & GOMP_DIM_MASK (mode)))
4506 {} /* Mode is not used: nothing to do. */
4507 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4508 || !par->forked_insn)
4509 /* Partitioned in inner parallels, or we're not partitioned
4510 at all: neuter individual blocks. */
4511 neuter_mask |= GOMP_DIM_MASK (mode);
4512 else if (!par->parent || !par->parent->forked_insn
4513 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4514 /* Parent isn't a parallel, or already contains this partitioning:
4515 skip the parallel at this level. */
4516 skip_mask |= GOMP_DIM_MASK (mode);
4517 else
4518 {} /* Parent will skip this parallel itself. */
4521 if (neuter_mask)
4523 int ix, len;
4525 if (nvptx_optimize)
4527 /* Neuter whole SESE regions. */
4528 bb_pair_vec_t regions;
4530 nvptx_find_sese (par->blocks, regions);
4531 len = regions.length ();
4532 for (ix = 0; ix != len; ix++)
4534 basic_block from = regions[ix].first;
4535 basic_block to = regions[ix].second;
4537 if (from)
4538 nvptx_single (neuter_mask, from, to);
4539 else
4540 gcc_assert (!to);
4543 else
4545 /* Neuter each BB individually. */
4546 len = par->blocks.length ();
4547 for (ix = 0; ix != len; ix++)
4549 basic_block block = par->blocks[ix];
4551 nvptx_single (neuter_mask, block, block);
4556 if (skip_mask)
4557 nvptx_skip_par (skip_mask, par);
4559 if (par->next)
4560 nvptx_neuter_pars (par->next, modes, outer);
4563 #if WORKAROUND_PTXJIT_BUG_2
4564 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4565 is needed in the nvptx target because the branches generated for
4566 partitioning are NONJUMP_INSN_P, not JUMP_P. */
4568 static rtx
4569 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4571 rtx pat;
4572 if ((strict && !JUMP_P (insn))
4573 || (!strict && !INSN_P (insn)))
4574 return NULL_RTX;
4575 pat = PATTERN (insn);
4577 /* The set is allowed to appear either as the insn pattern or
4578 the first set in a PARALLEL. */
4579 if (GET_CODE (pat) == PARALLEL)
4580 pat = XVECEXP (pat, 0, 0);
4581 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4582 return pat;
4584 return NULL_RTX;
4587 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4589 static rtx
4590 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4592 rtx x = nvptx_pc_set (insn, strict);
4594 if (!x)
4595 return NULL_RTX;
4596 x = SET_SRC (x);
4597 if (GET_CODE (x) == LABEL_REF)
4598 return x;
4599 if (GET_CODE (x) != IF_THEN_ELSE)
4600 return NULL_RTX;
4601 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4602 return XEXP (x, 1);
4603 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4604 return XEXP (x, 2);
4605 return NULL_RTX;
4608 /* Insert a dummy PTX insn when encountering a branch to a label with no PTX
4609 insn in between the branch and the label.  This works around a JIT bug
4610 observed at driver version 384.111, at -O0 for sm_50. */
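/* In outline (illustrative PTX), this rewrites
     @%p bra $L;
    $L:
   into
     @%p bra $L;
     <fake nop>
    $L:
   placing the dummy insn (gen_fake_nop) just before the label.  */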
4612 static void
4613 prevent_branch_around_nothing (void)
4615 rtx_insn *seen_label = NULL;
4616 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4618 if (INSN_P (insn) && condjump_p (insn))
4620 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4621 continue;
4624 if (seen_label == NULL)
4625 continue;
4627 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4628 continue;
4630 if (INSN_P (insn))
4631 switch (recog_memoized (insn))
4633 case CODE_FOR_nvptx_fork:
4634 case CODE_FOR_nvptx_forked:
4635 case CODE_FOR_nvptx_joining:
4636 case CODE_FOR_nvptx_join:
4637 continue;
4638 default:
4639 seen_label = NULL;
4640 continue;
4643 if (LABEL_P (insn) && insn == seen_label)
4644 emit_insn_before (gen_fake_nop (), insn);
4646 seen_label = NULL;
4649 #endif
4651 #if WORKAROUND_PTXJIT_BUG_3
4652 /* Insert two membar.cta insns in between two consecutive bar.sync insns.  This
4653 works around a hang observed at driver version 390.48 for sm_50. */
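/* I.e. (illustrative PTX), rewrite
     bar.sync ...;
     bar.sync ...;
   into
     bar.sync ...;
     membar.cta;
     membar.cta;
     bar.sync ...;  */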
4655 static void
4656 workaround_barsyncs (void)
4658 bool seen_barsync = false;
4659 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4661 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4663 if (seen_barsync)
4665 emit_insn_before (gen_nvptx_membar_cta (), insn);
4666 emit_insn_before (gen_nvptx_membar_cta (), insn);
4669 seen_barsync = true;
4670 continue;
4673 if (!seen_barsync)
4674 continue;
4676 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4677 continue;
4678 else if (INSN_P (insn))
4679 switch (recog_memoized (insn))
4681 case CODE_FOR_nvptx_fork:
4682 case CODE_FOR_nvptx_forked:
4683 case CODE_FOR_nvptx_joining:
4684 case CODE_FOR_nvptx_join:
4685 continue;
4686 default:
4687 break;
4690 seen_barsync = false;
4693 #endif
4695 /* PTX-specific reorganization
4696 - Split blocks at fork and join instructions
4697 - Compute live registers
4698 - Mark now-unused registers, so function begin doesn't declare
4699 unused registers.
4700 - Insert state propagation when entering partitioned mode
4701 - Insert neutering instructions when in single mode
4702 - Replace subregs with suitable sequences. */
4705 static void
4706 nvptx_reorg (void)
4708 /* We are freeing block_for_insn in the toplev to keep compatibility
4709 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4710 compute_bb_for_insn ();
4712 thread_prologue_and_epilogue_insns ();
4714 /* Split blocks and record interesting unspecs. */
4715 bb_insn_map_t bb_insn_map;
4717 nvptx_split_blocks (&bb_insn_map);
4719 /* Compute live regs */
4720 df_clear_flags (DF_LR_RUN_DCE);
4721 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4722 df_live_add_problem ();
4723 df_live_set_all_dirty ();
4724 df_analyze ();
4725 regstat_init_n_sets_and_refs ();
4727 if (dump_file)
4728 df_dump (dump_file);
4730 /* Mark unused regs so they are not declared at function begin. */
4731 int max_regs = max_reg_num ();
4732 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4733 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4734 regno_reg_rtx[i] = const0_rtx;
4736 /* Determine launch dimensions of the function. If it is not an
4737 offloaded function (i.e. this is a regular compiler), the
4738 function has no neutering. */
4739 tree attr = oacc_get_fn_attrib (current_function_decl);
4740 if (attr)
4742 /* If we determined this mask before RTL expansion, we could
4743 elide emission of some levels of forks and joins. */
4744 unsigned mask = 0;
4745 tree dims = TREE_VALUE (attr);
4746 unsigned ix;
4748 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4750 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4751 tree allowed = TREE_PURPOSE (dims);
4753 if (size != 1 && !(allowed && integer_zerop (allowed)))
4754 mask |= GOMP_DIM_MASK (ix);
4756 /* If there is worker neutering, there must be vector
4757 neutering. Otherwise the hardware will fail. */
4758 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4759 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4761 /* Discover & process partitioned regions. */
4762 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4763 nvptx_process_pars (pars);
4764 nvptx_neuter_pars (pars, mask, 0);
4765 delete pars;
4768 /* Replace subregs. */
4769 nvptx_reorg_subreg ();
4771 if (TARGET_UNIFORM_SIMT)
4772 nvptx_reorg_uniform_simt ();
4774 #if WORKAROUND_PTXJIT_BUG_2
4775 prevent_branch_around_nothing ();
4776 #endif
4778 #if WORKAROUND_PTXJIT_BUG_3
4779 workaround_barsyncs ();
4780 #endif
4782 regstat_free_n_sets_and_refs ();
4784 df_finish_pass (true);
4787 /* Handle a "kernel" attribute; arguments as in
4788 struct attribute_spec.handler. */
4790 static tree
4791 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4792 int ARG_UNUSED (flags), bool *no_add_attrs)
4794 tree decl = *node;
4796 if (TREE_CODE (decl) != FUNCTION_DECL)
4798 error ("%qE attribute only applies to functions", name);
4799 *no_add_attrs = true;
4801 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4803 error ("%qE attribute requires a void return type", name);
4804 *no_add_attrs = true;
4807 return NULL_TREE;
4810 /* Handle a "shared" attribute; arguments as in
4811 struct attribute_spec.handler. */
4813 static tree
4814 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4815 int ARG_UNUSED (flags), bool *no_add_attrs)
4817 tree decl = *node;
4819 if (TREE_CODE (decl) != VAR_DECL)
4821 error ("%qE attribute only applies to variables", name);
4822 *no_add_attrs = true;
4824 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4826 error ("%qE attribute not allowed with auto storage class", name);
4827 *no_add_attrs = true;
4830 return NULL_TREE;
4833 /* Table of valid machine attributes. */
4834 static const struct attribute_spec nvptx_attribute_table[] =
4836 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4837 affects_type_identity, handler, exclude } */
4838 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
4839 NULL },
4840 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
4841 NULL },
4842 { NULL, 0, 0, false, false, false, false, NULL, NULL }
4845 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4847 static HOST_WIDE_INT
4848 nvptx_vector_alignment (const_tree type)
4850 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4852 return MIN (align, BIGGEST_ALIGNMENT);
4855 /* Indicate that INSN cannot be duplicated. */
4857 static bool
4858 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4860 switch (recog_memoized (insn))
4862 case CODE_FOR_nvptx_shufflesi:
4863 case CODE_FOR_nvptx_shufflesf:
4864 case CODE_FOR_nvptx_barsync:
4865 case CODE_FOR_nvptx_fork:
4866 case CODE_FOR_nvptx_forked:
4867 case CODE_FOR_nvptx_joining:
4868 case CODE_FOR_nvptx_join:
4869 return true;
4870 default:
4871 return false;
4875 /* Section anchors do not work.  Initialization for flag_section_anchors
4876 probes the existence of the anchoring target hooks and prevents
4877 anchoring if they don't exist.  However, we may be used with
4878 a host-side compiler that does support anchoring, and hence see
4879 the anchor flag set (as it's not recalculated). So provide an
4880 implementation denying anchoring. */
4882 static bool
4883 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4885 return false;
4888 /* Record a symbol for mkoffload to enter into the mapping table. */
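/* E.g. a variable produces a line like
     //:VAR_MAP "varname"
   and an OpenACC function with launch dims (1, 32, 32) something like
   (mangled name hypothetical):
     //:FUNC_MAP "foo$_omp_fn$0", 0x1, 0x20, 0x20  */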
4890 static void
4891 nvptx_record_offload_symbol (tree decl)
4893 switch (TREE_CODE (decl))
4895 case VAR_DECL:
4896 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4897 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4898 break;
4900 case FUNCTION_DECL:
4902 tree attr = oacc_get_fn_attrib (decl);
4903 /* OpenMP offloading does not set this attribute. */
4904 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4906 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4907 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4909 for (; dims; dims = TREE_CHAIN (dims))
4911 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4913 gcc_assert (!TREE_PURPOSE (dims));
4914 fprintf (asm_out_file, ", %#x", size);
4917 fprintf (asm_out_file, "\n");
4919 break;
4921 default:
4922 gcc_unreachable ();
4926 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4927 at the start of a file. */
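/* With the default sm_30 target and 64-bit Pmode the preamble reads:
     // BEGIN PREAMBLE
	.version	3.1
	.target	sm_30
	.address_size 64
     // END PREAMBLE  */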
4929 static void
4930 nvptx_file_start (void)
4932 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4933 fputs ("\t.version\t3.1\n", asm_out_file);
4934 if (TARGET_SM35)
4935 fputs ("\t.target\tsm_35\n", asm_out_file);
4936 else
4937 fputs ("\t.target\tsm_30\n", asm_out_file);
4938 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4939 fputs ("// END PREAMBLE\n", asm_out_file);
4942 /* Emit a declaration for a worker-level buffer in .shared memory. */
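/* E.g. (symbol name, alignment and size illustrative):
     .shared .align 8 .u8 __worker_bcast[256];  */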
4944 static void
4945 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4947 const char *name = XSTR (sym, 0);
4949 write_var_marker (file, true, false, name);
4950 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4951 align, name, size);
4954 /* Write out the function declarations we've collected and declare storage
4955 for the broadcast buffer. */
4957 static void
4958 nvptx_file_end (void)
4960 hash_table<tree_hasher>::iterator iter;
4961 tree decl;
4962 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4963 nvptx_record_fndecl (decl);
4964 fputs (func_decls.str().c_str(), asm_out_file);
4966 if (worker_bcast_size)
4967 write_worker_buffer (asm_out_file, worker_bcast_sym,
4968 worker_bcast_align, worker_bcast_size);
4970 if (worker_red_size)
4971 write_worker_buffer (asm_out_file, worker_red_sym,
4972 worker_red_align, worker_red_size);
4974 if (need_softstack_decl)
4976 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4977 /* 32 is the maximum number of warps in a block. Even though it's an
4978 external declaration, emit the array size explicitly; otherwise, it
4979 may fail at PTX JIT time if the definition is later in link order. */
4980 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4981 POINTER_SIZE);
4983 if (need_unisimt_decl)
4985 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4986 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4990 /* Expander for the shuffle builtins. */
4992 static rtx
4993 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4995 if (ignore)
4996 return target;
4998 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4999 NULL_RTX, mode, EXPAND_NORMAL);
5000 if (!REG_P (src))
5001 src = copy_to_mode_reg (mode, src);
5003 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5004 NULL_RTX, SImode, EXPAND_NORMAL);
5005 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5006 NULL_RTX, SImode, EXPAND_NORMAL);
5008 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5009 idx = copy_to_mode_reg (SImode, idx);
5011 rtx pat = nvptx_gen_shuffle (target, src, idx,
5012 (nvptx_shuffle_kind) INTVAL (op));
5013 if (pat)
5014 emit_insn (pat);
5016 return target;
5019 /* Worker reduction address expander. */
5021 static rtx
5022 nvptx_expand_worker_addr (tree exp, rtx target,
5023 machine_mode ARG_UNUSED (mode), int ignore)
5025 if (ignore)
5026 return target;
5028 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5029 if (align > worker_red_align)
5030 worker_red_align = align;
5032 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5033 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5034 if (size + offset > worker_red_size)
5035 worker_red_size = size + offset;
5037 rtx addr = worker_red_sym;
5038 if (offset)
5040 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5041 addr = gen_rtx_CONST (Pmode, addr);
5044 emit_move_insn (target, addr);
5046 return target;
5049 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5050 not require taking the address of any object, other than the memory
5051 cell being operated on. */
5053 static rtx
5054 nvptx_expand_cmp_swap (tree exp, rtx target,
5055 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5057 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5059 if (!target)
5060 target = gen_reg_rtx (mode);
5062 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5063 NULL_RTX, Pmode, EXPAND_NORMAL);
5064 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5065 NULL_RTX, mode, EXPAND_NORMAL);
5066 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5067 NULL_RTX, mode, EXPAND_NORMAL);
5068 rtx pat;
5070 mem = gen_rtx_MEM (mode, mem);
5071 if (!REG_P (cmp))
5072 cmp = copy_to_mode_reg (mode, cmp);
5073 if (!REG_P (src))
5074 src = copy_to_mode_reg (mode, src);
5076 if (mode == SImode)
5077 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5078 else
5079 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5081 emit_insn (pat);
5083 return target;
5087 /* Codes for all the NVPTX builtins. */
5088 enum nvptx_builtins
5090 NVPTX_BUILTIN_SHUFFLE,
5091 NVPTX_BUILTIN_SHUFFLELL,
5092 NVPTX_BUILTIN_WORKER_ADDR,
5093 NVPTX_BUILTIN_CMP_SWAP,
5094 NVPTX_BUILTIN_CMP_SWAPLL,
5095 NVPTX_BUILTIN_MAX
5098 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5100 /* Return the NVPTX builtin for CODE. */
5102 static tree
5103 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5105 if (code >= NVPTX_BUILTIN_MAX)
5106 return error_mark_node;
5108 return nvptx_builtin_decls[code];
5111 /* Set up all builtin functions for this target. */
5113 static void
5114 nvptx_init_builtins (void)
5116 #define DEF(ID, NAME, T) \
5117 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5118 = add_builtin_function ("__builtin_nvptx_" NAME, \
5119 build_function_type_list T, \
5120 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5121 #define ST sizetype
5122 #define UINT unsigned_type_node
5123 #define LLUINT long_long_unsigned_type_node
5124 #define PTRVOID ptr_type_node
5126 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5127 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5128 DEF (WORKER_ADDR, "worker_addr",
5129 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5130 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5131 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5133 #undef DEF
5134 #undef ST
5135 #undef UINT
5136 #undef LLUINT
5137 #undef PTRVOID
5140 /* Expand an expression EXP that calls a built-in function,
5141 with result going to TARGET if that's convenient
5142 (and in mode MODE if that's convenient).
5143 SUBTARGET may be used as the target for computing one of EXP's operands.
5144 IGNORE is nonzero if the value is to be ignored. */
5146 static rtx
5147 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5148 machine_mode mode, int ignore)
5150 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5151 switch (DECL_FUNCTION_CODE (fndecl))
5153 case NVPTX_BUILTIN_SHUFFLE:
5154 case NVPTX_BUILTIN_SHUFFLELL:
5155 return nvptx_expand_shuffle (exp, target, mode, ignore);
5157 case NVPTX_BUILTIN_WORKER_ADDR:
5158 return nvptx_expand_worker_addr (exp, target, mode, ignore);
5160 case NVPTX_BUILTIN_CMP_SWAP:
5161 case NVPTX_BUILTIN_CMP_SWAPLL:
5162 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5164 default: gcc_unreachable ();
5168 /* Define dimension sizes for known hardware. */
5169 #define PTX_VECTOR_LENGTH 32
5170 #define PTX_WORKER_LENGTH 32
5171 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
5173 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5175 static int
5176 nvptx_simt_vf ()
5178 return PTX_VECTOR_LENGTH;
5181 /* Validate compute dimensions of an OpenACC offload or routine, fill
5182 in non-unity defaults. FN_LEVEL indicates the level at which a
5183 routine might spawn a loop. It is negative for non-routines. If
5184 DECL is null, we are validating the default dimensions. */
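/* For example, an offloaded region requesting vector_length (64) is
   diagnosed with "using vector_length (32), ignoring 64" and its
   vector dimension is forced back to PTX_VECTOR_LENGTH.  */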
5186 static bool
5187 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
5189 bool changed = false;
5191 /* The vector size must be 32, unless this is a SEQ routine. */
5192 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
5193 && dims[GOMP_DIM_VECTOR] >= 0
5194 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
5196 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
5197 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5198 dims[GOMP_DIM_VECTOR]
5199 ? G_("using vector_length (%d), ignoring %d")
5200 : G_("using vector_length (%d), ignoring runtime setting"),
5201 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
5202 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5203 changed = true;
5206 /* Check that the number of workers is not too large. */
5207 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5209 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5210 "using num_workers (%d), ignoring %d",
5211 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
5212 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5213 changed = true;
5216 if (!decl)
5218 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5219 if (dims[GOMP_DIM_WORKER] < 0)
5220 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5221 if (dims[GOMP_DIM_GANG] < 0)
5222 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5223 changed = true;
5226 return changed;
5229 /* Return maximum dimension size, or zero for unbounded. */
5231 static int
5232 nvptx_dim_limit (int axis)
5234 switch (axis)
5236 case GOMP_DIM_VECTOR:
5237 return PTX_VECTOR_LENGTH;
5239 default:
5240 break;
5242 return 0;
5245 /* Determine whether fork & joins are needed. */
5247 static bool
5248 nvptx_goacc_fork_join (gcall *call, const int dims[],
5249 bool ARG_UNUSED (is_fork))
5251 tree arg = gimple_call_arg (call, 2);
5252 unsigned axis = TREE_INT_CST_LOW (arg);
5254 /* We only care about worker and vector partitioning. */
5255 if (axis < GOMP_DIM_WORKER)
5256 return false;
5258 /* If the size is 1, there's no partitioning. */
5259 if (dims[axis] == 1)
5260 return false;
5262 return true;
5265 /* Generate a PTX builtin function call that returns the address in
5266 the worker reduction buffer at OFFSET. TYPE is the type of the
5267 data at that location. */
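/* E.g. for a 4-byte int at byte offset 8 this builds, in effect,
     (int *) __builtin_nvptx_worker_addr (8, 4, 4)
   with arguments (offset, size, alignment).  */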
5269 static tree
5270 nvptx_get_worker_red_addr (tree type, tree offset)
5272 machine_mode mode = TYPE_MODE (type);
5273 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
5274 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5275 tree align = build_int_cst (unsigned_type_node,
5276 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5277 tree call = build_call_expr (fndecl, 3, offset, size, align);
5279 return fold_convert (build_pointer_type (type), call);
5282 /* Emit a SHFL.DOWN of VAR into DEST_VAR, shuffling by SHIFT lanes.  This
5283 function will cast the variable if necessary. */
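/* E.g. shuffling a float down by 16 lanes becomes, in effect,
     dest = (float) __builtin_nvptx_shuffle ((unsigned) var, 16,
					     SHUFFLE_DOWN);
   with the casts realized as VIEW_CONVERT_EXPRs; complex values are
   shuffled as two independent real/imaginary halves.  */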
5285 static void
5286 nvptx_generate_vector_shuffle (location_t loc,
5287 tree dest_var, tree var, unsigned shift,
5288 gimple_seq *seq)
5290 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5291 tree_code code = NOP_EXPR;
5292 tree arg_type = unsigned_type_node;
5293 tree var_type = TREE_TYPE (var);
5294 tree dest_type = var_type;
5296 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5297 var_type = TREE_TYPE (var_type);
5299 if (TREE_CODE (var_type) == REAL_TYPE)
5300 code = VIEW_CONVERT_EXPR;
5302 if (TYPE_SIZE (var_type)
5303 == TYPE_SIZE (long_long_unsigned_type_node))
5305 fn = NVPTX_BUILTIN_SHUFFLELL;
5306 arg_type = long_long_unsigned_type_node;
5309 tree call = nvptx_builtin_decl (fn, true);
5310 tree bits = build_int_cst (unsigned_type_node, shift);
5311 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5312 tree expr;
5314 if (var_type != dest_type)
5316 /* Do real and imaginary parts separately. */
5317 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5318 real = fold_build1 (code, arg_type, real);
5319 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5320 real = fold_build1 (code, var_type, real);
5322 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5323 imag = fold_build1 (code, arg_type, imag);
5324 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5325 imag = fold_build1 (code, var_type, imag);
5327 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5329 else
5331 expr = fold_build1 (code, arg_type, var);
5332 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5333 expr = fold_build1 (code, dest_type, expr);
5336 gimplify_assign (dest_var, expr, seq);
5339 /* Lazily generate the global lock var decl and return its address. */
5341 static tree
5342 nvptx_global_lock_addr ()
5344 tree v = global_lock_var;
5346 if (!v)
5348 tree name = get_identifier ("__reduction_lock");
5349 tree type = build_qualified_type (unsigned_type_node,
5350 TYPE_QUAL_VOLATILE);
5351 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5352 global_lock_var = v;
5353 DECL_ARTIFICIAL (v) = 1;
5354 DECL_EXTERNAL (v) = 1;
5355 TREE_STATIC (v) = 1;
5356 TREE_PUBLIC (v) = 1;
5357 TREE_USED (v) = 1;
5358 mark_addressable (v);
5359 mark_decl_referenced (v);
5362 return build_fold_addr_expr (v);
5365 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5366 GSI.  We use a lockless scheme for nearly all cases, which looks
5367 like:
5368 actual = initval(OP);
5369 do {
5370 guess = actual;
5371 write = guess OP myval;
5372 actual = cmp&swap (ptr, guess, write)
5373 } while (actual bit-different-to guess);
5374 return write;
5376 This relies on a cmp&swap instruction, which is available for 32-
5377 and 64-bit types. Larger types must use a locking scheme. */
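/* For example, with OP being PLUS_EXPR on a 32-bit float the loop
   behaves like this C sketch (the u32/f32 helper names are
   hypothetical stand-ins for the VIEW_CONVERT_EXPR bit-casts):

     unsigned actual = u32 (0.0f);	// initval (PLUS)
     unsigned guess, write;
     do
       {
	 guess = actual;
	 write = u32 (f32 (guess) + myval);
	 actual = __builtin_nvptx_cmp_swap (ptr, guess, write);
       }
     while (actual != guess);
     return f32 (write);  */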
5379 static tree
5380 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5381 tree ptr, tree var, tree_code op)
5383 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5384 tree_code code = NOP_EXPR;
5385 tree arg_type = unsigned_type_node;
5386 tree var_type = TREE_TYPE (var);
5388 if (TREE_CODE (var_type) == COMPLEX_TYPE
5389 || TREE_CODE (var_type) == REAL_TYPE)
5390 code = VIEW_CONVERT_EXPR;
5392 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5394 arg_type = long_long_unsigned_type_node;
5395 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5398 tree swap_fn = nvptx_builtin_decl (fn, true);
5400 gimple_seq init_seq = NULL;
5401 tree init_var = make_ssa_name (arg_type);
5402 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5403 init_expr = fold_build1 (code, arg_type, init_expr);
5404 gimplify_assign (init_var, init_expr, &init_seq);
5405 gimple *init_end = gimple_seq_last (init_seq);
5407 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5409 /* Split the block just after the init stmts. */
5410 basic_block pre_bb = gsi_bb (*gsi);
5411 edge pre_edge = split_block (pre_bb, init_end);
5412 basic_block loop_bb = pre_edge->dest;
5413 pre_bb = pre_edge->src;
5414 /* Reset the iterator. */
5415 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5417 tree expect_var = make_ssa_name (arg_type);
5418 tree actual_var = make_ssa_name (arg_type);
5419 tree write_var = make_ssa_name (arg_type);
5421 /* Build and insert the reduction calculation. */
5422 gimple_seq red_seq = NULL;
5423 tree write_expr = fold_build1 (code, var_type, expect_var);
5424 write_expr = fold_build2 (op, var_type, write_expr, var);
5425 write_expr = fold_build1 (code, arg_type, write_expr);
5426 gimplify_assign (write_var, write_expr, &red_seq);
5428 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5430 /* Build & insert the cmp&swap sequence. */
5431 gimple_seq latch_seq = NULL;
5432 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5433 ptr, expect_var, write_var);
5434 gimplify_assign (actual_var, swap_expr, &latch_seq);
5436 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5437 NULL_TREE, NULL_TREE);
5438 gimple_seq_add_stmt (&latch_seq, cond);
5440 gimple *latch_end = gimple_seq_last (latch_seq);
5441 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5443 /* Split the block just after the latch stmts. */
5444 edge post_edge = split_block (loop_bb, latch_end);
5445 basic_block post_bb = post_edge->dest;
5446 loop_bb = post_edge->src;
5447 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5449 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5450 post_edge->probability = profile_probability::even ();
5451 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5452 loop_edge->probability = profile_probability::even ();
5453 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5454 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5456 gphi *phi = create_phi_node (expect_var, loop_bb);
5457 add_phi_arg (phi, init_var, pre_edge, loc);
5458 add_phi_arg (phi, actual_var, loop_edge, loc);
5460 loop *loop = alloc_loop ();
5461 loop->header = loop_bb;
5462 loop->latch = loop_bb;
5463 add_loop (loop, loop_bb->loop_father);
5465 return fold_build1 (code, var_type, write_var);
5468 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5469 GSI. This is necessary for types larger than 64 bits, where there
5470 is no cmp&swap instruction to implement a lockless scheme. We use
5471 a lock variable in global memory.
5473 while (cmp&swap (&lock_var, 0, 1))
5474 continue;
5475 T accum = *ptr;
5476 accum = accum OP var;
5477 *ptr = accum;
5478 cmp&swap (&lock_var, 1, 0);
5479 return accum;
5481 A lock in global memory is necessary to force execution engine
5482 descheduling and avoid resource starvation that can occur if the
5483 lock is in .shared memory. */
5485 static tree
5486 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5487 tree ptr, tree var, tree_code op)
5489 tree var_type = TREE_TYPE (var);
5490 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5491 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5492 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5494 /* Split the block just before the gsi. Insert a gimple nop to make
5495 this easier. */
5496 gimple *nop = gimple_build_nop ();
5497 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5498 basic_block entry_bb = gsi_bb (*gsi);
5499 edge entry_edge = split_block (entry_bb, nop);
5500 basic_block lock_bb = entry_edge->dest;
5501 /* Reset the iterator. */
5502 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5504 /* Build and insert the locking sequence. */
5505 gimple_seq lock_seq = NULL;
5506 tree lock_var = make_ssa_name (unsigned_type_node);
5507 tree lock_expr = nvptx_global_lock_addr ();
5508 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5509 uns_unlocked, uns_locked);
5510 gimplify_assign (lock_var, lock_expr, &lock_seq);
5511 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5512 NULL_TREE, NULL_TREE);
5513 gimple_seq_add_stmt (&lock_seq, cond);
5514 gimple *lock_end = gimple_seq_last (lock_seq);
5515 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5517 /* Split the block just after the lock sequence. */
5518 edge locked_edge = split_block (lock_bb, lock_end);
5519 basic_block update_bb = locked_edge->dest;
5520 lock_bb = locked_edge->src;
5521 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5523 /* Create the lock loop ... */
5524 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5525 locked_edge->probability = profile_probability::even ();
5526 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5527 loop_edge->probability = profile_probability::even ();
5528 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5529 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5531 /* ... and the loop structure. */
5532 loop *lock_loop = alloc_loop ();
5533 lock_loop->header = lock_bb;
5534 lock_loop->latch = lock_bb;
5535 lock_loop->nb_iterations_estimate = 1;
5536 lock_loop->any_estimate = true;
5537 add_loop (lock_loop, entry_bb->loop_father);
5539 /* Build and insert the reduction calculation. */
5540 gimple_seq red_seq = NULL;
5541 tree acc_in = make_ssa_name (var_type);
5542 tree ref_in = build_simple_mem_ref (ptr);
5543 TREE_THIS_VOLATILE (ref_in) = 1;
5544 gimplify_assign (acc_in, ref_in, &red_seq);
5546 tree acc_out = make_ssa_name (var_type);
5547 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5548 gimplify_assign (acc_out, update_expr, &red_seq);
5550 tree ref_out = build_simple_mem_ref (ptr);
5551 TREE_THIS_VOLATILE (ref_out) = 1;
5552 gimplify_assign (ref_out, acc_out, &red_seq);
5554 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5556 /* Build & insert the unlock sequence. */
5557 gimple_seq unlock_seq = NULL;
5558 tree unlock_expr = nvptx_global_lock_addr ();
5559 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5560 uns_locked, uns_unlocked);
5561 gimplify_and_add (unlock_expr, &unlock_seq);
5562 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5564 return acc_out;
5567 /* Emit a sequence to update a reduction accumulator at *PTR with the
5568 value held in VAR using operator OP.  Return the updated value.
5570 TODO: optimize for atomic ops and independent complex ops. */
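/* For instance, 32- and 64-bit scalars (int, float, double) take the
   lockless cmp&swap path; a 16-byte complex double has no cmp&swap
   of that width and must take the global-lock path.  */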
5572 static tree
5573 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5574 tree ptr, tree var, tree_code op)
5576 tree type = TREE_TYPE (var);
5577 tree size = TYPE_SIZE (type);
5579 if (size == TYPE_SIZE (unsigned_type_node)
5580 || size == TYPE_SIZE (long_long_unsigned_type_node))
5581 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5582 else
5583 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
5586 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5588 static void
5589 nvptx_goacc_reduction_setup (gcall *call)
5591 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5592 tree lhs = gimple_call_lhs (call);
5593 tree var = gimple_call_arg (call, 2);
5594 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5595 gimple_seq seq = NULL;
5597 push_gimplify_context (true);
5599 if (level != GOMP_DIM_GANG)
5601 /* Copy the receiver object. */
5602 tree ref_to_res = gimple_call_arg (call, 1);
5604 if (!integer_zerop (ref_to_res))
5605 var = build_simple_mem_ref (ref_to_res);
5608 if (level == GOMP_DIM_WORKER)
5610 /* Store incoming value to worker reduction buffer. */
5611 tree offset = gimple_call_arg (call, 5);
5612 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5613 tree ptr = make_ssa_name (TREE_TYPE (call));
5615 gimplify_assign (ptr, call, &seq);
5616 tree ref = build_simple_mem_ref (ptr);
5617 TREE_THIS_VOLATILE (ref) = 1;
5618 gimplify_assign (ref, var, &seq);
5621 if (lhs)
5622 gimplify_assign (lhs, var, &seq);
5624 pop_gimplify_context (NULL);
5625 gsi_replace_with_seq (&gsi, seq, true);
5628 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5630 static void
5631 nvptx_goacc_reduction_init (gcall *call)
5633 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5634 tree lhs = gimple_call_lhs (call);
5635 tree var = gimple_call_arg (call, 2);
5636 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5637 enum tree_code rcode
5638 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5639 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5640 TREE_TYPE (var));
5641 gimple_seq seq = NULL;
5643 push_gimplify_context (true);
5645 if (level == GOMP_DIM_VECTOR)
5647 /* Initialize the lanes with non-zero vector id to INIT_VAL (OP). */
5648 tree tid = make_ssa_name (integer_type_node);
5649 tree dim_vector = gimple_call_arg (call, 3);
5650 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5651 dim_vector);
5652 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5653 NULL_TREE, NULL_TREE);
5655 gimple_call_set_lhs (tid_call, tid);
5656 gimple_seq_add_stmt (&seq, tid_call);
5657 gimple_seq_add_stmt (&seq, cond_stmt);
5659 /* Split the block just after the call. */
5660 edge init_edge = split_block (gsi_bb (gsi), call);
5661 basic_block init_bb = init_edge->dest;
5662 basic_block call_bb = init_edge->src;
5664 /* Fixup flags from call_bb to init_bb. */
5665 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5666 init_edge->probability = profile_probability::even ();
5668 /* Set the initialization stmts. */
5669 gimple_seq init_seq = NULL;
5670 tree init_var = make_ssa_name (TREE_TYPE (var));
5671 gimplify_assign (init_var, init, &init_seq);
5672 gsi = gsi_start_bb (init_bb);
5673 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5675 /* Split block just after the init stmt. */
5676 gsi_prev (&gsi);
5677 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5678 basic_block dst_bb = inited_edge->dest;
5680 /* Create false edge from call_bb to dst_bb. */
5681 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5682 nop_edge->probability = profile_probability::even ();
5684 /* Create phi node in dst block. */
5685 gphi *phi = create_phi_node (lhs, dst_bb);
5686 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5687 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5689 /* Reset dominator of dst bb. */
5690 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5692 /* Reset the gsi. */
5693 gsi = gsi_for_stmt (call);
5695 else
5697 if (level == GOMP_DIM_GANG)
5699 /* If there's no receiver object, propagate the incoming VAR. */
5700 tree ref_to_res = gimple_call_arg (call, 1);
5701 if (integer_zerop (ref_to_res))
5702 init = var;
5705 gimplify_assign (lhs, init, &seq);
5708 pop_gimplify_context (NULL);
5709 gsi_replace_with_seq (&gsi, seq, true);
5712 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5714 static void
5715 nvptx_goacc_reduction_fini (gcall *call)
5717 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5718 tree lhs = gimple_call_lhs (call);
5719 tree ref_to_res = gimple_call_arg (call, 1);
5720 tree var = gimple_call_arg (call, 2);
5721 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5722 enum tree_code op
5723 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5724 gimple_seq seq = NULL;
5725 tree r = NULL_TREE;
5727 push_gimplify_context (true);
5729 if (level == GOMP_DIM_VECTOR)
5731 /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
5732 but that requires a method of emitting a unified jump at the
5733 gimple level. */
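/* A sketch for a 32-lane sum: five rounds at shuffle distances 16, 8,
   4, 2 and 1,
     var += shfl.down (var, d);
   after which lane 0 holds the warp-wide total.  */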
5734 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5736 tree other_var = make_ssa_name (TREE_TYPE (var));
5737 nvptx_generate_vector_shuffle (gimple_location (call),
5738 other_var, var, shfl, &seq);
5740 r = make_ssa_name (TREE_TYPE (var));
5741 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5742 var, other_var), &seq);
5743 var = r;
5746 else
5748 tree accum = NULL_TREE;
5750 if (level == GOMP_DIM_WORKER)
5752 /* Get reduction buffer address. */
5753 tree offset = gimple_call_arg (call, 5);
5754 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5755 tree ptr = make_ssa_name (TREE_TYPE (call));
5757 gimplify_assign (ptr, call, &seq);
5758 accum = ptr;
5760 else if (integer_zerop (ref_to_res))
5761 r = var;
5762 else
5763 accum = ref_to_res;
5765 if (accum)
5767 /* UPDATE the accumulator. */
5768 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5769 seq = NULL;
5770 r = nvptx_reduction_update (gimple_location (call), &gsi,
5771 accum, var, op);
5775 if (lhs)
5776 gimplify_assign (lhs, r, &seq);
5777 pop_gimplify_context (NULL);
5779 gsi_replace_with_seq (&gsi, seq, true);
5782 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5784 static void
5785 nvptx_goacc_reduction_teardown (gcall *call)
5787 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5788 tree lhs = gimple_call_lhs (call);
5789 tree var = gimple_call_arg (call, 2);
5790 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5791 gimple_seq seq = NULL;
5793 push_gimplify_context (true);
5794 if (level == GOMP_DIM_WORKER)
5796 /* Read the worker reduction buffer. */
5797 tree offset = gimple_call_arg (call, 5);
5798 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5799 tree ptr = make_ssa_name (TREE_TYPE (call));
5801 gimplify_assign (ptr, call, &seq);
5802 var = build_simple_mem_ref (ptr);
5803 TREE_THIS_VOLATILE (var) = 1;
5806 if (level != GOMP_DIM_GANG)
5808 /* Write to the receiver object. */
5809 tree ref_to_res = gimple_call_arg (call, 1);
5811 if (!integer_zerop (ref_to_res))
5812 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5815 if (lhs)
5816 gimplify_assign (lhs, var, &seq);
5818 pop_gimplify_context (NULL);
5820 gsi_replace_with_seq (&gsi, seq, true);
5823 /* NVPTX reduction expander. */
5825 static void
5826 nvptx_goacc_reduction (gcall *call)
5828 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5830 switch (code)
5832 case IFN_GOACC_REDUCTION_SETUP:
5833 nvptx_goacc_reduction_setup (call);
5834 break;
5836 case IFN_GOACC_REDUCTION_INIT:
5837 nvptx_goacc_reduction_init (call);
5838 break;
5840 case IFN_GOACC_REDUCTION_FINI:
5841 nvptx_goacc_reduction_fini (call);
5842 break;
5844 case IFN_GOACC_REDUCTION_TEARDOWN:
5845 nvptx_goacc_reduction_teardown (call);
5846 break;
5848 default:
5849 gcc_unreachable ();
5853 static bool
5854 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
5855 rtx x ATTRIBUTE_UNUSED)
5857 return true;
5860 static bool
5861 nvptx_vector_mode_supported (machine_mode mode)
5863 return (mode == V2SImode
5864 || mode == V2DImode);
5867 /* Return the preferred mode for vectorizing scalar MODE. */
5869 static machine_mode
5870 nvptx_preferred_simd_mode (scalar_mode mode)
5872 switch (mode)
5874 case E_DImode:
5875 return V2DImode;
5876 case E_SImode:
5877 return V2SImode;
5879 default:
5880 return default_preferred_simd_mode (mode);
5884 unsigned int
5885 nvptx_data_alignment (const_tree type, unsigned int basic_align)
5887 if (TREE_CODE (type) == INTEGER_TYPE)
5889 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5890 if (size == GET_MODE_SIZE (TImode))
5891 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
5894 return basic_align;
5897 /* Implement TARGET_MODES_TIEABLE_P. */
5899 static bool
5900 nvptx_modes_tieable_p (machine_mode, machine_mode)
5902 return false;
5905 /* Implement TARGET_HARD_REGNO_NREGS. */
5907 static unsigned int
5908 nvptx_hard_regno_nregs (unsigned int, machine_mode)
5910 return 1;
5913 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
5915 static bool
5916 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
5918 return false;
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
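
/* The TARGET_* overrides above are collected into the target vector:
   TARGET_INITIALIZER, from target-def.h, expands to an initializer
   that uses each overridden macro and the default for every hook this
   file leaves untouched.  */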
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"