/* Target code for NVPTX.
   Copyright (C) 2014-2017 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
21 #include "config.h"
22 #include <sstream>
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "diagnostic.h"
39 #include "alias.h"
40 #include "insn-flags.h"
41 #include "output.h"
42 #include "insn-attr.h"
43 #include "flags.h"
44 #include "dojump.h"
45 #include "explow.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "stmt.h"
49 #include "expr.h"
50 #include "tm-preds.h"
51 #include "tm-constrs.h"
52 #include "langhooks.h"
53 #include "dbxout.h"
54 #include "cfgrtl.h"
55 #include "gimple.h"
56 #include "stor-layout.h"
57 #include "builtins.h"
58 #include "omp-general.h"
59 #include "omp-low.h"
60 #include "gomp-constants.h"
61 #include "dumpfile.h"
62 #include "internal-fn.h"
63 #include "gimple-iterator.h"
64 #include "stringpool.h"
65 #include "tree-vrp.h"
66 #include "tree-ssa-operands.h"
67 #include "tree-ssanames.h"
68 #include "gimplify.h"
69 #include "tree-phinodes.h"
70 #include "cfgloop.h"
71 #include "fold-const.h"
73 /* This file should be included last. */
74 #include "target-def.h"
/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area) ((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
                     & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
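/* The area occupies the three flag bits just above
   SYMBOL_FLAG_MACH_DEP_SHIFT.  Note that SET_SYMBOL_DATA_AREA ORs the
   area in rather than replacing the field, so it relies on those bits
   still being zero, i.e. on each symbol's area being set only once.  */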
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;
/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}
/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with -fopenacc", optname);
}
/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
  SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
        return ".u32";
      else
        return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      gcc_unreachable ();
    }
}
/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
        area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
        {
          if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
            {
              area = DATA_AREA_SHARED;
              if (DECL_INITIAL (decl))
                error ("static initialization of variable %q+D in %<.shared%>"
                       " memory is not supported", decl);
            }
          else
            area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
        }

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}
/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}
/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}
/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
            int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
        fprintf (file, "%s", reg_names[regno]);
      else
        fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
        fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
        fprintf (file, "}");
    }
}
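/* For example, a TImode pseudo 35 printed with SUBREG_OFFSET -2 comes
   out as the DImode pair "%r35$8,%r35$0" (halves named by byte offset,
   high part first), while the default offset of -1 additionally wraps
   the pair in braces; the register number is illustrative.  */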
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
         it creates a block with a single successor before entering a
         partitioned region.  That is a good candidate for the end of
         an SESE region.  */
      if (!is_call)
        emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
         predecessor for the block the join insn ends up in.  This is
         needed for skipping entire loops.  */
      if (!is_call)
        emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}
/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
        return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
        return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}
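/* Concretely: an unprototyped float argument is promoted to double, and
   anything narrower than 32 bits (char, short, bool) is widened to
   SImode; 32-bit and 64-bit types pass through unchanged.  */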
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
                    const_tree, bool named)
{
  if (mode == VOIDmode || !named)
    return NULL_RTX;

  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
                             const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (mode == VOIDmode || !named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
                         gen_rtvec (1, GEN_INT (cum->count)),
                         UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
                            machine_mode ARG_UNUSED (mode),
                            const_tree ARG_UNUSED (type),
                            bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}
/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}
/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
                      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}
/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
                         machine_mode mode, const_tree type,
                         bool ARG_UNUSED (named))
{
  return pass_in_memory (mode, type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
                             int *ARG_UNUSED (punsignedp),
                             const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}
/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
                machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
        s << reg_names[for_reg];
      else
        s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
        {
          s << "\tld.param" << ptx_type << " ";
          if (for_reg)
            s << reg_names[for_reg];
          else
            s << "%ar" << argno;
          s << ", [%in_ar" << argno << "];\n";
        }
    }
  return argno + 1;
}
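/* For reference, the two shapes this produces (names fixed by the code
   above): in a prototype, fragments such as " (.param.u32 %in_ar0";
   in a prologue, a declaration plus copy such as
	.reg.u32 %ar0;
	ld.param.u32 %ar0, [%in_ar0];
   which moves the incoming .param into an ordinary register.  */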
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
                tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
        {
          /* Complex types are sent as two separate args.  */
          type = TREE_TYPE (type);
          mode = TYPE_MODE (type);
          prototyped = true;
        }

      mode = promote_arg (mode, prototyped);
      if (split)
        argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}
/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}

/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
        return return_in_mem;

      /* Named return values can cause us to return a pointer as well
         as expect an argument for the return location.  This is
         optimization-level specific, so no caller can make use of
         this data, but more importantly for us, we must ensure it
         doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
        return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
         retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}
/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
          || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
              && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}
/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
                 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
           globalize ? " GLOBAL" : "",
           is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}
/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
                const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
        name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
        /* These atomics skip the 'weak' parm in an actual library
           call.  We must skip it in the prototype too.  */
        not_atomic_weak_arg = 3;
        break;

      default:
        break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
        argno = write_arg_type (s, -1, argno, type, prototyped);
      else
        gcc_assert (type == boolean_type_node);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
                          rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
         as that was already done when generating the call
         sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
        *slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}
/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
             align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
           POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
                  : "\tmov.u%d %s, 0;\n"),
           POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
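/* Roughly, for a 16-byte frame at alignment 8 this prints
	.local .align 8 .b8 %frame_ar[16];
	.reg.u64 %frame;
	cvta.local.u64 %frame, %frame_ar;
   (the actual register name comes from reg_names[REGNO]); a zero-sized
   frame instead just sets the pointer register to 0.  */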
/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
           bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
           bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
           bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
             bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
           bits, reg_stack, reg_frame, size);

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();
  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
             bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
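/* E.g. for the "x" (vector) axis with predicate register 99 this prints
	{
		.reg.u32	%x;
		mov.u32	%x, %tid.x;
		setp.ne.u32	%r99, %x, 0;
	}
   so %r99 becomes true in every lane except lane 0 (register number
   illustrative).  */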
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  int bits = POINTER_SIZE;
  int master = REGNO (cfun->machine->unisimt_master);
  int pred = REGNO (cfun->machine->unisimt_predicate);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
           bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits);
  fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits);
  fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n");
  /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'.  */
  fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
  /* Compute predicate as 'tid.x == master'.  */
  fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}
/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
                 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
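/* Net effect, sketched for the 64-bit ABI: for kernel NAME this prints
   ".visible .entry NAME (.param.u64 %arg, ...)" whose body stores a
   per-warp soft-stack pointer into __nvptx_stacks[tid.y], zeroes
   __nvptx_uni[tid.y], and calls gomp_nvptx_main (ORIG, arg).  The
   embedded NUL in ENTRY_TEMPLATE splits it into ENTRY_1/ENTRY_2 so
   that fprintf can splice the ORIG symbol name into the middle.  */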
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
                            true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
                    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
                    true);

  fprintf (file, "%s", s.str().c_str());

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
        init_frame (file, STACK_POINTER_REGNUM,
                    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
         DImode-compatible.  */
      if (need_frameptr)
        init_frame (file, FRAME_POINTER_REGNUM, alignment,
                    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca)
    init_softstack_frame (file, alignment, sz);

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
        {
          machine_mode mode = PSEUDO_REGNO_MODE (i);
          machine_mode split = maybe_split_mode (mode);

          if (split != VOIDmode)
            mode = split;
          fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
          output_reg (file, i, split, -2);
          fprintf (file, ";\n");
        }
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
                               REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
                               REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate)
    nvptx_init_unisimt_predicate (file);
}
/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
               POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}
/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
             nvptx_ptx_type_from_mode (mode, false),
             reg_names[NVPTX_RETURN_REGNUM],
             reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  if (TARGET_SOFT_STACK && stack_realign_drap)
    return arg_pointer_rtx;
  return NULL_RTX;
}
/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
        {
          cfun->machine->is_varadic = true;
          cfun->machine->has_varadic = true;
          cfun->machine->num_args++;
        }
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
                                                  cfun->machine->call_args);
    }
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}
/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  rtx callee = XEXP (address, 0);
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
        {
          if (DECL_STATIC_CHAIN (decl))
            cfun->machine->has_chain = true;

          tree attr = oacc_get_fn_attrib (decl);
          if (attr)
            {
              tree dims = TREE_VALUE (attr);

              parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
              for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
                {
                  if (TREE_PURPOSE (dims)
                      && !integer_zerop (TREE_PURPOSE (dims)))
                    break;
                  /* Not on this axis.  */
                  parallel ^= GOMP_DIM_MASK (ix);
                  dims = TREE_CHAIN (dims);
                }
            }
        }
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
        tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}
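/* The insn built above has the shape
     (parallel [(set retval (call (mem address) 0))
                (use arg-reg-0) (use arg-reg-1) ...])
   with one USE per recorded argument pseudo, plus one for the varargs
   pointer if the callee is variadic; nvptx_output_call_insn walks these
   USEs when printing the .param copies for the call.  */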
/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
                            XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}
/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}
/* Generate an instruction or sequence to shuffle register SRC into
   register DST across the lanes of a single warp, using shuffle kind
   KIND and lane operand IDX.  */

rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
        rtx tmp0 = gen_reg_rtx (SImode);
        rtx tmp1 = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
        emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
        emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
        emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
        res = get_insns ();
        end_sequence ();
      }
      break;
    case BImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
        emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
        emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
        res = get_insns ();
        end_sequence ();
      }
      break;
    case QImode:
    case HImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
        emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
        emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
                                                    tmp)));
        res = get_insns ();
        end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
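/* Note the fallback strategy above: only 32-bit values can be shuffled
   directly, so 64-bit values are split into two SImode halves that are
   shuffled separately and re-packed, predicates are materialized as
   0/1 in an SImode temporary and compared back, and QI/HImode values
   are zero-extended and truncated around the 32-bit shuffle.  */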
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}

/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        if (pm & PM_read)
          emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
        emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
        if (pm & PM_write)
          emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
        res = get_insns ();
        end_sequence ();
      }
      break;

    default:
      {
        rtx addr = data->ptr;

        if (!addr)
          {
            unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

            if (align > worker_bcast_align)
              worker_bcast_align = align;
            data->offset = (data->offset + align - 1) & ~(align - 1);
            addr = data->base;
            if (data->offset)
              addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
          }

        addr = gen_rtx_MEM (mode, addr);
        if (pm == PM_read)
          res = gen_rtx_SET (addr, reg);
        else if (pm == PM_write)
          res = gen_rtx_SET (reg, addr);
        else
          gcc_unreachable ();

        if (data->ptr)
          {
            /* We're using a ptr, increment it.  */
            start_sequence ();

            emit_insn (res);
            emit_insn (gen_adddi3 (data->ptr, data->ptr,
                                   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
            res = get_insns ();
            end_sequence ();
          }
        else
          rep = 1;
        data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
        return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
                                       out.  */
  unsigned size; /* Fragment size to accumulate.  */
  unsigned offset; /* Offset within current fragment.  */
  bool started; /* Whether we've output any initializer.  */
} init_frag;
/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  */

static void
output_init_frag (rtx sym)
{
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      fprintf (asm_out_file, val ? ") + " : ")");
    }

  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}

/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;

  for (unsigned part = 0; size; size -= part)
    {
      val >>= part * BITS_PER_UNIT;
      part = init_frag.size - init_frag.offset;
      if (part > size)
        part = size;

      unsigned HOST_WIDE_INT partial
        = val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
        output_init_frag (NULL);
    }
}
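/* Values accumulate little-endian within a fragment.  For example, with
   a fragment size of 4, emitting the bytes 'a' 'b' 'c' 'd' one at a
   time yields a single fragment value of 0x64636261, printed as one
   .u32 initializer element.  */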
/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
         CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
        sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}
/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      if (part > size)
        part = (unsigned) size;
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  */
  if (size < init_frag.remaining * init_frag.size)
    {
      while (size >= init_frag.size)
        {
          size -= init_frag.size;
          output_init_frag (NULL_RTX);
        }
      if (size)
        nvptx_assemble_value (0, size);
    }
}

/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  */

void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}
/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round to what PTX wants them!  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
                           const_tree type, HOST_WIDE_INT size, unsigned align)
{
  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
                    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u%d ",
           section, align / BITS_PER_UNIT,
           elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
}
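/* As an illustration, a file-scope "int arr[5]" in .global space comes
   out as ".global .align 4 .u32 arr[5]" (the linkage specifier having
   been printed by the caller), after which the initializer machinery
   above appends fragments like " = { 1, 2, 3, 4, 5 };".  */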
/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (init_frag.offset)
    /* This can happen with a packed struct with trailing array member.  */
    nvptx_assemble_value (0, init_frag.size - init_frag.offset);
  fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
}
/* Output an uninitialized common or file-scope variable.  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
                           const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");

  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
                             TREE_TYPE (decl), size, align);
  nvptx_assemble_decl_end ();
}
/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
                                 const_tree exp, HOST_WIDE_INT obj_size)
{
  write_var_marker (file, true, false, name);

  fprintf (file, "\t");

  tree type = TREE_TYPE (exp);
  nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
                             TYPE_ALIGN (type));
}
/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
                          : DECL_WEAK (decl) ? ".weak " : ".visible "));

  tree type = TREE_TYPE (decl);
  HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
                             type, obj_size, DECL_ALIGN (decl));
}
1980 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
1982 static void
1983 nvptx_globalize_label (FILE *, const char *)
1987 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
1988 declaration only for variable DECL with NAME to FILE. */
1990 static void
1991 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
1993 /* The middle end can place constant pool decls into the varpool as
1994 undefined. Until that is fixed, catch the problem here. */
1995 if (DECL_IN_CONSTANT_POOL (decl))
1996 return;
1998 /* We support weak definitions, and hence have the right
1999 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2000 if (DECL_WEAK (decl))
2001 error_at (DECL_SOURCE_LOCATION (decl),
2002 "PTX does not support weak declarations"
2003 " (only weak definitions)");
2004 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2006 fprintf (file, "\t.extern ");
2007 tree size = DECL_SIZE_UNIT (decl);
2008 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2009 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2010 DECL_ALIGN (decl));
2011 nvptx_assemble_decl_end ();
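/* E.g. a reference to an undefined "extern int v[2];" would come
   out roughly as (a sketch)
       .extern .global .align 4 .u32 v[2];  */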
2014 /* Output a pattern for a move instruction. */
2016 const char *
2017 nvptx_output_mov_insn (rtx dst, rtx src)
2019 machine_mode dst_mode = GET_MODE (dst);
2020 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2021 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2022 machine_mode src_inner = (GET_CODE (src) == SUBREG
2023 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2025 rtx sym = src;
2026 if (GET_CODE (sym) == CONST)
2027 sym = XEXP (XEXP (sym, 0), 0);
2028 if (SYMBOL_REF_P (sym))
2030 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2031 return "%.\tcvta%D1%t0\t%0, %1;";
2032 nvptx_maybe_record_fnsym (sym);
2035 if (src_inner == dst_inner)
2036 return "%.\tmov%t0\t%0, %1;";
2038 if (CONSTANT_P (src))
2039 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2040 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2041 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2043 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2044 return "%.\tmov.b%T0\t%0, %1;";
2046 return "%.\tcvt%t0%t1\t%0, %1;";
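/* Illustrative expansions of the templates above (a sketch; pseudos
   are shown as %rNN and "buf" is an assumed .shared symbol):
     same inner modes:      mov.u32  %r23, %r24;
     non-generic address:   cvta.shared.u64  %r23, buf;
     same-size bit cast:    mov.b64  %r23, %r24;
     differing sizes:       cvt.u64.u32  %r23, %r24;
   The %. sequence expands to the instruction predicate, or to
   nothing for an unconditional move.  */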
2049 static void nvptx_print_operand (FILE *, rtx, int);
2051 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2052 involves writing .param declarations and in/out copies into them. For
2053 indirect calls, also write the .callprototype. */
2055 const char *
2056 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2058 char buf[16];
2059 static int labelno;
2060 bool needs_tgt = register_operand (callee, Pmode);
2061 rtx pat = PATTERN (insn);
2062 if (GET_CODE (pat) == COND_EXEC)
2063 pat = COND_EXEC_CODE (pat);
2064 int arg_end = XVECLEN (pat, 0);
2065 tree decl = NULL_TREE;
2067 fprintf (asm_out_file, "\t{\n");
2068 if (result != NULL)
2069 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2070 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2071 reg_names[NVPTX_RETURN_REGNUM]);
2073 /* Ensure we have a ptx declaration in the output if necessary. */
2074 if (GET_CODE (callee) == SYMBOL_REF)
2076 decl = SYMBOL_REF_DECL (callee);
2077 if (!decl
2078 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2079 nvptx_record_libfunc (callee, result, pat);
2080 else if (DECL_EXTERNAL (decl))
2081 nvptx_record_fndecl (decl);
2084 if (needs_tgt)
2086 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2087 labelno++;
2088 ASM_OUTPUT_LABEL (asm_out_file, buf);
2089 std::stringstream s;
2090 write_fn_proto_from_insn (s, NULL, result, pat);
2091 fputs (s.str().c_str(), asm_out_file);
2094 for (int argno = 1; argno < arg_end; argno++)
2096 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2097 machine_mode mode = GET_MODE (t);
2098 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2100 /* Mode splitting has already been done. */
2101 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2102 "\t\tst.param%s [%%out_arg%d], ",
2103 ptx_type, argno, ptx_type, argno);
2104 output_reg (asm_out_file, REGNO (t), VOIDmode);
2105 fprintf (asm_out_file, ";\n");
2108 /* The '.' stands for the call's predicate, if any. */
2109 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2110 fprintf (asm_out_file, "\t\tcall ");
2111 if (result != NULL_RTX)
2112 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2114 if (decl)
2116 const char *name = get_fnname_from_decl (decl);
2117 name = nvptx_name_replacement (name);
2118 assemble_name (asm_out_file, name);
2120 else
2121 output_address (VOIDmode, callee);
2123 const char *open = "(";
2124 for (int argno = 1; argno < arg_end; argno++)
2126 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2127 open = "";
2129 if (decl && DECL_STATIC_CHAIN (decl))
2131 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2132 open = "";
2134 if (!open[0])
2135 fprintf (asm_out_file, ")");
2137 if (needs_tgt)
2139 fprintf (asm_out_file, ", ");
2140 assemble_name (asm_out_file, buf);
2142 fprintf (asm_out_file, ";\n");
2144 if (find_reg_note (insn, REG_NORETURN, NULL))
2145 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2146 the flow-control barrier they imply. It can segfault if it
2147 encounters what looks like an unexitable loop. Emit a trailing
2148 trap, which it does grok. */
2149 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2151 if (result)
2153 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2155 if (!rval[0])
2156 /* We must escape the '%' that starts RETURN_REGNUM. */
2157 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2158 reg_names[NVPTX_RETURN_REGNUM]);
2159 return rval;
2162 return "}";
2165 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2167 static bool
2168 nvptx_print_operand_punct_valid_p (unsigned char c)
2170 return c == '.' || c == '#';
2173 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2175 static void
2176 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2178 rtx off;
2179 if (GET_CODE (x) == CONST)
2180 x = XEXP (x, 0);
2181 switch (GET_CODE (x))
2183 case PLUS:
2184 off = XEXP (x, 1);
2185 output_address (VOIDmode, XEXP (x, 0));
2186 fprintf (file, "+");
2187 output_address (VOIDmode, off);
2188 break;
2190 case SYMBOL_REF:
2191 case LABEL_REF:
2192 output_addr_const (file, x);
2193 break;
2195 default:
2196 gcc_assert (GET_CODE (x) != MEM);
2197 nvptx_print_operand (file, x, 0);
2198 break;
2202 /* Write assembly language output for the address ADDR to FILE. */
2204 static void
2205 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2207 nvptx_print_address_operand (file, addr, mode);
2210 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2212 Meaning of CODE:
2213 . -- print the predicate for the instruction or an empty string for an
2214 unconditional one.
2215 # -- print a rounding mode for the instruction
2217 A -- print a data area for a MEM
2218 c -- print an opcode suffix for a comparison operator, including a type code
2219 D -- print a data area for a MEM operand
2220 S -- print a shuffle kind specified by CONST_INT
2221 t -- print a type opcode suffix, promoting QImode to 32 bits
2222 T -- print a type size in bits
2223 u -- print a type opcode suffix without promotions
j -- print this operand prefixed by '@' (branch-on-true predicate)
J -- print this operand prefixed by '@!' (branch-on-false predicate). */
2225 static void
2226 nvptx_print_operand (FILE *file, rtx x, int code)
2228 if (code == '.')
2230 x = current_insn_predicate;
2231 if (x)
2233 fputs ("@", file);
2234 if (GET_CODE (x) == EQ)
2235 fputs ("!", file);
2236 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2238 return;
2240 else if (code == '#')
2242 fputs (".rn", file);
2243 return;
2246 enum rtx_code x_code = GET_CODE (x);
2247 machine_mode mode = GET_MODE (x);
2249 switch (code)
2251 case 'A':
2252 x = XEXP (x, 0);
2253 /* FALLTHROUGH. */
2255 case 'D':
2256 if (GET_CODE (x) == CONST)
2257 x = XEXP (x, 0);
2258 if (GET_CODE (x) == PLUS)
2259 x = XEXP (x, 0);
2261 if (GET_CODE (x) == SYMBOL_REF)
2262 fputs (section_for_sym (x), file);
2263 break;
2265 case 't':
2266 case 'u':
2267 if (x_code == SUBREG)
2269 mode = GET_MODE (SUBREG_REG (x));
2270 if (mode == TImode)
2271 mode = DImode;
2272 else if (COMPLEX_MODE_P (mode))
2273 mode = GET_MODE_INNER (mode);
2275 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2276 break;
2278 case 'S':
2280 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2281 /* Same order as nvptx_shuffle_kind. */
2282 static const char *const kinds[] =
2283 {".up", ".down", ".bfly", ".idx"};
2284 fputs (kinds[kind], file);
2286 break;
2288 case 'T':
2289 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2290 break;
2292 case 'j':
2293 fprintf (file, "@");
2294 goto common;
2296 case 'J':
2297 fprintf (file, "@!");
2298 goto common;
2300 case 'c':
2301 mode = GET_MODE (XEXP (x, 0));
2302 switch (x_code)
2304 case EQ:
2305 fputs (".eq", file);
2306 break;
2307 case NE:
2308 if (FLOAT_MODE_P (mode))
2309 fputs (".neu", file);
2310 else
2311 fputs (".ne", file);
2312 break;
2313 case LE:
2314 case LEU:
2315 fputs (".le", file);
2316 break;
2317 case GE:
2318 case GEU:
2319 fputs (".ge", file);
2320 break;
2321 case LT:
2322 case LTU:
2323 fputs (".lt", file);
2324 break;
2325 case GT:
2326 case GTU:
2327 fputs (".gt", file);
2328 break;
2329 case LTGT:
2330 fputs (".ne", file);
2331 break;
2332 case UNEQ:
2333 fputs (".equ", file);
2334 break;
2335 case UNLE:
2336 fputs (".leu", file);
2337 break;
2338 case UNGE:
2339 fputs (".geu", file);
2340 break;
2341 case UNLT:
2342 fputs (".ltu", file);
2343 break;
2344 case UNGT:
2345 fputs (".gtu", file);
2346 break;
2347 case UNORDERED:
2348 fputs (".nan", file);
2349 break;
2350 case ORDERED:
2351 fputs (".num", file);
2352 break;
2353 default:
2354 gcc_unreachable ();
2356 if (FLOAT_MODE_P (mode)
2357 || x_code == EQ || x_code == NE
2358 || x_code == GEU || x_code == GTU
2359 || x_code == LEU || x_code == LTU)
2360 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2361 else
2362 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2363 break;
2364 default:
2365 common:
2366 switch (x_code)
2368 case SUBREG:
2370 rtx inner_x = SUBREG_REG (x);
2371 machine_mode inner_mode = GET_MODE (inner_x);
2372 machine_mode split = maybe_split_mode (inner_mode);
2374 if (split != VOIDmode
2375 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2376 output_reg (file, REGNO (inner_x), split);
2377 else
2378 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2380 break;
2382 case REG:
2383 output_reg (file, REGNO (x), maybe_split_mode (mode));
2384 break;
2386 case MEM:
2387 fputc ('[', file);
2388 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2389 fputc (']', file);
2390 break;
2392 case CONST_INT:
2393 output_addr_const (file, x);
2394 break;
2396 case CONST:
2397 case SYMBOL_REF:
2398 case LABEL_REF:
2399 /* We could use output_addr_const, but that can print things like
2400 "x-8", which breaks ptxas. Need to ensure it is output as
2401 "x+-8". */
2402 nvptx_print_address_operand (file, x, VOIDmode);
2403 break;
2405 case CONST_DOUBLE:
2406 long vals[2];
2407 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2408 vals[0] &= 0xffffffff;
2409 vals[1] &= 0xffffffff;
2410 if (mode == SFmode)
2411 fprintf (file, "0f%08lx", vals[0]);
2412 else
2413 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2414 break;
2416 default:
2417 output_addr_const (file, x);
2422 /* Record replacement regs used to deal with subreg operands. */
2423 struct reg_replace
2425 rtx replacement[MAX_RECOG_OPERANDS];
2426 machine_mode mode;
2427 int n_allocated;
2428 int n_in_use;
2431 /* Allocate or reuse a replacement in R and return the rtx. */
2433 static rtx
2434 get_replacement (struct reg_replace *r)
2436 if (r->n_allocated == r->n_in_use)
2437 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2438 return r->replacement[r->n_in_use++];
2441 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2442 the presence of subregs would break the rules for most instructions.
2443 Replace them with a suitable new register of the right size, plus
2444 conversion copyin/copyout instructions. */
2446 static void
2447 nvptx_reorg_subreg (void)
2449 struct reg_replace qiregs, hiregs, siregs, diregs;
2450 rtx_insn *insn, *next;
2452 qiregs.n_allocated = 0;
2453 hiregs.n_allocated = 0;
2454 siregs.n_allocated = 0;
2455 diregs.n_allocated = 0;
2456 qiregs.mode = QImode;
2457 hiregs.mode = HImode;
2458 siregs.mode = SImode;
2459 diregs.mode = DImode;
2461 for (insn = get_insns (); insn; insn = next)
2463 next = NEXT_INSN (insn);
2464 if (!NONDEBUG_INSN_P (insn)
2465 || asm_noperands (PATTERN (insn)) >= 0
2466 || GET_CODE (PATTERN (insn)) == USE
2467 || GET_CODE (PATTERN (insn)) == CLOBBER)
2468 continue;
2470 qiregs.n_in_use = 0;
2471 hiregs.n_in_use = 0;
2472 siregs.n_in_use = 0;
2473 diregs.n_in_use = 0;
2474 extract_insn (insn);
2475 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2477 for (int i = 0; i < recog_data.n_operands; i++)
2479 rtx op = recog_data.operand[i];
2480 if (GET_CODE (op) != SUBREG)
2481 continue;
2483 rtx inner = SUBREG_REG (op);
2485 machine_mode outer_mode = GET_MODE (op);
2486 machine_mode inner_mode = GET_MODE (inner);
2487 gcc_assert (s_ok);
2488 if (s_ok
2489 && (GET_MODE_PRECISION (inner_mode)
2490 >= GET_MODE_PRECISION (outer_mode)))
2491 continue;
2492 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2493 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2494 : outer_mode == HImode ? &hiregs
2495 : outer_mode == SImode ? &siregs
2496 : &diregs);
2497 rtx new_reg = get_replacement (r);
2499 if (recog_data.operand_type[i] != OP_OUT)
2501 enum rtx_code code;
2502 if (GET_MODE_PRECISION (inner_mode)
2503 < GET_MODE_PRECISION (outer_mode))
2504 code = ZERO_EXTEND;
2505 else
2506 code = TRUNCATE;
2508 rtx pat = gen_rtx_SET (new_reg,
2509 gen_rtx_fmt_e (code, outer_mode, inner));
2510 emit_insn_before (pat, insn);
2513 if (recog_data.operand_type[i] != OP_IN)
2515 enum rtx_code code;
2516 if (GET_MODE_PRECISION (inner_mode)
2517 < GET_MODE_PRECISION (outer_mode))
2518 code = TRUNCATE;
2519 else
2520 code = ZERO_EXTEND;
2522 rtx pat = gen_rtx_SET (inner,
2523 gen_rtx_fmt_e (code, inner_mode, new_reg));
2524 emit_insn_after (pat, insn);
2526 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
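/* For example (a sketch): a paradoxical input operand
       (subreg:SI (reg:QI 100) 0)
   is replaced by a fresh (reg:SI 200), preceded by the copy-in
       (set (reg:SI 200) (zero_extend:SI (reg:QI 100)))
   whereas an output operand of the same shape is instead followed
   by a truncating copy back into register 100.  */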
2531 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2532 first use. */
2534 static rtx
2535 nvptx_get_unisimt_master ()
2537 rtx &master = cfun->machine->unisimt_master;
2538 return master ? master : master = gen_reg_rtx (SImode);
2541 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2543 static rtx
2544 nvptx_get_unisimt_predicate ()
2546 rtx &pred = cfun->machine->unisimt_predicate;
2547 return pred ? pred : pred = gen_reg_rtx (BImode);
2550 /* Return true if given call insn references one of the functions provided by
2551 the CUDA runtime: malloc, free, vprintf. */
2553 static bool
2554 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2556 rtx pat = PATTERN (insn);
2557 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2558 pat = XVECEXP (pat, 0, 0);
2559 if (GET_CODE (pat) == SET)
2560 pat = SET_SRC (pat);
2561 gcc_checking_assert (GET_CODE (pat) == CALL
2562 && GET_CODE (XEXP (pat, 0)) == MEM);
2563 rtx addr = XEXP (XEXP (pat, 0), 0);
2564 if (GET_CODE (addr) != SYMBOL_REF)
2565 return false;
2566 const char *name = XSTR (addr, 0);
2567 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2568 references with forced assembler name refer to PTX syscalls. For vprintf,
2569 accept both normal and forced-assembler-name references. */
2570 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2571 || !strcmp (name, "*malloc")
2572 || !strcmp (name, "*free"));
2575 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2576 propagate its value from lane MASTER to current lane. */
2578 static void
2579 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2581 rtx reg;
2582 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2583 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2586 /* Adjust code for uniform-simt code generation variant by making atomics and
2587 "syscalls" conditionally executed, and inserting shuffle-based propagation
2588 for registers being set. */
2590 static void
2591 nvptx_reorg_uniform_simt ()
2593 rtx_insn *insn, *next;
2595 for (insn = get_insns (); insn; insn = next)
2597 next = NEXT_INSN (insn);
2598 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2599 && !(NONJUMP_INSN_P (insn)
2600 && GET_CODE (PATTERN (insn)) == PARALLEL
2601 && get_attr_atomic (insn)))
2602 continue;
2603 rtx pat = PATTERN (insn);
2604 rtx master = nvptx_get_unisimt_master ();
2605 for (int i = 0; i < XVECLEN (pat, 0); i++)
2606 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2607 rtx pred = nvptx_get_unisimt_predicate ();
2608 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2609 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2610 validate_change (insn, &PATTERN (insn), pat, false);
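/* The net effect, sketched in pseudo-RTL: an atomic update such as
       (set (reg:SI 30) (unspec_volatile [...] ...))
   becomes
       (cond_exec (ne master-predicate 0)
   		  (set (reg:SI 30) (unspec_volatile [...] ...)))
   followed by a SHUFFLE_IDX broadcast of register 30 from the
   master lane, so every lane observes the master's result.  */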
2614 /* Loop structure of the function. The entire function is described as
2615 a NULL loop. */
2617 struct parallel
2619 /* Parent parallel. */
2620 parallel *parent;
2622 /* Next sibling parallel. */
2623 parallel *next;
2625 /* First child parallel. */
2626 parallel *inner;
2628 /* Partitioning mask of the parallel. */
2629 unsigned mask;
2631 /* Partitioning used within inner parallels. */
2632 unsigned inner_mask;
2634 /* Location of parallel forked and join. The forked is the first
2635 block in the parallel and the join is the first block after
2636 the partition. */
2637 basic_block forked_block;
2638 basic_block join_block;
2640 rtx_insn *forked_insn;
2641 rtx_insn *join_insn;
2643 rtx_insn *fork_insn;
2644 rtx_insn *joining_insn;
2646 /* Basic blocks in this parallel, but not in child parallels. The
2647 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2648 blocks are not. */
2649 auto_vec<basic_block> blocks;
2651 public:
2652 parallel (parallel *parent, unsigned mode);
2653 ~parallel ();
2656 /* Constructor links the new parallel into its parent's chain of
2657 children. */
2659 parallel::parallel (parallel *parent_, unsigned mask_)
2660 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2662 forked_block = join_block = 0;
2663 forked_insn = join_insn = 0;
2664 fork_insn = joining_insn = 0;
2666 if (parent)
2668 next = parent->inner;
2669 parent->inner = this;
2673 parallel::~parallel ()
2675 delete inner;
2676 delete next;
2679 /* Map of basic blocks to insns. */
2680 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2682 /* A tuple of an insn of interest and the BB in which it resides. */
2683 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2684 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2686 /* Split basic blocks such that each forked and join unspec is at
2687 the start of its basic block. Thus afterwards each block will
2688 have a single partitioning mode. We also do the same for return
2689 insns, as they are executed by every thread. Return the
2690 partitioning mode of the function as a whole. Populate MAP with
2691 head and tail blocks. We also clear the BB visited flag, which is
2692 used when finding partitions. */
2694 static void
2695 nvptx_split_blocks (bb_insn_map_t *map)
2697 insn_bb_vec_t worklist;
2698 basic_block block;
2699 rtx_insn *insn;
2701 /* Locate all the reorg instructions of interest. */
2702 FOR_ALL_BB_FN (block, cfun)
2704 bool seen_insn = false;
2706 /* Clear visited flag, for use by parallel locator. */
2707 block->flags &= ~BB_VISITED;
2709 FOR_BB_INSNS (block, insn)
2711 if (!INSN_P (insn))
2712 continue;
2713 switch (recog_memoized (insn))
2715 default:
2716 seen_insn = true;
2717 continue;
2718 case CODE_FOR_nvptx_forked:
2719 case CODE_FOR_nvptx_join:
2720 break;
2722 case CODE_FOR_return:
2723 /* We also need to split just before return insns, as
2724 that insn needs executing by all threads, but the
2725 block it is in probably does not. */
2726 break;
2729 if (seen_insn)
2730 /* We've found an instruction that must be at the start of
2731 a block, but isn't. Add it to the worklist. */
2732 worklist.safe_push (insn_bb_t (insn, block));
2733 else
2734 /* It was already the first instruction. Just add it to
2735 the map. */
2736 map->get_or_insert (block) = insn;
2737 seen_insn = true;
2741 /* Split blocks on the worklist. */
2742 unsigned ix;
2743 insn_bb_t *elt;
2744 basic_block remap = 0;
2745 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2747 if (remap != elt->second)
2749 block = elt->second;
2750 remap = block;
2753 /* Split block before insn. The insn is in the new block. */
2754 edge e = split_block (block, PREV_INSN (elt->first));
2756 block = e->dest;
2757 map->get_or_insert (block) = elt->first;
2761 /* BLOCK is a basic block containing a head or tail instruction.
2762 Locate the associated prehead or pretail instruction, which must be
2763 in the single predecessor block. */
2765 static rtx_insn *
2766 nvptx_discover_pre (basic_block block, int expected)
2768 gcc_assert (block->preds->length () == 1);
2769 basic_block pre_block = (*block->preds)[0]->src;
2770 rtx_insn *pre_insn;
2772 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2773 pre_insn = PREV_INSN (pre_insn))
2774 gcc_assert (pre_insn != BB_HEAD (pre_block));
2776 gcc_assert (recog_memoized (pre_insn) == expected);
2777 return pre_insn;
2780 /* Dump this parallel and all its inner parallels. */
2782 static void
2783 nvptx_dump_pars (parallel *par, unsigned depth)
2785 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2786 depth, par->mask,
2787 par->forked_block ? par->forked_block->index : -1,
2788 par->join_block ? par->join_block->index : -1);
2790 fprintf (dump_file, " blocks:");
2792 basic_block block;
2793 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2794 fprintf (dump_file, " %d", block->index);
2795 fprintf (dump_file, "\n");
2796 if (par->inner)
2797 nvptx_dump_pars (par->inner, depth + 1);
2799 if (par->next)
2800 nvptx_dump_pars (par->next, depth);
2803 /* If BLOCK contains a fork/join marker, process it to create or
2804 terminate a loop structure. Add this block to the current loop,
2805 and then walk successor blocks. */
2807 static parallel *
2808 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2810 if (block->flags & BB_VISITED)
2811 return par;
2812 block->flags |= BB_VISITED;
2814 if (rtx_insn **endp = map->get (block))
2816 rtx_insn *end = *endp;
2818 /* This is a block head or tail, or return instruction. */
2819 switch (recog_memoized (end))
2821 case CODE_FOR_return:
2822 /* Return instructions are in their own block, and we
2823 don't need to do anything more. */
2824 return par;
2826 case CODE_FOR_nvptx_forked:
2827 /* Loop head, create a new inner loop and add it into
2828 our parent's child list. */
2830 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2832 gcc_assert (mask);
2833 par = new parallel (par, mask);
2834 par->forked_block = block;
2835 par->forked_insn = end;
2836 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2837 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2838 par->fork_insn
2839 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2841 break;
2843 case CODE_FOR_nvptx_join:
2844 /* A loop tail. Finish the current loop and return to
2845 parent. */
2847 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2849 gcc_assert (par->mask == mask);
2850 par->join_block = block;
2851 par->join_insn = end;
2852 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2853 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2854 par->joining_insn
2855 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2856 par = par->parent;
2858 break;
2860 default:
2861 gcc_unreachable ();
2865 if (par)
2866 /* Add this block onto the current loop's list of blocks. */
2867 par->blocks.safe_push (block);
2868 else
2869 /* This must be the entry block. Create a NULL parallel. */
2870 par = new parallel (0, 0);
2872 /* Walk successor blocks. */
2873 edge e;
2874 edge_iterator ei;
2876 FOR_EACH_EDGE (e, ei, block->succs)
2877 nvptx_find_par (map, par, e->dest);
2879 return par;
2882 /* DFS walk the CFG looking for fork & join markers. Construct
2883 loop structures as we go. MAP is a mapping of basic blocks
2884 to head & tail markers, discovered when splitting blocks. This
2885 speeds up the discovery. We rely on the BB visited flag having
2886 been cleared when splitting blocks. */
2888 static parallel *
2889 nvptx_discover_pars (bb_insn_map_t *map)
2891 basic_block block;
2893 /* Mark exit blocks as visited. */
2894 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2895 block->flags |= BB_VISITED;
2897 /* And entry block as not. */
2898 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2899 block->flags &= ~BB_VISITED;
2901 parallel *par = nvptx_find_par (map, 0, block);
2903 if (dump_file)
2905 fprintf (dump_file, "\nLoops\n");
2906 nvptx_dump_pars (par, 0);
2907 fprintf (dump_file, "\n");
2910 return par;
2913 /* Analyse a group of BBs within a partitioned region and create N
2914 Single-Entry-Single-Exit regions. Some of those regions will be
2915 trivial ones consisting of a single BB. The blocks of a
2916 partitioned region might form a set of disjoint graphs -- because
2917 the region encloses a differently partitioned sub-region.
2919 We use the linear-time algorithm described in 'Finding Regions Fast:
2920 Single Entry Single Exit and Control Regions in Linear Time' by
2921 Johnson, Pearson & Pingali. That algorithm deals with complete
2922 CFGs, where a back edge is inserted from END to START, and thus the
2923 problem becomes one of finding equivalent loops.
2925 In this case we have a partial CFG. We complete it by redirecting
2926 any incoming edge to the graph to be from an arbitrary external BB,
2927 and similarly redirecting any outgoing edge to be to that BB.
2928 Thus we end up with a closed graph.
2930 The algorithm works by building a spanning tree of an undirected
2931 graph and keeping track of back edges from nodes further from the
2932 root in the tree to nodes nearer to the root in the tree. In the
2933 description below, the root is up and the tree grows downwards.
2935 We avoid having to deal with degenerate back-edges to the same
2936 block, by splitting each BB into 3 -- one for input edges, one for
2937 the node itself and one for the output edges. Such back edges are
2938 referred to as 'Brackets'. Cycle equivalent nodes will have the
2939 same set of brackets.
2941 Determining bracket equivalency is done by maintaining a list of
2942 brackets in such a manner that the list length and final bracket
2943 uniquely identify the set.
2945 We use coloring to mark all BBs with cycle equivalency with the
2946 same color. This is the output of the 'Finding Regions Fast'
2947 algorithm. Notice it doesn't actually find the set of nodes within
2948 a particular region, just unorderd sets of nodes that are the
2949 entries and exits of SESE regions.
2951 After determining cycle equivalency, we need to find the minimal
2952 set of SESE regions. Do this with a DFS coloring walk of the
2953 complete graph. We're either 'looking' or 'coloring'. When
2954 looking, and we're in the subgraph, we start coloring the color of
2955 the current node, and remember that node as the start of the
2956 current color's SESE region. Every time we go to a new node, we
2957 decrement the count of nodes with that color. If it reaches zero,
2958 we remember that node as the end of the current color's SESE region
2959 and return to 'looking'. Otherwise we color the node the current
2960 color.
2962 This way we end up with coloring the inside of non-trivial SESE
2963 regions with the color of that region. */
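/* A tiny informal example: in a chain A -> B -> C closed by the
   fake backedge C -> A, every node carries the same single bracket,
   so A, B and C are cycle equivalent and get one color; the
   coloring walk then records A as the entry and C as the exit of a
   single non-trivial SESE region.  */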
2965 /* A pair of BBs. We use this to represent SESE regions. */
2966 typedef std::pair<basic_block, basic_block> bb_pair_t;
2967 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2969 /* A node in the undirected CFG. The discriminator SECOND indicates just
2970 above or just below the BB indicated by FIRST. */
2971 typedef std::pair<basic_block, int> pseudo_node_t;
2973 /* A bracket indicates an edge towards the root of the spanning tree of the
2974 undirected graph. Each bracket has a color, determined
2975 from the current set of brackets. */
2976 struct bracket
2978 pseudo_node_t back; /* Back target. */
2980 /* Current color and size of set. */
2981 unsigned color;
2982 unsigned size;
2984 bracket (pseudo_node_t back_)
2985 : back (back_), color (~0u), size (~0u)
2989 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2991 if (length != size)
2993 size = length;
2994 color = color_counts.length ();
2995 color_counts.quick_push (0);
2997 color_counts[color]++;
2998 return color;
3002 typedef auto_vec<bracket> bracket_vec_t;
3004 /* Basic block info for finding SESE regions. */
3006 struct bb_sese
3008 int node; /* Node number in spanning tree. */
3009 int parent; /* Parent node number. */
3011 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3012 edges arrive at pseudo-node Ai and the outgoing edges leave at
3013 pseudo-node Ao. We have to remember which way we arrived at a
3014 particular node when generating the spanning tree. dir > 0 means
3015 we arrived at Ai, dir < 0 means we arrived at Ao. */
3016 int dir;
3018 /* Lowest numbered pseudo-node reached via a backedge from this
3019 node, or any descendant. */
3020 pseudo_node_t high;
3022 int color; /* Cycle-equivalence color. */
3024 /* Stack of brackets for this node. */
3025 bracket_vec_t brackets;
3027 bb_sese (unsigned node_, unsigned p, int dir_)
3028 :node (node_), parent (p), dir (dir_)
3031 ~bb_sese ();
3033 /* Push a bracket ending at BACK. */
3034 void push (const pseudo_node_t &back)
3036 if (dump_file)
3037 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3038 back.first ? back.first->index : 0, back.second);
3039 brackets.safe_push (bracket (back));
3042 void append (bb_sese *child);
3043 void remove (const pseudo_node_t &);
3045 /* Set node's color. */
3046 void set_color (auto_vec<unsigned> &color_counts)
3048 color = brackets.last ().get_color (color_counts, brackets.length ());
3052 bb_sese::~bb_sese ()
3056 /* Destructively append CHILD's brackets. */
3058 void
3059 bb_sese::append (bb_sese *child)
3061 if (int len = child->brackets.length ())
3063 int ix;
3065 if (dump_file)
3067 for (ix = 0; ix < len; ix++)
3069 const pseudo_node_t &pseudo = child->brackets[ix].back;
3070 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3071 child->node, pseudo.first ? pseudo.first->index : 0,
3072 pseudo.second);
3075 if (!brackets.length ())
3076 std::swap (brackets, child->brackets);
3077 else
3079 brackets.reserve (len);
3080 for (ix = 0; ix < len; ix++)
3081 brackets.quick_push (child->brackets[ix]);
3086 /* Remove brackets that terminate at PSEUDO. */
3088 void
3089 bb_sese::remove (const pseudo_node_t &pseudo)
3091 unsigned removed = 0;
3092 int len = brackets.length ();
3094 for (int ix = 0; ix < len; ix++)
3096 if (brackets[ix].back == pseudo)
3098 if (dump_file)
3099 fprintf (dump_file, "Removing backedge %d:%+d\n",
3100 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3101 removed++;
3103 else if (removed)
3104 brackets[ix-removed] = brackets[ix];
3106 while (removed--)
3107 brackets.pop ();
3110 /* Accessors for BB's aux pointer. */
3111 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3112 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3114 /* DFS walk creating SESE data structures. Only cover nodes with
3115 BB_VISITED set. Append discovered blocks to LIST. We number in
3116 increments of 3 so that the above and below pseudo nodes can be
3117 implicitly numbered too. */
3119 static int
3120 nvptx_sese_number (int n, int p, int dir, basic_block b,
3121 auto_vec<basic_block> *list)
3123 if (BB_GET_SESE (b))
3124 return n;
3126 if (dump_file)
3127 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3128 b->index, n, p, dir);
3130 BB_SET_SESE (b, new bb_sese (n, p, dir));
3131 p = n;
3133 n += 3;
3134 list->quick_push (b);
3136 /* First walk the nodes on the 'other side' of this node, then walk
3137 the nodes on the same side. */
3138 for (unsigned ix = 2; ix; ix--)
3140 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3141 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3142 : offsetof (edge_def, src));
3143 edge e;
3144 edge_iterator (ei);
3146 FOR_EACH_EDGE (e, ei, edges)
3148 basic_block target = *(basic_block *)((char *)e + offset);
3150 if (target->flags & BB_VISITED)
3151 n = nvptx_sese_number (n, p, dir, target, list);
3153 dir = -dir;
3155 return n;
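/* For example, the first block visited is numbered 2, so its above
   and below pseudo-nodes are implicitly 1 and 3; the next block
   gets 5, with pseudo-nodes 4 and 6, and so on.  */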
3158 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3159 EDGES are the outgoing edges and OFFSET is the offset to the src
3160 or dst block on the edges. */
3162 static void
3163 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3164 vec<edge, va_gc> *edges, size_t offset)
3166 edge e;
3167 edge_iterator (ei);
3168 int hi_back = depth;
3169 pseudo_node_t node_back (0, depth);
3170 int hi_child = depth;
3171 pseudo_node_t node_child (0, depth);
3172 basic_block child = NULL;
3173 unsigned num_children = 0;
3174 int usd = -dir * sese->dir;
3176 if (dump_file)
3177 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3178 me->index, sese->node, dir);
3180 if (dir < 0)
3182 /* This is the above pseudo-child. It has the BB itself as an
3183 additional child node. */
3184 node_child = sese->high;
3185 hi_child = node_child.second;
3186 if (node_child.first)
3187 hi_child += BB_GET_SESE (node_child.first)->node;
3188 num_children++;
3191 /* Examine each edge.
3192 - if it is a child (a) append its bracket list and (b) record
3193 whether it is the child with the highest reaching bracket.
3194 - if it is an edge to ancestor, record whether it's the highest
3195 reaching backlink. */
3196 FOR_EACH_EDGE (e, ei, edges)
3198 basic_block target = *(basic_block *)((char *)e + offset);
3200 if (bb_sese *t_sese = BB_GET_SESE (target))
3202 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3204 /* Child node. Append its bracket list. */
3205 num_children++;
3206 sese->append (t_sese);
3208 /* Compare its hi value. */
3209 int t_hi = t_sese->high.second;
3211 if (basic_block child_hi_block = t_sese->high.first)
3212 t_hi += BB_GET_SESE (child_hi_block)->node;
3214 if (hi_child > t_hi)
3216 hi_child = t_hi;
3217 node_child = t_sese->high;
3218 child = target;
3221 else if (t_sese->node < sese->node + dir
3222 && !(dir < 0 && sese->parent == t_sese->node))
3224 /* Non-parental ancestor node -- a backlink. */
3225 int d = usd * t_sese->dir;
3226 int back = t_sese->node + d;
3228 if (hi_back > back)
3230 hi_back = back;
3231 node_back = pseudo_node_t (target, d);
3235 else
3236 { /* Fallen off graph, backlink to entry node. */
3237 hi_back = 0;
3238 node_back = pseudo_node_t (0, 0);
3242 /* Remove any brackets that terminate at this pseudo node. */
3243 sese->remove (pseudo_node_t (me, dir));
3245 /* Now push any backlinks from this pseudo node. */
3246 FOR_EACH_EDGE (e, ei, edges)
3248 basic_block target = *(basic_block *)((char *)e + offset);
3249 if (bb_sese *t_sese = BB_GET_SESE (target))
3251 if (t_sese->node < sese->node + dir
3252 && !(dir < 0 && sese->parent == t_sese->node))
3253 /* Non-parental ancestor node - backedge from me. */
3254 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3256 else
3258 /* Back edge to entry node. */
3259 sese->push (pseudo_node_t (0, 0));
3263 /* If this node leads directly or indirectly to a no-return region of
3264 the graph, then fake a backedge to entry node. */
3265 if (!sese->brackets.length () || !edges || !edges->length ())
3267 hi_back = 0;
3268 node_back = pseudo_node_t (0, 0);
3269 sese->push (node_back);
3272 /* Record the highest reaching backedge from us or a descendant. */
3273 sese->high = hi_back < hi_child ? node_back : node_child;
3275 if (num_children > 1)
3277 /* There is more than one child -- this is a Y shaped piece of
3278 spanning tree. We have to insert a fake backedge from this
3279 node to the highest ancestor reached by not-the-highest
3280 reaching child. Note that there may be multiple children
3281 with backedges to the same highest node. That's ok and we
3282 insert the edge to that highest node. */
3283 hi_child = depth;
3284 if (dir < 0 && child)
3286 node_child = sese->high;
3287 hi_child = node_child.second;
3288 if (node_child.first)
3289 hi_child += BB_GET_SESE (node_child.first)->node;
3292 FOR_EACH_EDGE (e, ei, edges)
3294 basic_block target = *(basic_block *)((char *)e + offset);
3296 if (target == child)
3297 /* Ignore the highest child. */
3298 continue;
3300 bb_sese *t_sese = BB_GET_SESE (target);
3301 if (!t_sese)
3302 continue;
3303 if (t_sese->parent != sese->node)
3304 /* Not a child. */
3305 continue;
3307 /* Compare its hi value. */
3308 int t_hi = t_sese->high.second;
3310 if (basic_block child_hi_block = t_sese->high.first)
3311 t_hi += BB_GET_SESE (child_hi_block)->node;
3313 if (hi_child > t_hi)
3315 hi_child = t_hi;
3316 node_child = t_sese->high;
3320 sese->push (node_child);
3325 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3326 proceed to successors. Set SESE entry and exit nodes of
3327 REGIONS. */
3329 static void
3330 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3331 basic_block block, int coloring)
3333 bb_sese *sese = BB_GET_SESE (block);
3335 if (block->flags & BB_VISITED)
3337 /* If we've already encountered this block, either we must not
3338 be coloring, or it must have been colored the current color. */
3339 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3340 return;
3343 block->flags |= BB_VISITED;
3345 if (sese)
3347 if (coloring < 0)
3349 /* Start coloring a region. */
3350 regions[sese->color].first = block;
3351 coloring = sese->color;
3354 if (!--color_counts[sese->color] && sese->color == coloring)
3356 /* Found final block of SESE region. */
3357 regions[sese->color].second = block;
3358 coloring = -1;
3360 else
3361 /* Color the node, so we can assert on revisiting the node
3362 that the graph is indeed SESE. */
3363 sese->color = coloring;
3365 else
3366 /* Fallen off the subgraph, we cannot be coloring. */
3367 gcc_assert (coloring < 0);
3369 /* Walk each successor block. */
3370 if (block->succs && block->succs->length ())
3372 edge e;
3373 edge_iterator ei;
3375 FOR_EACH_EDGE (e, ei, block->succs)
3376 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3378 else
3379 gcc_assert (coloring < 0);
3382 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3383 end up with NULL entries in it. */
3385 static void
3386 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3388 basic_block block;
3389 int ix;
3391 /* First clear each BB of the whole function. */
3392 FOR_ALL_BB_FN (block, cfun)
3394 block->flags &= ~BB_VISITED;
3395 BB_SET_SESE (block, 0);
3398 /* Mark blocks in the function that are in this graph. */
3399 for (ix = 0; blocks.iterate (ix, &block); ix++)
3400 block->flags |= BB_VISITED;
3402 /* Counts of nodes assigned to each color. There cannot be more
3403 colors than blocks (and hopefully there will be fewer). */
3404 auto_vec<unsigned> color_counts;
3405 color_counts.reserve (blocks.length ());
3407 /* Worklist of nodes in the spanning tree. Again, there cannot be
3408 more nodes in the tree than blocks (there will be fewer if the
3409 CFG of blocks is disjoint). */
3410 auto_vec<basic_block> spanlist;
3411 spanlist.reserve (blocks.length ());
3413 /* Make sure every block has its cycle class determined. */
3414 for (ix = 0; blocks.iterate (ix, &block); ix++)
3416 if (BB_GET_SESE (block))
3417 /* We already met this block in an earlier graph solve. */
3418 continue;
3420 if (dump_file)
3421 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3423 /* Number the nodes reachable from BLOCK in initial DFS order. */
3424 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3426 /* Now walk in reverse DFS order to find cycle equivalents. */
3427 while (spanlist.length ())
3429 block = spanlist.pop ();
3430 bb_sese *sese = BB_GET_SESE (block);
3432 /* Do the pseudo node below. */
3433 nvptx_sese_pseudo (block, sese, depth, +1,
3434 sese->dir > 0 ? block->succs : block->preds,
3435 (sese->dir > 0 ? offsetof (edge_def, dest)
3436 : offsetof (edge_def, src)));
3437 sese->set_color (color_counts);
3438 /* Do the pseudo node above. */
3439 nvptx_sese_pseudo (block, sese, depth, -1,
3440 sese->dir < 0 ? block->succs : block->preds,
3441 (sese->dir < 0 ? offsetof (edge_def, dest)
3442 : offsetof (edge_def, src)));
3444 if (dump_file)
3445 fprintf (dump_file, "\n");
3448 if (dump_file)
3450 unsigned count;
3451 const char *comma = "";
3453 fprintf (dump_file, "Found %d cycle equivalents\n",
3454 color_counts.length ());
3455 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3457 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3459 comma = "";
3460 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3461 if (BB_GET_SESE (block)->color == ix)
3463 block->flags |= BB_VISITED;
3464 fprintf (dump_file, "%s%d", comma, block->index);
3465 comma = ",";
3467 fprintf (dump_file, "}");
3468 comma = ", ";
3470 fprintf (dump_file, "\n");
3473 /* Now we've colored every block in the subgraph. We now need to
3474 determine the minimal set of SESE regions that cover that
3475 subgraph. Do this with a DFS walk of the complete function.
3476 During the walk we're either 'looking' or 'coloring'. When we
3477 reach the last node of a particular color, we stop coloring and
3478 return to looking. */
3480 /* There cannot be more SESE regions than colors. */
3481 regions.reserve (color_counts.length ());
3482 for (ix = color_counts.length (); ix--;)
3483 regions.quick_push (bb_pair_t (0, 0));
3485 for (ix = 0; blocks.iterate (ix, &block); ix++)
3486 block->flags &= ~BB_VISITED;
3488 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3490 if (dump_file)
3492 const char *comma = "";
3493 int len = regions.length ();
3495 fprintf (dump_file, "SESE regions:");
3496 for (ix = 0; ix != len; ix++)
3498 basic_block from = regions[ix].first;
3499 basic_block to = regions[ix].second;
3501 if (from)
3503 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3504 if (to != from)
3505 fprintf (dump_file, "->%d", to->index);
3507 int color = BB_GET_SESE (from)->color;
3509 /* Print the blocks within the region (excluding ends). */
3510 FOR_EACH_BB_FN (block, cfun)
3512 bb_sese *sese = BB_GET_SESE (block);
3514 if (sese && sese->color == color
3515 && block != from && block != to)
3516 fprintf (dump_file, ".%d", block->index);
3518 fprintf (dump_file, "}");
3520 comma = ",";
3522 fprintf (dump_file, "\n\n");
3525 for (ix = 0; blocks.iterate (ix, &block); ix++)
3526 delete BB_GET_SESE (block);
3529 #undef BB_SET_SESE
3530 #undef BB_GET_SESE
3532 /* Propagate live state at the start of a partitioned region. BLOCK
3533 provides the live register information, and might not contain
3534 INSN. Propagation is inserted just after INSN. RW indicates whether
3535 we are reading and/or writing state. This
3536 separation is needed for worker-level propagation, where we
3537 essentially do a spill & fill. FN is the underlying worker
3538 function to generate the propagation instructions for a single
3539 register. DATA is user data.
3541 We propagate the live register set and the entire frame. We could
3542 do better by (a) propagating just the live set that is used within
3543 the partitioned regions and (b) only propagating stack entries that
3544 are used. The latter might be quite hard to determine. */
3546 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3548 static void
3549 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3550 propagator_fn fn, void *data)
3552 bitmap live = DF_LIVE_IN (block);
3553 bitmap_iterator iterator;
3554 unsigned ix;
3556 /* Copy the frame array. */
3557 HOST_WIDE_INT fs = get_frame_size ();
3558 if (fs)
3560 rtx tmp = gen_reg_rtx (DImode);
3561 rtx idx = NULL_RTX;
3562 rtx ptr = gen_reg_rtx (Pmode);
3563 rtx pred = NULL_RTX;
3564 rtx_code_label *label = NULL;
3566 /* The frame size might not be DImode compatible, but the frame
3567 array's declaration will be. So it's ok to round up here. */
3568 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3569 /* Detect single iteration loop. */
3570 if (fs == 1)
3571 fs = 0;
3573 start_sequence ();
3574 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3575 if (fs)
3577 idx = gen_reg_rtx (SImode);
3578 pred = gen_reg_rtx (BImode);
3579 label = gen_label_rtx ();
3581 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3582 /* Allow worker function to initialize anything needed. */
3583 rtx init = fn (tmp, PM_loop_begin, fs, data);
3584 if (init)
3585 emit_insn (init);
3586 emit_label (label);
3587 LABEL_NUSES (label)++;
3588 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3590 if (rw & PM_read)
3591 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3592 emit_insn (fn (tmp, rw, fs, data));
3593 if (rw & PM_write)
3594 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3595 if (fs)
3597 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3598 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3599 emit_insn (gen_br_true_uni (pred, label));
3600 rtx fini = fn (tmp, PM_loop_end, fs, data);
3601 if (fini)
3602 emit_insn (fini);
3603 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3605 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3606 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3607 rtx cpy = get_insns ();
3608 end_sequence ();
3609 insn = emit_insn_after (cpy, insn);
3612 /* Copy live registers. */
3613 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3615 rtx reg = regno_reg_rtx[ix];
3617 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3619 rtx bcast = fn (reg, rw, 0, data);
3621 insn = emit_insn_after (bcast, insn);
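/* Shape of the emitted frame-copy loop, in pseudo code (a sketch;
   the FN-generated transfer is elided):
       ptr = frame_pointer;  idx = <frame size in DImode words>;
     loop:
       idx -= 1;
       tmp = *ptr;			// if RW includes PM_read
       <FN: move tmp to/from buffer>
       *ptr = tmp;			// if RW includes PM_write
       pred = idx != 0;  ptr += 8;
       @pred bra.uni loop;
   Live pseudo registers are then propagated one at a time via FN.  */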
3626 /* Worker for nvptx_vpropagate. */
3628 static rtx
3629 vprop_gen (rtx reg, propagate_mask pm,
3630 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3632 if (!(pm & PM_read_write))
3633 return 0;
3635 return nvptx_gen_vcast (reg);
3638 /* Propagate state that is live at start of BLOCK across the vectors
3639 of a single warp. Propagation is inserted just after INSN. */
3641 static void
3642 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3644 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3647 /* Worker for nvptx_wpropagate. */
3649 static rtx
3650 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3652 wcast_data_t *data = (wcast_data_t *)data_;
3654 if (pm & PM_loop_begin)
3656 /* Starting a loop, initialize pointer. */
3657 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3659 if (align > worker_bcast_align)
3660 worker_bcast_align = align;
3661 data->offset = (data->offset + align - 1) & ~(align - 1);
3663 data->ptr = gen_reg_rtx (Pmode);
3665 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3667 else if (pm & PM_loop_end)
3669 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3670 data->ptr = NULL_RTX;
3671 return clobber;
3673 else
3674 return nvptx_gen_wcast (reg, pm, rep, data);
3677 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3678 indicates if this is just before partitioned mode (do spill), or
3679 just after it starts (do fill). Sequence is inserted just after
3680 INSN. */
3682 static void
3683 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3685 wcast_data_t data;
3687 data.base = gen_reg_rtx (Pmode);
3688 data.offset = 0;
3689 data.ptr = NULL_RTX;
3691 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3692 if (data.offset)
3694 /* Stuff was emitted, initialize the base pointer now. */
3695 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3696 emit_insn_after (init, insn);
3698 if (worker_bcast_size < data.offset)
3699 worker_bcast_size = data.offset;
3703 /* Emit a worker-level synchronization barrier. We use different
3704 markers for before and after synchronizations. */
3706 static rtx
3707 nvptx_wsync (bool after)
3709 return gen_nvptx_barsync (GEN_INT (after));
3712 /* Single neutering according to MASK. FROM is the incoming block and
3713 TO is the outgoing block. These may be the same block. Insert at
3714 start of FROM:
3716 if (tid.<axis>) goto end.
3718 and insert before ending branch of TO (if there is such an insn):
3720 end:
3721 <possibly-broadcast-cond>
3722 <branch>
3724 We currently only use different FROM and TO when skipping an entire
3725 loop. We could do more if we detected superblocks. */
3727 static void
3728 nvptx_single (unsigned mask, basic_block from, basic_block to)
3730 rtx_insn *head = BB_HEAD (from);
3731 rtx_insn *tail = BB_END (to);
3732 unsigned skip_mask = mask;
3734 /* Find first insn of FROM block. */
3735 while (head != BB_END (from) && !INSN_P (head))
3736 head = NEXT_INSN (head);
3738 /* Find last insn of TO block. */
3739 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3740 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3741 tail = PREV_INSN (tail);
3743 /* Detect if tail is a branch. */
3744 rtx tail_branch = NULL_RTX;
3745 rtx cond_branch = NULL_RTX;
3746 if (tail && INSN_P (tail))
3748 tail_branch = PATTERN (tail);
3749 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3750 tail_branch = NULL_RTX;
3751 else
3753 cond_branch = SET_SRC (tail_branch);
3754 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3755 cond_branch = NULL_RTX;
3759 if (tail == head)
3761 /* If this is empty, do nothing. */
3762 if (!head || !INSN_P (head))
3763 return;
3765 /* If this is a dummy insn, do nothing. */
3766 switch (recog_memoized (head))
3768 default:
3769 break;
3770 case CODE_FOR_nvptx_fork:
3771 case CODE_FOR_nvptx_forked:
3772 case CODE_FOR_nvptx_joining:
3773 case CODE_FOR_nvptx_join:
3774 return;
3777 if (cond_branch)
3779 /* If we're only doing vector single, there's no need to
3780 emit skip code because we'll not insert anything. */
3781 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3782 skip_mask = 0;
3784 else if (tail_branch)
3785 /* Block with only unconditional branch. Nothing to do. */
3786 return;
3789 /* Insert the vector test inside the worker test. */
3790 unsigned mode;
3791 rtx_insn *before = tail;
3792 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3793 if (GOMP_DIM_MASK (mode) & skip_mask)
3795 rtx_code_label *label = gen_label_rtx ();
3796 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3798 if (!pred)
3800 pred = gen_reg_rtx (BImode);
3801 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3804 rtx br;
3805 if (mode == GOMP_DIM_VECTOR)
3806 br = gen_br_true (pred, label);
3807 else
3808 br = gen_br_true_uni (pred, label);
3809 emit_insn_before (br, head);
3811 LABEL_NUSES (label)++;
3812 if (tail_branch)
3813 before = emit_label_before (label, before);
3814 else
3815 emit_label_after (label, tail);
3818 /* Now deal with propagating the branch condition. */
3819 if (cond_branch)
3821 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3823 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3825 /* Vector mode only, do a shuffle. */
3826 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3828 else
3830 /* Includes worker mode, do spill & fill. By construction
3831 we should never have worker mode only. */
3832 wcast_data_t data;
3834 data.base = worker_bcast_sym;
3835 data.ptr = 0;
3837 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3838 worker_bcast_size = GET_MODE_SIZE (SImode);
3840 data.offset = 0;
3841 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3842 before);
3843 /* Barrier so other workers can see the write. */
3844 emit_insn_before (nvptx_wsync (false), tail);
3845 data.offset = 0;
3846 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3847 /* This barrier is needed to avoid worker zero clobbering
3848 the broadcast buffer before all the other workers have
3849 had a chance to read this instance of it. */
3850 emit_insn_before (nvptx_wsync (true), tail);
3853 extract_insn (tail);
3854 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3855 UNSPEC_BR_UNIFIED);
3856 validate_change (tail, recog_data.operand_loc[0], unsp, false);
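/* Sketch of the result for a worker-single region ending in a
   conditional branch (PTX-like pseudo code, names invented):
       @ %wpred bra.uni $Lskip;	// inserted at the head of FROM
       ...			// neutered single-worker code
       st.shared [bcast], %cond;	// master publishes the condition
     $Lskip:
       bar.sync;			// make the store visible
       ld.shared %cond, [bcast];	// every worker re-reads it
       bar.sync;			// protect the buffer for reuse
       @ %cond bra $Ltarget;	// now a uniform branch
   The vector-only case uses a warp shuffle instead of the shared
   buffer.  */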
3860 /* PAR is a parallel that is being skipped in its entirety according to
3861 MASK. Treat this as skipping a superblock starting at forked
3862 and ending at joining. */
3864 static void
3865 nvptx_skip_par (unsigned mask, parallel *par)
3867 basic_block tail = par->join_block;
3868 gcc_assert (tail->preds->length () == 1);
3870 basic_block pre_tail = (*tail->preds)[0]->src;
3871 gcc_assert (pre_tail->succs->length () == 1);
3873 nvptx_single (mask, par->forked_block, pre_tail);
3876 /* If PAR has a single inner parallel and PAR itself only contains
3877 empty entry and exit blocks, swallow the inner PAR. */
3879 static void
3880 nvptx_optimize_inner (parallel *par)
3882 parallel *inner = par->inner;
3884 /* We mustn't be the outer dummy par. */
3885 if (!par->mask)
3886 return;
3888 /* We must have a single inner par. */
3889 if (!inner || inner->next)
3890 return;
3892 /* We must only contain 2 blocks ourselves -- the head and tail of
3893 the inner par. */
3894 if (par->blocks.length () != 2)
3895 return;
3897 /* The partitionings must be disjoint. As we only have vector and
3898 worker partitioning, this is sufficient to guarantee the pars
3899 have adjacent partitioning. */
3900 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3901 /* This indicates malformed code generation. */
3902 return;
3904 /* The outer forked insn should be immediately followed by the inner
3905 fork insn. */
3906 rtx_insn *forked = par->forked_insn;
3907 rtx_insn *fork = BB_END (par->forked_block);
3909 if (NEXT_INSN (forked) != fork)
3910 return;
3911 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3913 /* The outer joining insn must immediately follow the inner join
3914 insn. */
3915 rtx_insn *joining = par->joining_insn;
3916 rtx_insn *join = inner->join_insn;
3917 if (NEXT_INSN (join) != joining)
3918 return;
3920 /* Preconditions met. Swallow the inner par. */
3921 if (dump_file)
3922 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3923 inner->mask, inner->forked_block->index,
3924 inner->join_block->index,
3925 par->mask, par->forked_block->index, par->join_block->index);
3927 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3929 par->blocks.reserve (inner->blocks.length ());
3930 while (inner->blocks.length ())
3931 par->blocks.quick_push (inner->blocks.pop ());
3933 par->inner = inner->inner;
3934 inner->inner = NULL;
3936 delete inner;
3939 /* Process the parallel PAR and all its contained
3940 parallels. We do everything but the neutering. Return mask of
3941 partitioned modes used within this parallel. */
3943 static unsigned
3944 nvptx_process_pars (parallel *par)
3946 if (nvptx_optimize)
3947 nvptx_optimize_inner (par);
3949 unsigned inner_mask = par->mask;
3951 /* Do the inner parallels first. */
3952 if (par->inner)
3954 par->inner_mask = nvptx_process_pars (par->inner);
3955 inner_mask |= par->inner_mask;
3958 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3959 /* No propagation needed for a call. */;
3960 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3962 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3963 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3964 /* Insert begin and end synchronizations. */
3965 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3966 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3968 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3969 nvptx_vpropagate (par->forked_block, par->forked_insn);
3971 /* Now do siblings. */
3972 if (par->next)
3973 inner_mask |= nvptx_process_pars (par->next);
3974 return inner_mask;
3977 /* Neuter the parallel described by PAR. We recurse in depth-first
3978 order. MODES are the partitioning of the execution and OUTER is
3979 the partitioning of the parallels we are contained in. */
3981 static void
3982 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3984 unsigned me = (par->mask
3985 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3986 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3987 unsigned skip_mask = 0, neuter_mask = 0;
3989 if (par->inner)
3990 nvptx_neuter_pars (par->inner, modes, outer | me);
3992 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3994 if ((outer | me) & GOMP_DIM_MASK (mode))
3995 {} /* Mode is partitioned: no neutering. */
3996 else if (!(modes & GOMP_DIM_MASK (mode)))
3997 {} /* Mode is not used: nothing to do. */
3998 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3999 || !par->forked_insn)
4000 /* Partitioned in inner parallels, or we're not partitioned
4001 at all: neuter individual blocks. */
4002 neuter_mask |= GOMP_DIM_MASK (mode);
4003 else if (!par->parent || !par->parent->forked_insn
4004 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4005 /* Parent isn't a partitioned parallel, or already contains this
4006 partitioning: skip this parallel at this level. */
4007 skip_mask |= GOMP_DIM_MASK (mode);
4008 else
4009 {} /* Parent will skip this parallel itself. */
4012 if (neuter_mask)
4013 {
4014 int ix, len;
4016 if (nvptx_optimize)
4017 {
4018 /* Neuter whole SESE regions. */
4019 bb_pair_vec_t regions;
4021 nvptx_find_sese (par->blocks, regions);
4022 len = regions.length ();
4023 for (ix = 0; ix != len; ix++)
4024 {
4025 basic_block from = regions[ix].first;
4026 basic_block to = regions[ix].second;
4028 if (from)
4029 nvptx_single (neuter_mask, from, to);
4030 else
4031 gcc_assert (!to);
4032 }
4033 }
4034 else
4035 {
4036 /* Neuter each BB individually. */
4037 len = par->blocks.length ();
4038 for (ix = 0; ix != len; ix++)
4039 {
4040 basic_block block = par->blocks[ix];
4042 nvptx_single (neuter_mask, block, block);
4043 }
4044 }
4045 }
4047 if (skip_mask)
4048 nvptx_skip_par (skip_mask, par);
4050 if (par->next)
4051 nvptx_neuter_pars (par->next, modes, outer);
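/* Editorial worked example of the classification above: assume an
   outer worker par containing a single inner vector par, with
   MODES == worker|vector.  For the inner par, both modes fall in
   (outer | me), so nothing is set.  For the outer par, worker is in
   ME (already partitioned, no neutering), while vector is in
   PAR->inner_mask, so vector lands in NEUTER_MASK and each block
   outside the inner par is neutered down to a single vector lane.
   A used mode that no par on the path partitions instead lands in
   SKIP_MASK at the outermost real par, branching around the whole
   region once.  */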
4054 /* PTX-specific reorganization
4055 - Split blocks at fork and join instructions
4056 - Compute live registers
4057 - Mark now-unused registers, so function begin doesn't declare
4058 unused registers.
4059 - Insert state propagation when entering partitioned mode
4060 - Insert neutering instructions when in single mode
4061 - Replace subregs with suitable sequences.
4062 */
4064 static void
4065 nvptx_reorg (void)
4067 /* We are freeing block_for_insn in the toplev to keep compatibility
4068 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4069 compute_bb_for_insn ();
4071 thread_prologue_and_epilogue_insns ();
4073 /* Split blocks and record interesting unspecs. */
4074 bb_insn_map_t bb_insn_map;
4076 nvptx_split_blocks (&bb_insn_map);
4078 /* Compute live registers. */
4079 df_clear_flags (DF_LR_RUN_DCE);
4080 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4081 df_live_add_problem ();
4082 df_live_set_all_dirty ();
4083 df_analyze ();
4084 regstat_init_n_sets_and_refs ();
4086 if (dump_file)
4087 df_dump (dump_file);
4089 /* Mark unused regs as unused. */
4090 int max_regs = max_reg_num ();
4091 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4092 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4093 regno_reg_rtx[i] = const0_rtx;
4095 /* Determine launch dimensions of the function. If it is not an
4096 offloaded function (i.e. this is a regular compiler), the
4097 function has no neutering. */
4098 tree attr = oacc_get_fn_attrib (current_function_decl);
4099 if (attr)
4100 {
4101 /* If we determined this mask before RTL expansion, we could
4102 elide emission of some levels of forks and joins. */
4103 unsigned mask = 0;
4104 tree dims = TREE_VALUE (attr);
4105 unsigned ix;
4107 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4108 {
4109 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4110 tree allowed = TREE_PURPOSE (dims);
4112 if (size != 1 && !(allowed && integer_zerop (allowed)))
4113 mask |= GOMP_DIM_MASK (ix);
4114 }
4115 /* If there is worker neutering, there must be vector
4116 neutering. Otherwise the hardware will fail. */
4117 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4118 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4120 /* Discover & process partitioned regions. */
4121 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4122 nvptx_process_pars (pars);
4123 nvptx_neuter_pars (pars, mask, 0);
4124 delete pars;
4125 }
4127 /* Replace subregs. */
4128 nvptx_reorg_subreg ();
4130 if (TARGET_UNIFORM_SIMT)
4131 nvptx_reorg_uniform_simt ();
4133 regstat_free_n_sets_and_refs ();
4135 df_finish_pass (true);
4138 /* Handle a "kernel" attribute; arguments as in
4139 struct attribute_spec.handler. */
4141 static tree
4142 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4143 int ARG_UNUSED (flags), bool *no_add_attrs)
4145 tree decl = *node;
4147 if (TREE_CODE (decl) != FUNCTION_DECL)
4148 {
4149 error ("%qE attribute only applies to functions", name);
4150 *no_add_attrs = true;
4151 }
4152 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4153 {
4154 error ("%qE attribute requires a void return type", name);
4155 *no_add_attrs = true;
4156 }
4158 return NULL_TREE;
4161 /* Handle a "shared" attribute; arguments as in
4162 struct attribute_spec.handler. */
4164 static tree
4165 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4166 int ARG_UNUSED (flags), bool *no_add_attrs)
4168 tree decl = *node;
4170 if (TREE_CODE (decl) != VAR_DECL)
4171 {
4172 error ("%qE attribute only applies to variables", name);
4173 *no_add_attrs = true;
4174 }
4175 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4176 {
4177 error ("%qE attribute not allowed with auto storage class", name);
4178 *no_add_attrs = true;
4179 }
4181 return NULL_TREE;
4184 /* Table of valid machine attributes. */
4185 static const struct attribute_spec nvptx_attribute_table[] =
4187 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
4188 affects_type_identity } */
4189 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
4190 { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false },
4191 { NULL, 0, 0, false, false, false, NULL, false }
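/* Editorial usage sketch (hedged): how these attributes look in user
   code compiled for nvptx, matching the checks the handlers above
   enforce.  */
#if 0
/* "kernel" only applies to functions and requires a void return.  */
void __attribute__((kernel)) my_kernel (int *out, int n);

/* "shared" needs static storage; file scope satisfies the check.  */
static int __attribute__((shared)) scratch[32];
#endif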
4194 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4196 static HOST_WIDE_INT
4197 nvptx_vector_alignment (const_tree type)
4199 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4201 return MIN (align, BIGGEST_ALIGNMENT);
4204 /* Indicate that INSN cannot be duplicated. */
4206 static bool
4207 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4209 switch (recog_memoized (insn))
4211 case CODE_FOR_nvptx_shufflesi:
4212 case CODE_FOR_nvptx_shufflesf:
4213 case CODE_FOR_nvptx_barsync:
4214 case CODE_FOR_nvptx_fork:
4215 case CODE_FOR_nvptx_forked:
4216 case CODE_FOR_nvptx_joining:
4217 case CODE_FOR_nvptx_join:
4218 return true;
4219 default:
4220 return false;
4224 /* Section anchors do not work. Initialization for flag_section_anchor
4225 probes the existence of the anchoring target hooks and prevents
4226 anchoring if they don't exist. However, we may be used with a
4227 host-side compiler that does support anchoring, and hence see the
4228 anchor flag set (as it's not recalculated). So provide an
4229 implementation denying anchoring. */
4231 static bool
4232 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4234 return false;
4237 /* Record a symbol for mkoffload to enter into the mapping table. */
4239 static void
4240 nvptx_record_offload_symbol (tree decl)
4242 switch (TREE_CODE (decl))
4244 case VAR_DECL:
4245 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4246 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4247 break;
4249 case FUNCTION_DECL:
4251 tree attr = oacc_get_fn_attrib (decl);
4252 /* OpenMP offloading does not set this attribute. */
4253 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4255 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4256 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4258 for (; dims; dims = TREE_CHAIN (dims))
4260 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4262 gcc_assert (!TREE_PURPOSE (dims));
4263 fprintf (asm_out_file, ", %#x", size);
4266 fprintf (asm_out_file, "\n");
4268 break;
4270 default:
4271 gcc_unreachable ();
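/* Editorial sketch of the output consumed by mkoffload (the values
   after the FUNC_MAP name are the launch dimensions taken from the
   "oacc function" attribute; the names and numbers here are made up):

   //:VAR_MAP "a_global_var"
   //:FUNC_MAP "an_offloaded_fn", 0x1, 0x20, 0x20
*/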
4275 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4276 at the start of a file. */
4278 static void
4279 nvptx_file_start (void)
4281 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4282 fputs ("\t.version\t3.1\n", asm_out_file);
4283 fputs ("\t.target\tsm_30\n", asm_out_file);
4284 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4285 fputs ("// END PREAMBLE\n", asm_out_file);
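/* Editorial sketch: on a 64-bit configuration (Pmode of 64 bits) the
   calls above write exactly:

   // BEGIN PREAMBLE
   .version 3.1
   .target sm_30
   .address_size 64
   // END PREAMBLE
*/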
4288 /* Emit a declaration for a worker-level buffer in .shared memory. */
4290 static void
4291 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4293 const char *name = XSTR (sym, 0);
4295 write_var_marker (file, true, false, name);
4296 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4297 align, name, size);
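/* Editorial sketch: for a hypothetical 128-byte buffer aligned to 8,
   the fprintf above produces (variable marker omitted):

   .shared .align 8 .u8 __some_buffer[128];
*/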
4300 /* Write out the function declarations we've collected and declare storage
4301 for the broadcast buffer. */
4303 static void
4304 nvptx_file_end (void)
4306 hash_table<tree_hasher>::iterator iter;
4307 tree decl;
4308 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4309 nvptx_record_fndecl (decl);
4310 fputs (func_decls.str().c_str(), asm_out_file);
4312 if (worker_bcast_size)
4313 write_worker_buffer (asm_out_file, worker_bcast_sym,
4314 worker_bcast_align, worker_bcast_size);
4316 if (worker_red_size)
4317 write_worker_buffer (asm_out_file, worker_red_sym,
4318 worker_red_align, worker_red_size);
4320 if (need_softstack_decl)
4321 {
4322 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4323 /* 32 is the maximum number of warps in a block. Even though it's an
4324 external declaration, emit the array size explicitly; otherwise, it
4325 may fail at PTX JIT time if the definition is later in link order. */
4326 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4327 POINTER_SIZE);
4328 }
4329 if (need_unisimt_decl)
4330 {
4331 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4332 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4333 }
4336 /* Expander for the shuffle builtins. */
4338 static rtx
4339 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4341 if (ignore)
4342 return target;
4344 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4345 NULL_RTX, mode, EXPAND_NORMAL);
4346 if (!REG_P (src))
4347 src = copy_to_mode_reg (mode, src);
4349 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4350 NULL_RTX, SImode, EXPAND_NORMAL);
4351 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4352 NULL_RTX, SImode, EXPAND_NORMAL);
4354 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4355 idx = copy_to_mode_reg (SImode, idx);
4357 rtx pat = nvptx_gen_shuffle (target, src, idx,
4358 (nvptx_shuffle_kind) INTVAL (op));
4359 if (pat)
4360 emit_insn (pat);
4362 return target;
4365 /* Worker reduction address expander. */
4367 static rtx
4368 nvptx_expand_worker_addr (tree exp, rtx target,
4369 machine_mode ARG_UNUSED (mode), int ignore)
4371 if (ignore)
4372 return target;
4374 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4375 if (align > worker_red_align)
4376 worker_red_align = align;
4378 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4379 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4380 if (size + offset > worker_red_size)
4381 worker_red_size = size + offset;
4383 rtx addr = worker_red_sym;
4384 if (offset)
4385 {
4386 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4387 addr = gen_rtx_CONST (Pmode, addr);
4388 }
4390 emit_move_insn (target, addr);
4392 return target;
4395 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4396 not require taking the address of any object, other than the memory
4397 cell being operated on. */
4399 static rtx
4400 nvptx_expand_cmp_swap (tree exp, rtx target,
4401 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4403 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4405 if (!target)
4406 target = gen_reg_rtx (mode);
4408 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4409 NULL_RTX, Pmode, EXPAND_NORMAL);
4410 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4411 NULL_RTX, mode, EXPAND_NORMAL);
4412 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4413 NULL_RTX, mode, EXPAND_NORMAL);
4414 rtx pat;
4416 mem = gen_rtx_MEM (mode, mem);
4417 if (!REG_P (cmp))
4418 cmp = copy_to_mode_reg (mode, cmp);
4419 if (!REG_P (src))
4420 src = copy_to_mode_reg (mode, src);
4422 if (mode == SImode)
4423 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4424 else
4425 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4427 emit_insn (pat);
4429 return target;
4433 /* Codes for all the NVPTX builtins. */
4434 enum nvptx_builtins
4436 NVPTX_BUILTIN_SHUFFLE,
4437 NVPTX_BUILTIN_SHUFFLELL,
4438 NVPTX_BUILTIN_WORKER_ADDR,
4439 NVPTX_BUILTIN_CMP_SWAP,
4440 NVPTX_BUILTIN_CMP_SWAPLL,
4441 NVPTX_BUILTIN_MAX
4444 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4446 /* Return the NVPTX builtin for CODE. */
4448 static tree
4449 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4451 if (code >= NVPTX_BUILTIN_MAX)
4452 return error_mark_node;
4454 return nvptx_builtin_decls[code];
4457 /* Set up all builtin functions for this target. */
4459 static void
4460 nvptx_init_builtins (void)
4462 #define DEF(ID, NAME, T) \
4463 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4464 = add_builtin_function ("__builtin_nvptx_" NAME, \
4465 build_function_type_list T, \
4466 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4467 #define ST sizetype
4468 #define UINT unsigned_type_node
4469 #define LLUINT long_long_unsigned_type_node
4470 #define PTRVOID ptr_type_node
4472 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4473 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4474 DEF (WORKER_ADDR, "worker_addr",
4475 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4476 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4477 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4479 #undef DEF
4480 #undef ST
4481 #undef UINT
4482 #undef LLUINT
4483 #undef PTRVOID
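/* Editorial usage sketch (hedged): what calls to two of the builtins
   defined above could look like in offloaded C code.  The shuffle
   kind constant is an assumption; SHUFFLE_DOWN's numeric value comes
   from enum nvptx_shuffle_kind, not from this sketch.  */
#if 0
enum { MY_SHUFFLE_DOWN = 1 };	/* Assumed value of SHUFFLE_DOWN.  */

unsigned
warp_neighbor_sum (unsigned v)
{
  /* Add the value held by the lane one below us (shfl.down by 1).  */
  return v + __builtin_nvptx_shuffle (v, 1, MY_SHUFFLE_DOWN);
}

unsigned
try_lock (unsigned *lock)
{
  /* Atomically replace 0 with 1; returns the previous value.  */
  return __builtin_nvptx_cmp_swap (lock, 0, 1);
}
#endif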
4486 /* Expand an expression EXP that calls a built-in function,
4487 with result going to TARGET if that's convenient
4488 (and in mode MODE if that's convenient).
4489 SUBTARGET may be used as the target for computing one of EXP's operands.
4490 IGNORE is nonzero if the value is to be ignored. */
4492 static rtx
4493 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4494 machine_mode mode, int ignore)
4496 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4497 switch (DECL_FUNCTION_CODE (fndecl))
4499 case NVPTX_BUILTIN_SHUFFLE:
4500 case NVPTX_BUILTIN_SHUFFLELL:
4501 return nvptx_expand_shuffle (exp, target, mode, ignore);
4503 case NVPTX_BUILTIN_WORKER_ADDR:
4504 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4506 case NVPTX_BUILTIN_CMP_SWAP:
4507 case NVPTX_BUILTIN_CMP_SWAPLL:
4508 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4510 default: gcc_unreachable ();
4514 /* Define dimension sizes for known hardware. */
4515 #define PTX_VECTOR_LENGTH 32
4516 #define PTX_WORKER_LENGTH 32
4517 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
4519 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4521 static int
4522 nvptx_simt_vf ()
4524 return PTX_VECTOR_LENGTH;
4527 /* Validate compute dimensions of an OpenACC offload or routine, fill
4528 in non-unity defaults. FN_LEVEL indicates the level at which a
4529 routine might spawn a loop. It is negative for non-routines. If
4530 DECL is null, we are validating the default dimensions. */
4532 static bool
4533 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4535 bool changed = false;
4537 /* The vector size must be 32, unless this is a SEQ routine. */
4538 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4539 && dims[GOMP_DIM_VECTOR] >= 0
4540 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4541 {
4542 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4543 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4544 dims[GOMP_DIM_VECTOR]
4545 ? G_("using vector_length (%d), ignoring %d")
4546 : G_("using vector_length (%d), ignoring runtime setting"),
4547 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4548 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4549 changed = true;
4550 }
4552 /* Check that the number of workers is not too large. */
4553 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4554 {
4555 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4556 "using num_workers (%d), ignoring %d",
4557 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4558 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4559 changed = true;
4560 }
4562 if (!decl)
4563 {
4564 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4565 if (dims[GOMP_DIM_WORKER] < 0)
4566 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4567 if (dims[GOMP_DIM_GANG] < 0)
4568 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4569 changed = true;
4570 }
4572 return changed;
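/* Editorial worked example: validating the default dimensions
   (DECL == NULL) turns { gang, worker, vector } = { -1, -1, -1 }
   into { PTX_GANG_DEFAULT, PTX_WORKER_LENGTH, PTX_VECTOR_LENGTH }
   = { 0, 32, 32 }, where 0 defers the gang count to the runtime.
   An explicit vector_length (64) on an offloaded region would be
   forced back to 32 by the first check above, with a warning.  */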
4575 /* Return maximum dimension size, or zero for unbounded. */
4577 static int
4578 nvptx_dim_limit (int axis)
4580 switch (axis)
4582 case GOMP_DIM_WORKER:
4583 return PTX_WORKER_LENGTH;
4585 case GOMP_DIM_VECTOR:
4586 return PTX_VECTOR_LENGTH;
4588 default:
4589 break;
4591 return 0;
4594 /* Determine whether fork & joins are needed. */
4596 static bool
4597 nvptx_goacc_fork_join (gcall *call, const int dims[],
4598 bool ARG_UNUSED (is_fork))
4600 tree arg = gimple_call_arg (call, 2);
4601 unsigned axis = TREE_INT_CST_LOW (arg);
4603 /* We only care about worker and vector partitioning. */
4604 if (axis < GOMP_DIM_WORKER)
4605 return false;
4607 /* If the size is 1, there's no partitioning. */
4608 if (dims[axis] == 1)
4609 return false;
4611 return true;
4614 /* Generate a PTX builtin function call that returns the address in
4615 the worker reduction buffer at OFFSET. TYPE is the type of the
4616 data at that location. */
4618 static tree
4619 nvptx_get_worker_red_addr (tree type, tree offset)
4621 machine_mode mode = TYPE_MODE (type);
4622 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4623 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4624 tree align = build_int_cst (unsigned_type_node,
4625 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4626 tree call = build_call_expr (fndecl, 3, offset, size, align);
4628 return fold_convert (build_pointer_type (type), call);
4631 /* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR. This
4632 function will cast the variable if necessary. */
4634 static void
4635 nvptx_generate_vector_shuffle (location_t loc,
4636 tree dest_var, tree var, unsigned shift,
4637 gimple_seq *seq)
4639 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4640 tree_code code = NOP_EXPR;
4641 tree arg_type = unsigned_type_node;
4642 tree var_type = TREE_TYPE (var);
4643 tree dest_type = var_type;
4645 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4646 var_type = TREE_TYPE (var_type);
4648 if (TREE_CODE (var_type) == REAL_TYPE)
4649 code = VIEW_CONVERT_EXPR;
4651 if (TYPE_SIZE (var_type)
4652 == TYPE_SIZE (long_long_unsigned_type_node))
4654 fn = NVPTX_BUILTIN_SHUFFLELL;
4655 arg_type = long_long_unsigned_type_node;
4658 tree call = nvptx_builtin_decl (fn, true);
4659 tree bits = build_int_cst (unsigned_type_node, shift);
4660 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4661 tree expr;
4663 if (var_type != dest_type)
4665 /* Do real and imaginary parts separately. */
4666 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4667 real = fold_build1 (code, arg_type, real);
4668 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4669 real = fold_build1 (code, var_type, real);
4671 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4672 imag = fold_build1 (code, arg_type, imag);
4673 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4674 imag = fold_build1 (code, var_type, imag);
4676 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4678 else
4680 expr = fold_build1 (code, arg_type, var);
4681 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4682 expr = fold_build1 (code, dest_type, expr);
4685 gimplify_assign (dest_var, expr, seq);
4688 /* Lazily generate the global lock var decl and return its address. */
4690 static tree
4691 nvptx_global_lock_addr ()
4693 tree v = global_lock_var;
4695 if (!v)
4696 {
4697 tree name = get_identifier ("__reduction_lock");
4698 tree type = build_qualified_type (unsigned_type_node,
4699 TYPE_QUAL_VOLATILE);
4700 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4701 global_lock_var = v;
4702 DECL_ARTIFICIAL (v) = 1;
4703 DECL_EXTERNAL (v) = 1;
4704 TREE_STATIC (v) = 1;
4705 TREE_PUBLIC (v) = 1;
4706 TREE_USED (v) = 1;
4707 mark_addressable (v);
4708 mark_decl_referenced (v);
4709 }
4711 return build_fold_addr_expr (v);
4714 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4715 GSI. We use a lockless scheme for nearly all cases, which looks
4716 like:
4717 actual = initval(OP);
4718 do {
4719 guess = actual;
4720 write = guess OP myval;
4721 actual = cmp&swap (ptr, guess, write)
4722 } while (actual bit-different-to guess);
4723 return write;
4725 This relies on a cmp&swap instruction, which is available for 32-
4726 and 64-bit types. Larger types must use a locking scheme. */
4728 static tree
4729 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
4730 tree ptr, tree var, tree_code op)
4732 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
4733 tree_code code = NOP_EXPR;
4734 tree arg_type = unsigned_type_node;
4735 tree var_type = TREE_TYPE (var);
4737 if (TREE_CODE (var_type) == COMPLEX_TYPE
4738 || TREE_CODE (var_type) == REAL_TYPE)
4739 code = VIEW_CONVERT_EXPR;
4741 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
4743 arg_type = long_long_unsigned_type_node;
4744 fn = NVPTX_BUILTIN_CMP_SWAPLL;
4747 tree swap_fn = nvptx_builtin_decl (fn, true);
4749 gimple_seq init_seq = NULL;
4750 tree init_var = make_ssa_name (arg_type);
4751 tree init_expr = omp_reduction_init_op (loc, op, var_type);
4752 init_expr = fold_build1 (code, arg_type, init_expr);
4753 gimplify_assign (init_var, init_expr, &init_seq);
4754 gimple *init_end = gimple_seq_last (init_seq);
4756 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
4758 /* Split the block just after the init stmts. */
4759 basic_block pre_bb = gsi_bb (*gsi);
4760 edge pre_edge = split_block (pre_bb, init_end);
4761 basic_block loop_bb = pre_edge->dest;
4762 pre_bb = pre_edge->src;
4763 /* Reset the iterator. */
4764 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4766 tree expect_var = make_ssa_name (arg_type);
4767 tree actual_var = make_ssa_name (arg_type);
4768 tree write_var = make_ssa_name (arg_type);
4770 /* Build and insert the reduction calculation. */
4771 gimple_seq red_seq = NULL;
4772 tree write_expr = fold_build1 (code, var_type, expect_var);
4773 write_expr = fold_build2 (op, var_type, write_expr, var);
4774 write_expr = fold_build1 (code, arg_type, write_expr);
4775 gimplify_assign (write_var, write_expr, &red_seq);
4777 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4779 /* Build & insert the cmp&swap sequence. */
4780 gimple_seq latch_seq = NULL;
4781 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
4782 ptr, expect_var, write_var);
4783 gimplify_assign (actual_var, swap_expr, &latch_seq);
4785 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
4786 NULL_TREE, NULL_TREE);
4787 gimple_seq_add_stmt (&latch_seq, cond);
4789 gimple *latch_end = gimple_seq_last (latch_seq);
4790 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
4792 /* Split the block just after the latch stmts. */
4793 edge post_edge = split_block (loop_bb, latch_end);
4794 basic_block post_bb = post_edge->dest;
4795 loop_bb = post_edge->src;
4796 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4798 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4799 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
4800 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
4801 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
4803 gphi *phi = create_phi_node (expect_var, loop_bb);
4804 add_phi_arg (phi, init_var, pre_edge, loc);
4805 add_phi_arg (phi, actual_var, loop_edge, loc);
4807 loop *loop = alloc_loop ();
4808 loop->header = loop_bb;
4809 loop->latch = loop_bb;
4810 add_loop (loop, loop_bb->loop_father);
4812 return fold_build1 (code, var_type, write_var);
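/* Editorial sketch of the loop built above, written as plain C for a
   32-bit float "+" reduction.  __sync_val_compare_and_swap stands in
   for the PTX cmp&swap builtin, and the memcpys mirror the
   VIEW_CONVERT_EXPR bit-casts between float and unsigned.  */
#if 0
#include <string.h>

static float
lockless_add (unsigned *ptr, float var)
{
  float zero = 0.0f, guess, write_val;
  unsigned expect, write_bits, actual;

  memcpy (&actual, &zero, sizeof actual);	/* initval (PLUS).  */
  do
    {
      expect = actual;
      memcpy (&guess, &expect, sizeof guess);
      write_val = guess + var;
      memcpy (&write_bits, &write_val, sizeof write_bits);
      actual = __sync_val_compare_and_swap (ptr, expect, write_bits);
    }
  while (actual != expect);	/* Retry if another lane intervened.  */

  return write_val;
}
#endif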
4815 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
4816 GSI. This is necessary for types larger than 64 bits, where there
4817 is no cmp&swap instruction to implement a lockless scheme. We use
4818 a lock variable in global memory.
4820 while (cmp&swap (&lock_var, 0, 1))
4821 continue;
4822 T accum = *ptr;
4823 accum = accum OP var;
4824 *ptr = accum;
4825 cmp&swap (&lock_var, 1, 0);
4826 return accum;
4828 A lock in global memory is necessary to force execution engine
4829 descheduling and avoid resource starvation that can occur if the
4830 lock is in .shared memory. */
4832 static tree
4833 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
4834 tree ptr, tree var, tree_code op)
4836 tree var_type = TREE_TYPE (var);
4837 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
4838 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
4839 tree uns_locked = build_int_cst (unsigned_type_node, 1);
4841 /* Split the block just before the gsi. Insert a gimple nop to make
4842 this easier. */
4843 gimple *nop = gimple_build_nop ();
4844 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
4845 basic_block entry_bb = gsi_bb (*gsi);
4846 edge entry_edge = split_block (entry_bb, nop);
4847 basic_block lock_bb = entry_edge->dest;
4848 /* Reset the iterator. */
4849 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4851 /* Build and insert the locking sequence. */
4852 gimple_seq lock_seq = NULL;
4853 tree lock_var = make_ssa_name (unsigned_type_node);
4854 tree lock_expr = nvptx_global_lock_addr ();
4855 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
4856 uns_unlocked, uns_locked);
4857 gimplify_assign (lock_var, lock_expr, &lock_seq);
4858 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
4859 NULL_TREE, NULL_TREE);
4860 gimple_seq_add_stmt (&lock_seq, cond);
4861 gimple *lock_end = gimple_seq_last (lock_seq);
4862 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
4864 /* Split the block just after the lock sequence. */
4865 edge locked_edge = split_block (lock_bb, lock_end);
4866 basic_block update_bb = locked_edge->dest;
4867 lock_bb = locked_edge->src;
4868 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4870 /* Create the lock loop ... */
4871 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4872 make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
4873 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
4874 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
4876 /* ... and the loop structure. */
4877 loop *lock_loop = alloc_loop ();
4878 lock_loop->header = lock_bb;
4879 lock_loop->latch = lock_bb;
4880 lock_loop->nb_iterations_estimate = 1;
4881 lock_loop->any_estimate = true;
4882 add_loop (lock_loop, entry_bb->loop_father);
4884 /* Build and insert the reduction calculation. */
4885 gimple_seq red_seq = NULL;
4886 tree acc_in = make_ssa_name (var_type);
4887 tree ref_in = build_simple_mem_ref (ptr);
4888 TREE_THIS_VOLATILE (ref_in) = 1;
4889 gimplify_assign (acc_in, ref_in, &red_seq);
4891 tree acc_out = make_ssa_name (var_type);
4892 tree update_expr = fold_build2 (op, var_type, ref_in, var);
4893 gimplify_assign (acc_out, update_expr, &red_seq);
4895 tree ref_out = build_simple_mem_ref (ptr);
4896 TREE_THIS_VOLATILE (ref_out) = 1;
4897 gimplify_assign (ref_out, acc_out, &red_seq);
4899 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4901 /* Build & insert the unlock sequence. */
4902 gimple_seq unlock_seq = NULL;
4903 tree unlock_expr = nvptx_global_lock_addr ();
4904 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
4905 uns_locked, uns_unlocked);
4906 gimplify_and_add (unlock_expr, &unlock_seq);
4907 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
4909 return acc_out;
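/* Editorial sketch of the same update under the global lock, in plain
   C with a long double payload standing in for a type wider than 64
   bits and __sync_val_compare_and_swap for the PTX builtin.  */
#if 0
static unsigned reduction_lock;		/* 0 unlocked, 1 locked.  */

static long double
lockfull_add (volatile long double *ptr, long double var)
{
  while (__sync_val_compare_and_swap (&reduction_lock, 0, 1))
    continue;				/* Spin until we own the lock.  */
  long double accum = *ptr;
  accum = accum + var;
  *ptr = accum;
  __sync_val_compare_and_swap (&reduction_lock, 1, 0);	/* Unlock.  */
  return accum;
}
#endif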
4912 /* Emit a sequence to update a reduction accumulator at *PTR with the
4913 value held in VAR using operator OP. Return the updated value.
4915 TODO: optimize for atomic ops and independent complex ops. */
4917 static tree
4918 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
4919 tree ptr, tree var, tree_code op)
4921 tree type = TREE_TYPE (var);
4922 tree size = TYPE_SIZE (type);
4924 if (size == TYPE_SIZE (unsigned_type_node)
4925 || size == TYPE_SIZE (long_long_unsigned_type_node))
4926 return nvptx_lockless_update (loc, gsi, ptr, var, op);
4927 else
4928 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
4931 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
4933 static void
4934 nvptx_goacc_reduction_setup (gcall *call)
4936 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4937 tree lhs = gimple_call_lhs (call);
4938 tree var = gimple_call_arg (call, 2);
4939 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4940 gimple_seq seq = NULL;
4942 push_gimplify_context (true);
4944 if (level != GOMP_DIM_GANG)
4946 /* Copy the receiver object. */
4947 tree ref_to_res = gimple_call_arg (call, 1);
4949 if (!integer_zerop (ref_to_res))
4950 var = build_simple_mem_ref (ref_to_res);
4953 if (level == GOMP_DIM_WORKER)
4955 /* Store incoming value to worker reduction buffer. */
4956 tree offset = gimple_call_arg (call, 5);
4957 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4958 tree ptr = make_ssa_name (TREE_TYPE (call));
4960 gimplify_assign (ptr, call, &seq);
4961 tree ref = build_simple_mem_ref (ptr);
4962 TREE_THIS_VOLATILE (ref) = 1;
4963 gimplify_assign (ref, var, &seq);
4966 if (lhs)
4967 gimplify_assign (lhs, var, &seq);
4969 pop_gimplify_context (NULL);
4970 gsi_replace_with_seq (&gsi, seq, true);
4973 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
4975 static void
4976 nvptx_goacc_reduction_init (gcall *call)
4978 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4979 tree lhs = gimple_call_lhs (call);
4980 tree var = gimple_call_arg (call, 2);
4981 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4982 enum tree_code rcode
4983 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4984 tree init = omp_reduction_init_op (gimple_location (call), rcode,
4985 TREE_TYPE (var));
4986 gimple_seq seq = NULL;
4988 push_gimplify_context (true);
4990 if (level == GOMP_DIM_VECTOR)
4992 /* Initialize the vector's non-zero lanes to INIT_VAL (OP). */
4993 tree tid = make_ssa_name (integer_type_node);
4994 tree dim_vector = gimple_call_arg (call, 3);
4995 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
4996 dim_vector);
4997 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
4998 NULL_TREE, NULL_TREE);
5000 gimple_call_set_lhs (tid_call, tid);
5001 gimple_seq_add_stmt (&seq, tid_call);
5002 gimple_seq_add_stmt (&seq, cond_stmt);
5004 /* Split the block just after the call. */
5005 edge init_edge = split_block (gsi_bb (gsi), call);
5006 basic_block init_bb = init_edge->dest;
5007 basic_block call_bb = init_edge->src;
5009 /* Fixup flags from call_bb to init_bb. */
5010 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5012 /* Set the initialization stmts. */
5013 gimple_seq init_seq = NULL;
5014 tree init_var = make_ssa_name (TREE_TYPE (var));
5015 gimplify_assign (init_var, init, &init_seq);
5016 gsi = gsi_start_bb (init_bb);
5017 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5019 /* Split block just after the init stmt. */
5020 gsi_prev (&gsi);
5021 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5022 basic_block dst_bb = inited_edge->dest;
5024 /* Create false edge from call_bb to dst_bb. */
5025 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5027 /* Create phi node in dst block. */
5028 gphi *phi = create_phi_node (lhs, dst_bb);
5029 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5030 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5032 /* Reset dominator of dst bb. */
5033 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5035 /* Reset the gsi. */
5036 gsi = gsi_for_stmt (call);
5038 else
5040 if (level == GOMP_DIM_GANG)
5042 /* If there's no receiver object, propagate the incoming VAR. */
5043 tree ref_to_res = gimple_call_arg (call, 1);
5044 if (integer_zerop (ref_to_res))
5045 init = var;
5048 gimplify_assign (lhs, init, &seq);
5051 pop_gimplify_context (NULL);
5052 gsi_replace_with_seq (&gsi, seq, true);
5055 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5057 static void
5058 nvptx_goacc_reduction_fini (gcall *call)
5060 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5061 tree lhs = gimple_call_lhs (call);
5062 tree ref_to_res = gimple_call_arg (call, 1);
5063 tree var = gimple_call_arg (call, 2);
5064 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5065 enum tree_code op
5066 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5067 gimple_seq seq = NULL;
5068 tree r = NULL_TREE;
5070 push_gimplify_context (true);
5072 if (level == GOMP_DIM_VECTOR)
5074 /* Emit binary shuffle tree. TODO: emit this as an actual loop,
5075 but that requires a method of emitting a unified jump at the
5076 gimple level. */
5077 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5079 tree other_var = make_ssa_name (TREE_TYPE (var));
5080 nvptx_generate_vector_shuffle (gimple_location (call),
5081 other_var, var, shfl, &seq);
5083 r = make_ssa_name (TREE_TYPE (var));
5084 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5085 var, other_var), &seq);
5086 var = r;
5089 else
5091 tree accum = NULL_TREE;
5093 if (level == GOMP_DIM_WORKER)
5095 /* Get reduction buffer address. */
5096 tree offset = gimple_call_arg (call, 5);
5097 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5098 tree ptr = make_ssa_name (TREE_TYPE (call));
5100 gimplify_assign (ptr, call, &seq);
5101 accum = ptr;
5103 else if (integer_zerop (ref_to_res))
5104 r = var;
5105 else
5106 accum = ref_to_res;
5108 if (accum)
5110 /* UPDATE the accumulator. */
5111 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5112 seq = NULL;
5113 r = nvptx_reduction_update (gimple_location (call), &gsi,
5114 accum, var, op);
5118 if (lhs)
5119 gimplify_assign (lhs, r, &seq);
5120 pop_gimplify_context (NULL);
5122 gsi_replace_with_seq (&gsi, seq, true);
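/* Editorial sketch of the shuffle-tree reduction emitted above, in
   plain C with a hypothetical shuffle_down helper in place of the
   __builtin_nvptx_shuffle call: after log2 (PTX_VECTOR_LENGTH) = 5
   halving steps, lane 0 holds the fully combined value.  */
#if 0
static unsigned
warp_reduce_add (unsigned var)
{
  for (int shfl = 32 / 2; shfl > 0; shfl >>= 1)
    var += shuffle_down (var, shfl);	/* Hypothetical helper.  */
  return var;
}
#endif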
5125 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5127 static void
5128 nvptx_goacc_reduction_teardown (gcall *call)
5130 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5131 tree lhs = gimple_call_lhs (call);
5132 tree var = gimple_call_arg (call, 2);
5133 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5134 gimple_seq seq = NULL;
5136 push_gimplify_context (true);
5137 if (level == GOMP_DIM_WORKER)
5139 /* Read the worker reduction buffer. */
5140 tree offset = gimple_call_arg (call, 5);
5141 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5142 tree ptr = make_ssa_name (TREE_TYPE (call));
5144 gimplify_assign (ptr, call, &seq);
5145 var = build_simple_mem_ref (ptr);
5146 TREE_THIS_VOLATILE (var) = 1;
5149 if (level != GOMP_DIM_GANG)
5151 /* Write to the receiver object. */
5152 tree ref_to_res = gimple_call_arg (call, 1);
5154 if (!integer_zerop (ref_to_res))
5155 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5158 if (lhs)
5159 gimplify_assign (lhs, var, &seq);
5161 pop_gimplify_context (NULL);
5163 gsi_replace_with_seq (&gsi, seq, true);
5166 /* NVPTX reduction expander. */
5168 static void
5169 nvptx_goacc_reduction (gcall *call)
5171 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5173 switch (code)
5175 case IFN_GOACC_REDUCTION_SETUP:
5176 nvptx_goacc_reduction_setup (call);
5177 break;
5179 case IFN_GOACC_REDUCTION_INIT:
5180 nvptx_goacc_reduction_init (call);
5181 break;
5183 case IFN_GOACC_REDUCTION_FINI:
5184 nvptx_goacc_reduction_fini (call);
5185 break;
5187 case IFN_GOACC_REDUCTION_TEARDOWN:
5188 nvptx_goacc_reduction_teardown (call);
5189 break;
5191 default:
5192 gcc_unreachable ();
5196 #undef TARGET_OPTION_OVERRIDE
5197 #define TARGET_OPTION_OVERRIDE nvptx_option_override
5199 #undef TARGET_ATTRIBUTE_TABLE
5200 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5202 #undef TARGET_LRA_P
5203 #define TARGET_LRA_P hook_bool_void_false
5205 #undef TARGET_LEGITIMATE_ADDRESS_P
5206 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
5208 #undef TARGET_PROMOTE_FUNCTION_MODE
5209 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
5211 #undef TARGET_FUNCTION_ARG
5212 #define TARGET_FUNCTION_ARG nvptx_function_arg
5213 #undef TARGET_FUNCTION_INCOMING_ARG
5214 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
5215 #undef TARGET_FUNCTION_ARG_ADVANCE
5216 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
5217 #undef TARGET_FUNCTION_ARG_BOUNDARY
5218 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
5219 #undef TARGET_PASS_BY_REFERENCE
5220 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
5221 #undef TARGET_FUNCTION_VALUE_REGNO_P
5222 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
5223 #undef TARGET_FUNCTION_VALUE
5224 #define TARGET_FUNCTION_VALUE nvptx_function_value
5225 #undef TARGET_LIBCALL_VALUE
5226 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
5227 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
5228 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
5229 #undef TARGET_GET_DRAP_RTX
5230 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
5231 #undef TARGET_SPLIT_COMPLEX_ARG
5232 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
5233 #undef TARGET_RETURN_IN_MEMORY
5234 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
5235 #undef TARGET_OMIT_STRUCT_RETURN_REG
5236 #define TARGET_OMIT_STRUCT_RETURN_REG true
5237 #undef TARGET_STRICT_ARGUMENT_NAMING
5238 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
5239 #undef TARGET_CALL_ARGS
5240 #define TARGET_CALL_ARGS nvptx_call_args
5241 #undef TARGET_END_CALL_ARGS
5242 #define TARGET_END_CALL_ARGS nvptx_end_call_args
5244 #undef TARGET_ASM_FILE_START
5245 #define TARGET_ASM_FILE_START nvptx_file_start
5246 #undef TARGET_ASM_FILE_END
5247 #define TARGET_ASM_FILE_END nvptx_file_end
5248 #undef TARGET_ASM_GLOBALIZE_LABEL
5249 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
5250 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
5251 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
5252 #undef TARGET_PRINT_OPERAND
5253 #define TARGET_PRINT_OPERAND nvptx_print_operand
5254 #undef TARGET_PRINT_OPERAND_ADDRESS
5255 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
5256 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
5257 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
5258 #undef TARGET_ASM_INTEGER
5259 #define TARGET_ASM_INTEGER nvptx_assemble_integer
5260 #undef TARGET_ASM_DECL_END
5261 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
5262 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
5263 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
5264 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
5265 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
5266 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
5267 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
5269 #undef TARGET_MACHINE_DEPENDENT_REORG
5270 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
5271 #undef TARGET_NO_REGISTER_ALLOCATION
5272 #define TARGET_NO_REGISTER_ALLOCATION true
5274 #undef TARGET_ENCODE_SECTION_INFO
5275 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
5276 #undef TARGET_RECORD_OFFLOAD_SYMBOL
5277 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
5279 #undef TARGET_VECTOR_ALIGNMENT
5280 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
5282 #undef TARGET_CANNOT_COPY_INSN_P
5283 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
5285 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
5286 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
5288 #undef TARGET_INIT_BUILTINS
5289 #define TARGET_INIT_BUILTINS nvptx_init_builtins
5290 #undef TARGET_EXPAND_BUILTIN
5291 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
5292 #undef TARGET_BUILTIN_DECL
5293 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
5295 #undef TARGET_SIMT_VF
5296 #define TARGET_SIMT_VF nvptx_simt_vf
5298 #undef TARGET_GOACC_VALIDATE_DIMS
5299 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
5301 #undef TARGET_GOACC_DIM_LIMIT
5302 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
5304 #undef TARGET_GOACC_FORK_JOIN
5305 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
5307 #undef TARGET_GOACC_REDUCTION
5308 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
5310 struct gcc_target targetm = TARGET_INITIALIZER;
5312 #include "gt-nvptx.h"