/* Target code for NVPTX.
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"

/* This file should be included last.  */
#include "target-def.h"
/* The kind of shuffle instruction.  */

enum nvptx_shuffle_kind
{
  SHUFFLE_UP,
  SHUFFLE_DOWN,
  SHUFFLE_BFLY,
  SHUFFLE_IDX,
  SHUFFLE_MAX
};

/* The various PTX memory areas an object might reside in.  */

enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};
/* We record the data area in the target symbol flags.  */

#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area) ((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		     & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
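/* For example, SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED) records the
   area in the three mach-dep flag bits, and SYMBOL_DATA_AREA (sym) then
   recovers DATA_AREA_SHARED.  Note the setter only ORs bits in, so it
   relies on the field still being zero, i.e. DATA_AREA_GENERIC.  */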
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;
/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
  SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
}
/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      gcc_unreachable ();
    }
}
/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	/* TODO: This would be a good place to check for a .shared or
	   other section name.  */
	area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}
/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}
/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}
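/* For instance, maybe_split_mode (TImode) is DImode and
   maybe_split_mode (DCmode) is DFmode, whereas SImode, which fits in a
   single register, yields VOIDmode.  */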
/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}
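/* As an illustration, pseudo 23 holding a TImode value, emitted with
   INNER_MODE DImode and the default SUBREG_OFFSET, comes out as the
   register pair "{%r23$8,%r23$0}" (high half first), while an explicit
   SUBREG_OFFSET of 0 selects just the low half, "%r23$0".  */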
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      if (!is_call)
	emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      if (!is_call)
	emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}
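/* Both helpers mask to the worker and vector axes because those are the
   only levels that are neutered and re-forked within a function body;
   gang-level partitioning is instead realized by the launch
   dimensions.  */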
/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}
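/* So, for example, QImode and HImode arguments are widened to SImode,
   and an SFmode argument to an unprototyped function is passed as
   DFmode, mirroring C's default argument promotions.  */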
/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
		    const_tree, bool named)
{
  if (mode == VOIDmode || !named)
    return NULL_RTX;

  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (mode == VOIDmode || !named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
			    machine_mode ARG_UNUSED (mode),
			    const_tree ARG_UNUSED (type),
			    bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}
/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
			 machine_mode mode, const_tree type,
			 bool ARG_UNUSED (named))
{
  return pass_in_memory (mode, type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}
/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}
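/* A sketch of the two shapes this produces for an SImode argument 0: in
   a prototype (FOR_REG < 0) it appends " (.param.u32 %in_ar0", while in
   a prologue with FOR_REG == 0 it appends
	.reg.u32 %ar0;
	ld.param.u32 %ar0, [%in_ar0];
   copying the incoming .param into a working register.  */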
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}
/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}
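/* E.g. an SImode return becomes "(.param.u32 %value_out) " in a
   prototype and "\t.reg.u32 %value;\n" in a prologue, assuming the
   usual nvptx name "%value" for NVPTX_RETURN_REGNUM.  */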
/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}
/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}
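/* The markers come out as comment lines such as
     // BEGIN GLOBAL FUNCTION DECL: foo
   which the nvptx-tools "linker" scans in order to slice the generated
   assembly into per-symbol pieces.  */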
/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
	name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (type == boolean_type_node);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}
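/* As a rough example, a global "int f (int)" declaration comes out as
     // BEGIN GLOBAL FUNCTION DECL: f
     .visible .func (.param.u32 %value_out) f (.param.u32 %in_ar0);
   (an illustrative sketch; the exact register names come from
   reg_names[] and write_return_mode above).  */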
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}
/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  : "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
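/* For a 16-byte frame aligned to 8 in the "%frame" register, this
   prints (assuming 64-bit POINTER_SIZE and that register name):
	.local .align 8 .b8 %frame_ar[16];
	.reg.u64 %frame;
	cvta.local.u64 %frame, %frame_ar;
   while a zero-sized frame only declares the register and sets it
   to 0.  */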
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
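/* E.g. for register 99 on the "y" axis this prints
	{
		.reg.u32 %y;
		mov.u32 %y, %tid.y;
		setp.ne.u32 %r99, %y, 0;
	}
   leaving %r99 true in every thread except lane zero of that axis.  */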
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Declare a local var for outgoing varargs.  */
  if (cfun->machine->has_varadic)
    init_frame (file, STACK_POINTER_REGNUM,
		UNITS_PER_WORD, crtl->outgoing_args_size);

  /* Declare a local variable for the frame.  Force its size to be
     DImode-compatible.  */
  HOST_WIDE_INT sz = get_frame_size ();
  if (sz || cfun->machine->has_chain)
    init_frame (file, FRAME_POINTER_REGNUM,
		crtl->stack_alignment_needed / BITS_PER_UNIT,
		(sz + GET_MODE_SIZE (DImode) - 1)
		& ~(HOST_WIDE_INT) (GET_MODE_SIZE (DImode) - 1));

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split != VOIDmode)
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
}
/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode) cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
	     nvptx_ptx_type_from_mode (mode, false),
	     reg_names[NVPTX_RETURN_REGNUM],
	     reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}
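/* A function returning an SImode value therefore ends with something
   like
	st.param.u32	[%value_out], %value;
	ret;
   again assuming the usual "%value" register name.  */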
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
	{
	  cfun->machine->is_varadic = true;
	  cfun->machine->has_varadic = true;
	  cfun->machine->num_args++;
	}
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}
/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  rtx callee = XEXP (address, 0);
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
	{
	  if (DECL_STATIC_CHAIN (decl))
	    cfun->machine->has_chain = true;

	  tree attr = get_oacc_fn_attrib (decl);
	  if (attr)
	    {
	      tree dims = TREE_VALUE (attr);

	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
		{
		  if (TREE_PURPOSE (dims)
		      && !integer_zerop (TREE_PURPOSE (dims)))
		    break;
		  /* Not on this axis.  */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}
/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}
/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}
/* Generate an instruction or sequence to shuffle register SRC into
   register DST across the lanes of a warp, according to IDX and
   KIND.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case QImode:
    case HImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
						    tmp)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
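/* For an SImode REG the broadcast is a single shuffle of the form
     shfl.idx.b32 %r, %r, 0, 31;
   (a sketch; the exact PTX comes from the nvptx_shufflesi pattern in
   nvptx.md).  Wider modes are unpacked, shuffled as two SImode halves
   and repacked, as nvptx_gen_shuffle above shows.  */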
/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    if (align > worker_bcast_align)
	      worker_bcast_align = align;
	    data->offset = (data->offset + align - 1) & ~(align - 1);
	    addr = data->base;
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
	return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
				       out.  */
  unsigned size; /* Fragment size to accumulate.  */
  unsigned offset; /* Offset within current fragment.  */
  bool started; /* Whether we've output any initializer.  */
} init_frag;
/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  */

static void
output_init_frag (rtx sym)
{
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      fprintf (asm_out_file, val ? ") + " : ")");
    }

  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}
/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;

  for (unsigned part = 0; size; size -= part)
    {
      val >>= part * BITS_PER_UNIT;
      part = init_frag.size - init_frag.offset;
      if (part > size)
	part = size;

      unsigned HOST_WIDE_INT partial
	= val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
	output_init_frag (NULL);
    }
}
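/* As a worked example with a 4-byte fragment: emitting the bytes 0x11,
   0x22, 0x33, 0x44 one at a time accumulates init_frag.val as 0x11,
   0x2211, 0x332211, 0x44332211 (bytes are packed little-endian at
   increasing offsets), and the full fragment is then flushed by
   output_init_frag.  */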
/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
	 CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
	sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}
/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      if (part > size)
	part = (unsigned) size;
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  */
  if (size < init_frag.remaining * init_frag.size)
    {
      while (size >= init_frag.size)
	{
	  size -= init_frag.size;
	  output_init_frag (NULL_RTX);
	}
      if (size)
	nvptx_assemble_value (0, size);
    }
}
/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  */

void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}
/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   the opposite way round from what PTX wants them!  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
			   const_tree type, HOST_WIDE_INT size, unsigned align)
{
  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u%d ",
	   section, align / BITS_PER_UNIT,
	   elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
}
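/* So for a global "int x[3]" this prints something like
     .global .align 4 .u32 x[3]
   after which nvptx_assemble_value and friends append the initializer,
   e.g. " = { 1, 2, 3 }" (the linkage specifier and the terminating ';'
   are the caller's and nvptx_assemble_decl_end's responsibility, as
   noted above).  */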
/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (init_frag.offset)
    /* This can happen with a packed struct with trailing array member.  */
    nvptx_assemble_value (0, init_frag.size - init_frag.offset);
  fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
}
/* Output an uninitialized common or file-scope variable.  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
			   const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");

  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size, align);
  nvptx_assemble_decl_end ();
}

/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT obj_size)
{
  write_var_marker (file, true, false, name);

  fprintf (file, "\t");

  tree type = TREE_TYPE (exp);
  nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
			     TYPE_ALIGN (type));
}

/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
			  : DECL_WEAK (decl) ? ".weak " : ".visible "));

  tree type = TREE_TYPE (decl);
  HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     type, obj_size, DECL_ALIGN (decl));
}
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}

/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  /* We support weak definitions, and hence have the right
     ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
  if (DECL_WEAK (decl))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "PTX does not support weak declarations"
	      " (only weak definitions)");
  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  tree size = DECL_SIZE_UNIT (decl);
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl));
  nvptx_assemble_decl_end ();
}
/* Output a pattern for a move instruction.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    return "%.\tmov.b%T0\t%0, %1;";

  return "%.\tcvt%t0%t1\t%0, %1;";
}
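/* The returned strings are output templates, not final assembly: "%."
   expands to the instruction predicate, "%t0"/"%T0" to the type suffix
   or bit-size of operand 0, and "%D1" to the data area of operand 1,
   all handled by nvptx_print_operand below.  */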
/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
	     reg_names[NVPTX_RETURN_REGNUM]);

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
	nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
	nvptx_record_fndecl (decl);
    }

  if (needs_tgt)
    {
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_fn_proto_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);
      const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
	       "\t\tst.param%s [%%out_arg%d], ",
	       ptx_type, argno, ptx_type, argno);
      output_reg (asm_out_file, REGNO (t), VOIDmode);
      fprintf (asm_out_file, ";\n");
    }

  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);

  if (decl)
    {
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      assemble_name (asm_out_file, name);
    }
  else
    output_address (VOIDmode, callee);

  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  if (find_reg_note (insn, REG_NORETURN, NULL))
    /* Noreturn functions confuse the PTX JIT, as it doesn't realize
       the flow control barrier they imply.  It can seg fault if it
       encounters what looks like an unexitable loop.  Emit a trailing
       trap, which it does grok.  */
    fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");

  if (result)
    {
      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];

      if (!rval[0])
	/* We must escape the '%' that starts RETURN_REGNUM.  */
	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
		 reg_names[NVPTX_RETURN_REGNUM]);
      return rval;
    }

  return "}";
}
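/* Putting it together, a direct call "x = f (y)" is printed roughly as
	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r24;
		call (%value_in), f, (%out_arg1);
		ld.param.u32 %r25, [%value_in];
	}
   (an illustrative sketch; the register numbers are whatever pseudos
   the function happens to use).  */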
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  return c == '.' || c == '#';
}

static void nvptx_print_operand (FILE *, rtx, int);
/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}

/* Write assembly language output for the address ADDR to FILE.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}
/* Print an operand, X, to FILE, with an optional modifier in CODE.

   Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
	unconditional one.
   # -- print a rounding mode for the instruction

   A -- print a data area for a MEM
   c -- print an opcode suffix for a comparison operator, including a type code
   D -- print a data area for a MEM operand
   j -- print '@' for a predicated instruction; J prints '@!' (negated
	predicate)
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.  */
2008 static void
2009 nvptx_print_operand (FILE *file, rtx x, int code)
2011 if (code == '.')
2013 x = current_insn_predicate;
2014 if (x)
2016 unsigned int regno = REGNO (XEXP (x, 0));
2017 fputs ("[", file);
2018 if (GET_CODE (x) == EQ)
2019 fputs ("!", file);
2020 fputs (reg_names [regno], file);
2021 fputs ("]", file);
2023 return;
2025 else if (code == '#')
2027 fputs (".rn", file);
2028 return;
2031 enum rtx_code x_code = GET_CODE (x);
2032 machine_mode mode = GET_MODE (x);
2034 switch (code)
2036 case 'A':
2037 x = XEXP (x, 0);
2038 /* FALLTHROUGH. */
2040 case 'D':
2041 if (GET_CODE (x) == CONST)
2042 x = XEXP (x, 0);
2043 if (GET_CODE (x) == PLUS)
2044 x = XEXP (x, 0);
2046 if (GET_CODE (x) == SYMBOL_REF)
2047 fputs (section_for_sym (x), file);
2048 break;
2050 case 't':
2051 case 'u':
2052 if (x_code == SUBREG)
2054 mode = GET_MODE (SUBREG_REG (x));
2055 if (mode == TImode)
2056 mode = DImode;
2057 else if (COMPLEX_MODE_P (mode))
2058 mode = GET_MODE_INNER (mode);
2060 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2061 break;
2063 case 'S':
2065 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2066 /* Same order as nvptx_shuffle_kind. */
2067 static const char *const kinds[] =
2068 {".up", ".down", ".bfly", ".idx"};
2069 fputs (kinds[kind], file);
2071 break;
2073 case 'T':
2074 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2075 break;
2077 case 'j':
2078 fprintf (file, "@");
2079 goto common;
2081 case 'J':
2082 fprintf (file, "@!");
2083 goto common;
2085 case 'c':
2086 mode = GET_MODE (XEXP (x, 0));
2087 switch (x_code)
2089 case EQ:
2090 fputs (".eq", file);
2091 break;
2092 case NE:
2093 if (FLOAT_MODE_P (mode))
2094 fputs (".neu", file);
2095 else
2096 fputs (".ne", file);
2097 break;
2098 case LE:
2099 case LEU:
2100 fputs (".le", file);
2101 break;
2102 case GE:
2103 case GEU:
2104 fputs (".ge", file);
2105 break;
2106 case LT:
2107 case LTU:
2108 fputs (".lt", file);
2109 break;
2110 case GT:
2111 case GTU:
2112 fputs (".gt", file);
2113 break;
2114 case LTGT:
2115 fputs (".ne", file);
2116 break;
2117 case UNEQ:
2118 fputs (".equ", file);
2119 break;
2120 case UNLE:
2121 fputs (".leu", file);
2122 break;
2123 case UNGE:
2124 fputs (".geu", file);
2125 break;
2126 case UNLT:
2127 fputs (".ltu", file);
2128 break;
2129 case UNGT:
2130 fputs (".gtu", file);
2131 break;
2132 case UNORDERED:
2133 fputs (".nan", file);
2134 break;
2135 case ORDERED:
2136 fputs (".num", file);
2137 break;
2138 default:
2139 gcc_unreachable ();
2141 if (FLOAT_MODE_P (mode)
2142 || x_code == EQ || x_code == NE
2143 || x_code == GEU || x_code == GTU
2144 || x_code == LEU || x_code == LTU)
2145 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2146 else
2147 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2148 break;
2149 default:
2150 common:
2151 switch (x_code)
2153 case SUBREG:
2155 rtx inner_x = SUBREG_REG (x);
2156 machine_mode inner_mode = GET_MODE (inner_x);
2157 machine_mode split = maybe_split_mode (inner_mode);
2159 if (split != VOIDmode
2160 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2161 output_reg (file, REGNO (inner_x), split);
2162 else
2163 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2165 break;
2167 case REG:
2168 output_reg (file, REGNO (x), maybe_split_mode (mode));
2169 break;
2171 case MEM:
2172 fputc ('[', file);
2173 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2174 fputc (']', file);
2175 break;
2177 case CONST_INT:
2178 output_addr_const (file, x);
2179 break;
2181 case CONST:
2182 case SYMBOL_REF:
2183 case LABEL_REF:
2184 /* We could use output_addr_const, but that can print things like
2185 "x-8", which breaks ptxas. Need to ensure it is output as
2186 "x+-8". */
2187 nvptx_print_address_operand (file, x, VOIDmode);
2188 break;
2190 case CONST_DOUBLE:
2191 long vals[2];
2192 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2193 vals[0] &= 0xffffffff;
2194 vals[1] &= 0xffffffff;
2195 if (mode == SFmode)
2196 fprintf (file, "0f%08lx", vals[0]);
2197 else
2198 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2199 break;
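/* For instance, 1.0 in SFmode prints as "0f3f800000", and 1.0 in
   DFmode prints as "0d3ff0000000000000". */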
2201 default:
2202 output_addr_const (file, x);
2207 /* Record replacement regs used to deal with subreg operands. */
2208 struct reg_replace
2210 rtx replacement[MAX_RECOG_OPERANDS];
2211 machine_mode mode;
2212 int n_allocated;
2213 int n_in_use;
2216 /* Allocate or reuse a replacement in R and return the rtx. */
2218 static rtx
2219 get_replacement (struct reg_replace *r)
2221 if (r->n_allocated == r->n_in_use)
2222 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2223 return r->replacement[r->n_in_use++];
2226 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2227 the presence of subregs would break the rules for most instructions.
2228 Replace them with a suitable new register of the right size, plus
2229 conversion copyin/copyout instructions. */
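/* A sketch: given a paradoxical subreg such as (subreg:SI (reg:QI R) 0),
   an input use is rewritten to a fresh SImode register T, with
   T = (zero_extend:SI R) emitted before the insn; an output use gets
   R = (truncate:QI T) emitted after it. */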
2231 static void
2232 nvptx_reorg_subreg (void)
2234 struct reg_replace qiregs, hiregs, siregs, diregs;
2235 rtx_insn *insn, *next;
2237 qiregs.n_allocated = 0;
2238 hiregs.n_allocated = 0;
2239 siregs.n_allocated = 0;
2240 diregs.n_allocated = 0;
2241 qiregs.mode = QImode;
2242 hiregs.mode = HImode;
2243 siregs.mode = SImode;
2244 diregs.mode = DImode;
2246 for (insn = get_insns (); insn; insn = next)
2248 next = NEXT_INSN (insn);
2249 if (!NONDEBUG_INSN_P (insn)
2250 || asm_noperands (PATTERN (insn)) >= 0
2251 || GET_CODE (PATTERN (insn)) == USE
2252 || GET_CODE (PATTERN (insn)) == CLOBBER)
2253 continue;
2255 qiregs.n_in_use = 0;
2256 hiregs.n_in_use = 0;
2257 siregs.n_in_use = 0;
2258 diregs.n_in_use = 0;
2259 extract_insn (insn);
2260 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2262 for (int i = 0; i < recog_data.n_operands; i++)
2264 rtx op = recog_data.operand[i];
2265 if (GET_CODE (op) != SUBREG)
2266 continue;
2268 rtx inner = SUBREG_REG (op);
2270 machine_mode outer_mode = GET_MODE (op);
2271 machine_mode inner_mode = GET_MODE (inner);
2272 gcc_assert (s_ok);
2273 if (s_ok
2274 && (GET_MODE_PRECISION (inner_mode)
2275 >= GET_MODE_PRECISION (outer_mode)))
2276 continue;
2277 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2278 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2279 : outer_mode == HImode ? &hiregs
2280 : outer_mode == SImode ? &siregs
2281 : &diregs);
2282 rtx new_reg = get_replacement (r);
2284 if (recog_data.operand_type[i] != OP_OUT)
2286 enum rtx_code code;
2287 if (GET_MODE_PRECISION (inner_mode)
2288 < GET_MODE_PRECISION (outer_mode))
2289 code = ZERO_EXTEND;
2290 else
2291 code = TRUNCATE;
2293 rtx pat = gen_rtx_SET (new_reg,
2294 gen_rtx_fmt_e (code, outer_mode, inner));
2295 emit_insn_before (pat, insn);
2298 if (recog_data.operand_type[i] != OP_IN)
2300 enum rtx_code code;
2301 if (GET_MODE_PRECISION (inner_mode)
2302 < GET_MODE_PRECISION (outer_mode))
2303 code = TRUNCATE;
2304 else
2305 code = ZERO_EXTEND;
2307 rtx pat = gen_rtx_SET (inner,
2308 gen_rtx_fmt_e (code, inner_mode, new_reg));
2309 emit_insn_after (pat, insn);
2311 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2316 /* Loop structure of the function. The entire function is described as
2317 a NULL loop. */
2319 struct parallel
2321 /* Parent parallel. */
2322 parallel *parent;
2324 /* Next sibling parallel. */
2325 parallel *next;
2327 /* First child parallel. */
2328 parallel *inner;
2330 /* Partitioning mask of the parallel. */
2331 unsigned mask;
2333 /* Partitioning used within inner parallels. */
2334 unsigned inner_mask;
2336 /* Location of the parallel's forked and join markers. The forked
2337 block is the first block in the parallel and the join block is
2338 the first block after the partition. */
2339 basic_block forked_block;
2340 basic_block join_block;
2342 rtx_insn *forked_insn;
2343 rtx_insn *join_insn;
2345 rtx_insn *fork_insn;
2346 rtx_insn *joining_insn;
2348 /* Basic blocks in this parallel, but not in child parallels. The
2349 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2350 blocks are not. */
2351 auto_vec<basic_block> blocks;
2353 public:
2354 parallel (parallel *parent, unsigned mode);
2355 ~parallel ();
2358 /* Constructor links the new parallel into its parent's chain of
2359 children. */
2361 parallel::parallel (parallel *parent_, unsigned mask_)
2362 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2364 forked_block = join_block = 0;
2365 forked_insn = join_insn = 0;
2366 fork_insn = joining_insn = 0;
2368 if (parent)
2370 next = parent->inner;
2371 parent->inner = this;
2375 parallel::~parallel ()
2377 delete inner;
2378 delete next;
2381 /* Map of basic blocks to insns. */
2382 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2384 /* A tuple of an insn of interest and the BB in which it resides. */
2385 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2386 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2388 /* Split basic blocks such that each forked and join unspec is at
2389 the start of its basic block. Thus afterwards each block will
2390 have a single partitioning mode. We also do the same for return
2391 insns, as they are executed by every thread. Return the
2392 partitioning mode of the function as a whole. Populate MAP with
2393 head and tail blocks. We also clear the BB visited flag, which is
2394 used when finding partitions. */
2396 static void
2397 nvptx_split_blocks (bb_insn_map_t *map)
2399 insn_bb_vec_t worklist;
2400 basic_block block;
2401 rtx_insn *insn;
2403 /* Locate all the reorg instructions of interest. */
2404 FOR_ALL_BB_FN (block, cfun)
2406 bool seen_insn = false;
2408 /* Clear visited flag, for use by the parallel locator. */
2409 block->flags &= ~BB_VISITED;
2411 FOR_BB_INSNS (block, insn)
2413 if (!INSN_P (insn))
2414 continue;
2415 switch (recog_memoized (insn))
2417 default:
2418 seen_insn = true;
2419 continue;
2420 case CODE_FOR_nvptx_forked:
2421 case CODE_FOR_nvptx_join:
2422 break;
2424 case CODE_FOR_return:
2425 /* We also need to split just before return insns, as
2426 that insn needs executing by all threads, but the
2427 block it is in probably does not. */
2428 break;
2431 if (seen_insn)
2432 /* We've found an instruction that must be at the start of
2433 a block, but isn't. Add it to the worklist. */
2434 worklist.safe_push (insn_bb_t (insn, block));
2435 else
2436 /* It was already the first instruction. Just add it to
2437 the map. */
2438 map->get_or_insert (block) = insn;
2439 seen_insn = true;
2443 /* Split blocks on the worklist. */
2444 unsigned ix;
2445 insn_bb_t *elt;
2446 basic_block remap = 0;
2447 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2449 if (remap != elt->second)
2451 block = elt->second;
2452 remap = block;
2455 /* Split block before insn. The insn is in the new block. */
2456 edge e = split_block (block, PREV_INSN (elt->first));
2458 block = e->dest;
2459 map->get_or_insert (block) = elt->first;
2463 /* BLOCK is a basic block containing a head or tail instruction.
2464 Locate the associated prehead or pretail instruction, which must be
2465 in the single predecessor block. */
2467 static rtx_insn *
2468 nvptx_discover_pre (basic_block block, int expected)
2470 gcc_assert (block->preds->length () == 1);
2471 basic_block pre_block = (*block->preds)[0]->src;
2472 rtx_insn *pre_insn;
2474 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2475 pre_insn = PREV_INSN (pre_insn))
2476 gcc_assert (pre_insn != BB_HEAD (pre_block));
2478 gcc_assert (recog_memoized (pre_insn) == expected);
2479 return pre_insn;
2482 /* Dump this parallel and all its inner parallels. */
2484 static void
2485 nvptx_dump_pars (parallel *par, unsigned depth)
2487 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2488 depth, par->mask,
2489 par->forked_block ? par->forked_block->index : -1,
2490 par->join_block ? par->join_block->index : -1);
2492 fprintf (dump_file, " blocks:");
2494 basic_block block;
2495 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2496 fprintf (dump_file, " %d", block->index);
2497 fprintf (dump_file, "\n");
2498 if (par->inner)
2499 nvptx_dump_pars (par->inner, depth + 1);
2501 if (par->next)
2502 nvptx_dump_pars (par->next, depth);
2505 /* If BLOCK contains a fork/join marker, process it to create or
2506 terminate a loop structure. Add this block to the current loop,
2507 and then walk successor blocks. */
2509 static parallel *
2510 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2512 if (block->flags & BB_VISITED)
2513 return par;
2514 block->flags |= BB_VISITED;
2516 if (rtx_insn **endp = map->get (block))
2518 rtx_insn *end = *endp;
2520 /* This is a block head or tail, or return instruction. */
2521 switch (recog_memoized (end))
2523 case CODE_FOR_return:
2524 /* Return instructions are in their own block, and we
2525 don't need to do anything more. */
2526 return par;
2528 case CODE_FOR_nvptx_forked:
2529 /* Loop head, create a new inner loop and add it into
2530 our parent's child list. */
2532 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2534 gcc_assert (mask);
2535 par = new parallel (par, mask);
2536 par->forked_block = block;
2537 par->forked_insn = end;
2538 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2539 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2540 par->fork_insn
2541 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2543 break;
2545 case CODE_FOR_nvptx_join:
2546 /* A loop tail. Finish the current loop and return to
2547 parent. */
2549 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2551 gcc_assert (par->mask == mask);
2552 par->join_block = block;
2553 par->join_insn = end;
2554 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2555 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2556 par->joining_insn
2557 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2558 par = par->parent;
2560 break;
2562 default:
2563 gcc_unreachable ();
2567 if (par)
2568 /* Add this block onto the current loop's list of blocks. */
2569 par->blocks.safe_push (block);
2570 else
2571 /* This must be the entry block. Create a NULL parallel. */
2572 par = new parallel (0, 0);
2574 /* Walk successor blocks. */
2575 edge e;
2576 edge_iterator ei;
2578 FOR_EACH_EDGE (e, ei, block->succs)
2579 nvptx_find_par (map, par, e->dest);
2581 return par;
2584 /* DFS walk the CFG looking for fork & join markers. Construct
2585 loop structures as we go. MAP is a mapping of basic blocks
2586 to head & tail markers, discovered when splitting blocks. This
2587 speeds up the discovery. We rely on the BB visited flag having
2588 been cleared when splitting blocks. */
2590 static parallel *
2591 nvptx_discover_pars (bb_insn_map_t *map)
2593 basic_block block;
2595 /* Mark exit blocks as visited. */
2596 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2597 block->flags |= BB_VISITED;
2599 /* And entry block as not. */
2600 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2601 block->flags &= ~BB_VISITED;
2603 parallel *par = nvptx_find_par (map, 0, block);
2605 if (dump_file)
2607 fprintf (dump_file, "\nLoops\n");
2608 nvptx_dump_pars (par, 0);
2609 fprintf (dump_file, "\n");
2612 return par;
2615 /* Analyse a group of BBs within a partitioned region and create N
2616 Single-Entry-Single-Exit regions. Some of those regions will be
2617 trivial ones consisting of a single BB. The blocks of a
2618 partitioned region might form a set of disjoint graphs -- because
2619 the region encloses a differently partitioned sub-region.
2621 We use the linear time algorithm described in 'Finding Regions Fast:
2622 Single Entry Single Exit and Control Regions in Linear Time'
2623 Johnson, Pearson & Pingali. That algorithm deals with complete
2624 CFGs, where a back edge is inserted from END to START, and thus the
2625 problem becomes one of finding equivalent loops.
2627 In this case we have a partial CFG. We complete it by redirecting
2628 any incoming edge to the graph to be from an arbitrary external BB,
2629 and similarly redirecting any outgoing edge to be to that BB.
2630 Thus we end up with a closed graph.
2632 The algorithm works by building a spanning tree of an undirected
2633 graph and keeping track of back edges from nodes further from the
2634 root in the tree to nodes nearer to the root in the tree. In the
2635 description below, the root is up and the tree grows downwards.
2637 We avoid having to deal with degenerate back-edges to the same
2638 block, by splitting each BB into 3 -- one for input edges, one for
2639 the node itself and one for the output edges. Such back edges are
2640 referred to as 'Brackets'. Cycle equivalent nodes will have the
2641 same set of brackets.
2643 Determining bracket equivalency is done by maintaining a list of
2644 brackets in such a manner that the list length and final bracket
2645 uniquely identify the set.
2647 We use coloring to mark all BBs with cycle equivalency with the
2648 same color. This is the output of the 'Finding Regions Fast'
2649 algorithm. Notice it doesn't actually find the set of nodes within
2650 a particular region, just unordered sets of nodes that are the
2651 entries and exits of SESE regions.
2653 After determining cycle equivalency, we need to find the minimal
2654 set of SESE regions. Do this with a DFS coloring walk of the
2655 complete graph. We're either 'looking' or 'coloring'. When
2656 looking, and we're in the subgraph, we start coloring with the color
2657 of the current node, and remember that node as the start of the
2658 current color's SESE region. Every time we go to a new node, we
2659 decrement the count of nodes with that color. If it reaches zero,
2660 we remember that node as the end of the current color's SESE region
2661 and return to 'looking'. Otherwise we color the node the current
2662 color.
2664 This way we end up with coloring the inside of non-trivial SESE
2665 regions with the color of that region. */
2667 /* A pair of BBs. We use this to represent SESE regions. */
2668 typedef std::pair<basic_block, basic_block> bb_pair_t;
2669 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2671 /* A node in the undirected CFG. The discriminator SECOND indicates just
2672 above or just below the BB indicated by FIRST. */
2673 typedef std::pair<basic_block, int> pseudo_node_t;
2675 /* A bracket indicates an edge towards the root of the spanning tree of the
2676 undirected graph. Each bracket has a color, determined
2677 from the current set of brackets. */
2678 struct bracket
2680 pseudo_node_t back; /* Back target. */
2682 /* Current color and size of set. */
2683 unsigned color;
2684 unsigned size;
2686 bracket (pseudo_node_t back_)
2687 : back (back_), color (~0u), size (~0u)
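/* Get the color of this bracket, for a bracket list of LENGTH
   brackets. A fresh color is allocated whenever LENGTH differs from
   the length recorded at the previous assignment, as the pair (list
   length, final bracket) uniquely identifies a bracket set. */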
2691 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2693 if (length != size)
2695 size = length;
2696 color = color_counts.length ();
2697 color_counts.quick_push (0);
2699 color_counts[color]++;
2700 return color;
2704 typedef auto_vec<bracket> bracket_vec_t;
2706 /* Basic block info for finding SESE regions. */
2708 struct bb_sese
2710 int node; /* Node number in spanning tree. */
2711 int parent; /* Parent node number. */
2713 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2714 edges arrive at pseudo-node Ai and the outgoing edges leave at
2715 pseudo-node Ao. We have to remember which way we arrived at a
2716 particular node when generating the spanning tree. dir > 0 means
2717 we arrived at Ai, dir < 0 means we arrived at Ao. */
2718 int dir;
2720 /* Lowest numbered pseudo-node reached via a backedge from this
2721 node, or any descendant. */
2722 pseudo_node_t high;
2724 int color; /* Cycle-equivalence color. */
2726 /* Stack of brackets for this node. */
2727 bracket_vec_t brackets;
2729 bb_sese (unsigned node_, unsigned p, int dir_)
2730 :node (node_), parent (p), dir (dir_)
2733 ~bb_sese ();
2735 /* Push a bracket ending at BACK. */
2736 void push (const pseudo_node_t &back)
2738 if (dump_file)
2739 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2740 back.first ? back.first->index : 0, back.second);
2741 brackets.safe_push (bracket (back));
2744 void append (bb_sese *child);
2745 void remove (const pseudo_node_t &);
2747 /* Set node's color. */
2748 void set_color (auto_vec<unsigned> &color_counts)
2750 color = brackets.last ().get_color (color_counts, brackets.length ());
2754 bb_sese::~bb_sese ()
2758 /* Destructively append CHILD's brackets. */
2760 void
2761 bb_sese::append (bb_sese *child)
2763 if (int len = child->brackets.length ())
2765 int ix;
2767 if (dump_file)
2769 for (ix = 0; ix < len; ix++)
2771 const pseudo_node_t &pseudo = child->brackets[ix].back;
2772 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2773 child->node, pseudo.first ? pseudo.first->index : 0,
2774 pseudo.second);
2777 if (!brackets.length ())
2778 std::swap (brackets, child->brackets);
2779 else
2781 brackets.reserve (len);
2782 for (ix = 0; ix < len; ix++)
2783 brackets.quick_push (child->brackets[ix]);
2788 /* Remove brackets that terminate at PSEUDO. */
2790 void
2791 bb_sese::remove (const pseudo_node_t &pseudo)
2793 unsigned removed = 0;
2794 int len = brackets.length ();
2796 for (int ix = 0; ix < len; ix++)
2798 if (brackets[ix].back == pseudo)
2800 if (dump_file)
2801 fprintf (dump_file, "Removing backedge %d:%+d\n",
2802 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2803 removed++;
2805 else if (removed)
2806 brackets[ix-removed] = brackets[ix];
2808 while (removed--)
2809 brackets.pop ();
2812 /* Accessors for BB's aux pointer. */
2813 #define BB_SET_SESE(B, S) ((B)->aux = (S))
2814 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2816 /* DFS walk creating SESE data structures. Only cover nodes with
2817 BB_VISITED set. Append discovered blocks to LIST. We number in
2818 increments of 3 so that the above and below pseudo nodes can be
2819 implicitly numbered too. */
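/* A sketch of the numbering scheme: the first block visited is
   numbered 2, making its 'above' and 'below' pseudo-nodes implicitly
   1 and 3; the next block is numbered 5 (pseudo-nodes 4 and 6), and
   so on. */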
2821 static int
2822 nvptx_sese_number (int n, int p, int dir, basic_block b,
2823 auto_vec<basic_block> *list)
2825 if (BB_GET_SESE (b))
2826 return n;
2828 if (dump_file)
2829 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2830 b->index, n, p, dir);
2832 BB_SET_SESE (b, new bb_sese (n, p, dir));
2833 p = n;
2835 n += 3;
2836 list->quick_push (b);
2838 /* First walk the nodes on the 'other side' of this node, then walk
2839 the nodes on the same side. */
2840 for (unsigned ix = 2; ix; ix--)
2842 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2843 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2844 : offsetof (edge_def, src));
2845 edge e;
2846 edge_iterator ei;
2848 FOR_EACH_EDGE (e, ei, edges)
2850 basic_block target = *(basic_block *)((char *)e + offset);
2852 if (target->flags & BB_VISITED)
2853 n = nvptx_sese_number (n, p, dir, target, list);
2855 dir = -dir;
2857 return n;
2860 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2861 EDGES are the outgoing edges and OFFSET is the offset to the src
2862 or dst block on the edges. */
2864 static void
2865 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2866 vec<edge, va_gc> *edges, size_t offset)
2868 edge e;
2869 edge_iterator ei;
2870 int hi_back = depth;
2871 pseudo_node_t node_back (0, depth);
2872 int hi_child = depth;
2873 pseudo_node_t node_child (0, depth);
2874 basic_block child = NULL;
2875 unsigned num_children = 0;
2876 int usd = -dir * sese->dir;
2878 if (dump_file)
2879 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2880 me->index, sese->node, dir);
2882 if (dir < 0)
2884 /* This is the above pseudo-child. It has the BB itself as an
2885 additional child node. */
2886 node_child = sese->high;
2887 hi_child = node_child.second;
2888 if (node_child.first)
2889 hi_child += BB_GET_SESE (node_child.first)->node;
2890 num_children++;
2893 /* Examine each edge.
2894 - if it is a child (a) append its bracket list and (b) record
2895 whether it is the child with the highest reaching bracket.
2896 - if it is an edge to an ancestor, record whether it's the highest
2897 reaching backlink. */
2898 FOR_EACH_EDGE (e, ei, edges)
2900 basic_block target = *(basic_block *)((char *)e + offset);
2902 if (bb_sese *t_sese = BB_GET_SESE (target))
2904 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2906 /* Child node. Append its bracket list. */
2907 num_children++;
2908 sese->append (t_sese);
2910 /* Compare its hi value. */
2911 int t_hi = t_sese->high.second;
2913 if (basic_block child_hi_block = t_sese->high.first)
2914 t_hi += BB_GET_SESE (child_hi_block)->node;
2916 if (hi_child > t_hi)
2918 hi_child = t_hi;
2919 node_child = t_sese->high;
2920 child = target;
2923 else if (t_sese->node < sese->node + dir
2924 && !(dir < 0 && sese->parent == t_sese->node))
2926 /* Non-parental ancestor node -- a backlink. */
2927 int d = usd * t_sese->dir;
2928 int back = t_sese->node + d;
2930 if (hi_back > back)
2932 hi_back = back;
2933 node_back = pseudo_node_t (target, d);
2937 else
2938 { /* Fallen off graph, backlink to entry node. */
2939 hi_back = 0;
2940 node_back = pseudo_node_t (0, 0);
2944 /* Remove any brackets that terminate at this pseudo node. */
2945 sese->remove (pseudo_node_t (me, dir));
2947 /* Now push any backlinks from this pseudo node. */
2948 FOR_EACH_EDGE (e, ei, edges)
2950 basic_block target = *(basic_block *)((char *)e + offset);
2951 if (bb_sese *t_sese = BB_GET_SESE (target))
2953 if (t_sese->node < sese->node + dir
2954 && !(dir < 0 && sese->parent == t_sese->node))
2955 /* Non-parental ancestor node - backedge from me. */
2956 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2958 else
2960 /* Back edge to entry node. */
2961 sese->push (pseudo_node_t (0, 0));
2965 /* If this node leads directly or indirectly to a no-return region of
2966 the graph, then fake a backedge to entry node. */
2967 if (!sese->brackets.length () || !edges || !edges->length ())
2969 hi_back = 0;
2970 node_back = pseudo_node_t (0, 0);
2971 sese->push (node_back);
2974 /* Record the highest reaching backedge from us or a descendant. */
2975 sese->high = hi_back < hi_child ? node_back : node_child;
2977 if (num_children > 1)
2979 /* There is more than one child -- this is a Y shaped piece of
2980 spanning tree. We have to insert a fake backedge from this
2981 node to the highest ancestor reached by not-the-highest
2982 reaching child. Note that there may be multiple children
2983 with backedges to the same highest node. That's ok and we
2984 insert the edge to that highest node. */
2985 hi_child = depth;
2986 if (dir < 0 && child)
2988 node_child = sese->high;
2989 hi_child = node_child.second;
2990 if (node_child.first)
2991 hi_child += BB_GET_SESE (node_child.first)->node;
2994 FOR_EACH_EDGE (e, ei, edges)
2996 basic_block target = *(basic_block *)((char *)e + offset);
2998 if (target == child)
2999 /* Ignore the highest child. */
3000 continue;
3002 bb_sese *t_sese = BB_GET_SESE (target);
3003 if (!t_sese)
3004 continue;
3005 if (t_sese->parent != sese->node)
3006 /* Not a child. */
3007 continue;
3009 /* Compare its hi value. */
3010 int t_hi = t_sese->high.second;
3012 if (basic_block child_hi_block = t_sese->high.first)
3013 t_hi += BB_GET_SESE (child_hi_block)->node;
3015 if (hi_child > t_hi)
3017 hi_child = t_hi;
3018 node_child = t_sese->high;
3022 sese->push (node_child);
3027 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3028 proceed to successors. Set SESE entry and exit nodes of
3029 REGIONS. */
3031 static void
3032 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3033 basic_block block, int coloring)
3035 bb_sese *sese = BB_GET_SESE (block);
3037 if (block->flags & BB_VISITED)
3039 /* If we've already encountered this block, either we must not
3040 be coloring, or it must have been colored the current color. */
3041 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3042 return;
3045 block->flags |= BB_VISITED;
3047 if (sese)
3049 if (coloring < 0)
3051 /* Start coloring a region. */
3052 regions[sese->color].first = block;
3053 coloring = sese->color;
3056 if (!--color_counts[sese->color] && sese->color == coloring)
3058 /* Found final block of SESE region. */
3059 regions[sese->color].second = block;
3060 coloring = -1;
3062 else
3063 /* Color the node, so we can assert on revisiting the node
3064 that the graph is indeed SESE. */
3065 sese->color = coloring;
3067 else
3068 /* Fallen off the subgraph, we cannot be coloring. */
3069 gcc_assert (coloring < 0);
3071 /* Walk each successor block. */
3072 if (block->succs && block->succs->length ())
3074 edge e;
3075 edge_iterator ei;
3077 FOR_EACH_EDGE (e, ei, block->succs)
3078 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3080 else
3081 gcc_assert (coloring < 0);
3084 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3085 end up with NULL entries in it. */
3087 static void
3088 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3090 basic_block block;
3091 int ix;
3093 /* First clear each BB of the whole function. */
3094 FOR_ALL_BB_FN (block, cfun)
3096 block->flags &= ~BB_VISITED;
3097 BB_SET_SESE (block, 0);
3100 /* Mark blocks in the function that are in this graph. */
3101 for (ix = 0; blocks.iterate (ix, &block); ix++)
3102 block->flags |= BB_VISITED;
3104 /* Counts of nodes assigned to each color. There cannot be more
3105 colors than blocks (and hopefully there will be fewer). */
3106 auto_vec<unsigned> color_counts;
3107 color_counts.reserve (blocks.length ());
3109 /* Worklist of nodes in the spanning tree. Again, there cannot be
3110 more nodes in the tree than blocks (there will be fewer if the
3111 CFG of blocks is disjoint). */
3112 auto_vec<basic_block> spanlist;
3113 spanlist.reserve (blocks.length ());
3115 /* Make sure every block has its cycle class determined. */
3116 for (ix = 0; blocks.iterate (ix, &block); ix++)
3118 if (BB_GET_SESE (block))
3119 /* We already met this block in an earlier graph solve. */
3120 continue;
3122 if (dump_file)
3123 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3125 /* Number the nodes reachable from BLOCK in initial DFS order. */
3126 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3128 /* Now walk in reverse DFS order to find cycle equivalents. */
3129 while (spanlist.length ())
3131 block = spanlist.pop ();
3132 bb_sese *sese = BB_GET_SESE (block);
3134 /* Do the pseudo node below. */
3135 nvptx_sese_pseudo (block, sese, depth, +1,
3136 sese->dir > 0 ? block->succs : block->preds,
3137 (sese->dir > 0 ? offsetof (edge_def, dest)
3138 : offsetof (edge_def, src)));
3139 sese->set_color (color_counts);
3140 /* Do the pseudo node above. */
3141 nvptx_sese_pseudo (block, sese, depth, -1,
3142 sese->dir < 0 ? block->succs : block->preds,
3143 (sese->dir < 0 ? offsetof (edge_def, dest)
3144 : offsetof (edge_def, src)));
3146 if (dump_file)
3147 fprintf (dump_file, "\n");
3150 if (dump_file)
3152 unsigned count;
3153 const char *comma = "";
3155 fprintf (dump_file, "Found %d cycle equivalents\n",
3156 color_counts.length ());
3157 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3159 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3161 comma = "";
3162 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3163 if (BB_GET_SESE (block)->color == ix)
3165 block->flags |= BB_VISITED;
3166 fprintf (dump_file, "%s%d", comma, block->index);
3167 comma = ",";
3169 fprintf (dump_file, "}");
3170 comma = ", ";
3172 fprintf (dump_file, "\n");
3175 /* Now we've colored every block in the subgraph. We now need to
3176 determine the minimal set of SESE regions that cover that
3177 subgraph. Do this with a DFS walk of the complete function.
3178 During the walk we're either 'looking' or 'coloring'. When we
3179 reach the last node of a particular color, we stop coloring and
3180 return to looking. */
3182 /* There cannot be more SESE regions than colors. */
3183 regions.reserve (color_counts.length ());
3184 for (ix = color_counts.length (); ix--;)
3185 regions.quick_push (bb_pair_t (0, 0));
3187 for (ix = 0; blocks.iterate (ix, &block); ix++)
3188 block->flags &= ~BB_VISITED;
3190 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3192 if (dump_file)
3194 const char *comma = "";
3195 int len = regions.length ();
3197 fprintf (dump_file, "SESE regions:");
3198 for (ix = 0; ix != len; ix++)
3200 basic_block from = regions[ix].first;
3201 basic_block to = regions[ix].second;
3203 if (from)
3205 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3206 if (to != from)
3207 fprintf (dump_file, "->%d", to->index);
3209 int color = BB_GET_SESE (from)->color;
3211 /* Print the blocks within the region (excluding ends). */
3212 FOR_EACH_BB_FN (block, cfun)
3214 bb_sese *sese = BB_GET_SESE (block);
3216 if (sese && sese->color == color
3217 && block != from && block != to)
3218 fprintf (dump_file, ".%d", block->index);
3220 fprintf (dump_file, "}");
3222 comma = ",";
3224 fprintf (dump_file, "\n\n");
3227 for (ix = 0; blocks.iterate (ix, &block); ix++)
3228 delete BB_GET_SESE (block);
3231 #undef BB_SET_SESE
3232 #undef BB_GET_SESE
3234 /* Propagate live state at the start of a partitioned region. BLOCK
3235 provides the live register information, and might not contain
3236 INSN. Propagation is inserted just after INSN. RW indicates whether
3237 we are reading and/or writing state. This separation is needed
3238 for worker-level propagation, where we essentially do a spill &
3239 fill. FN is the underlying worker function to generate the
3240 propagation instructions for a single register. DATA is user
3241 data.
3243 We propagate the live register set and the entire frame. We could
3244 do better by (a) propagating just the live set that is used within
3245 the partitioned regions and (b) only propagating stack entries that
3246 are used. The latter might be quite hard to determine. */
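/* A sketch of the copy loop built below, for a frame of FS DImode
   words (illustrative, not literal RTL):

	ptr = frame_pointer;
	idx = FS;
     loop:
	idx -= 1;
	tmp = [ptr];		// if RW includes PM_read
	fn (tmp, RW, ...);	// spill, fill or broadcast TMP
	[ptr] = tmp;		// if RW includes PM_write
	pred = idx != 0;
	ptr += 8;		// GET_MODE_SIZE (DImode)
	if (pred) goto loop;  */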
3248 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3250 static void
3251 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3252 propagator_fn fn, void *data)
3254 bitmap live = DF_LIVE_IN (block);
3255 bitmap_iterator iterator;
3256 unsigned ix;
3258 /* Copy the frame array. */
3259 HOST_WIDE_INT fs = get_frame_size ();
3260 if (fs)
3262 rtx tmp = gen_reg_rtx (DImode);
3263 rtx idx = NULL_RTX;
3264 rtx ptr = gen_reg_rtx (Pmode);
3265 rtx pred = NULL_RTX;
3266 rtx_code_label *label = NULL;
3268 /* The frame size might not be DImode compatible, but the frame
3269 array's declaration will be. So it's ok to round up here. */
3270 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3271 /* Detect single iteration loop. */
3272 if (fs == 1)
3273 fs = 0;
3275 start_sequence ();
3276 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3277 if (fs)
3279 idx = gen_reg_rtx (SImode);
3280 pred = gen_reg_rtx (BImode);
3281 label = gen_label_rtx ();
3283 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3284 /* Allow worker function to initialize anything needed. */
3285 rtx init = fn (tmp, PM_loop_begin, fs, data);
3286 if (init)
3287 emit_insn (init);
3288 emit_label (label);
3289 LABEL_NUSES (label)++;
3290 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3292 if (rw & PM_read)
3293 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3294 emit_insn (fn (tmp, rw, fs, data));
3295 if (rw & PM_write)
3296 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3297 if (fs)
3299 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3300 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3301 emit_insn (gen_br_true_uni (pred, label));
3302 rtx fini = fn (tmp, PM_loop_end, fs, data);
3303 if (fini)
3304 emit_insn (fini);
3305 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3307 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3308 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3309 rtx cpy = get_insns ();
3310 end_sequence ();
3311 insn = emit_insn_after (cpy, insn);
3314 /* Copy live registers. */
3315 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3317 rtx reg = regno_reg_rtx[ix];
3319 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3321 rtx bcast = fn (reg, rw, 0, data);
3323 insn = emit_insn_after (bcast, insn);
3328 /* Worker for nvptx_vpropagate. */
3330 static rtx
3331 vprop_gen (rtx reg, propagate_mask pm,
3332 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3334 if (!(pm & PM_read_write))
3335 return 0;
3337 return nvptx_gen_vcast (reg);
3340 /* Propagate state that is live at start of BLOCK across the vectors
3341 of a single warp. Propagation is inserted just after INSN. */
3343 static void
3344 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3346 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3349 /* Worker for nvptx_wpropagate. */
3351 static rtx
3352 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3354 wcast_data_t *data = (wcast_data_t *)data_;
3356 if (pm & PM_loop_begin)
3358 /* Starting a loop, initialize pointer. */
3359 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3361 if (align > worker_bcast_align)
3362 worker_bcast_align = align;
3363 data->offset = (data->offset + align - 1) & ~(align - 1);
3365 data->ptr = gen_reg_rtx (Pmode);
3367 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3369 else if (pm & PM_loop_end)
3371 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3372 data->ptr = NULL_RTX;
3373 return clobber;
3375 else
3376 return nvptx_gen_wcast (reg, pm, rep, data);
3379 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3380 indicates if this is just before partitioned mode (do spill), or
3381 just after it starts (do fill). Sequence is inserted just after
3382 INSN. */
3384 static void
3385 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3387 wcast_data_t data;
3389 data.base = gen_reg_rtx (Pmode);
3390 data.offset = 0;
3391 data.ptr = NULL_RTX;
3393 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3394 if (data.offset)
3396 /* Stuff was emitted, initialize the base pointer now. */
3397 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3398 emit_insn_after (init, insn);
3400 if (worker_bcast_size < data.offset)
3401 worker_bcast_size = data.offset;
3405 /* Emit a worker-level synchronization barrier. We use different
3406 markers for before and after synchronizations. */
3408 static rtx
3409 nvptx_wsync (bool after)
3411 return gen_nvptx_barsync (GEN_INT (after));
3414 /* Single neutering according to MASK. FROM is the incoming block and
3415 TO is the outgoing block. These may be the same block. Insert at
3416 start of FROM:
3418 if (tid.<axis>) goto end.
3420 and insert before ending branch of TO (if there is such an insn):
3422 end:
3423 <possibly-broadcast-cond>
3424 <branch>
3426 We currently only use different FROM and TO when skipping an entire
3427 loop. We could do more if we detected superblocks. */
3429 static void
3430 nvptx_single (unsigned mask, basic_block from, basic_block to)
3432 rtx_insn *head = BB_HEAD (from);
3433 rtx_insn *tail = BB_END (to);
3434 unsigned skip_mask = mask;
3436 /* Find first insn of FROM block. */
3437 while (head != BB_END (from) && !INSN_P (head))
3438 head = NEXT_INSN (head);
3440 /* Find last insn of TO block. */
3441 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3442 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3443 tail = PREV_INSN (tail);
3445 /* Detect if tail is a branch. */
3446 rtx tail_branch = NULL_RTX;
3447 rtx cond_branch = NULL_RTX;
3448 if (tail && INSN_P (tail))
3450 tail_branch = PATTERN (tail);
3451 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3452 tail_branch = NULL_RTX;
3453 else
3455 cond_branch = SET_SRC (tail_branch);
3456 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3457 cond_branch = NULL_RTX;
3461 if (tail == head)
3463 /* If this is empty, do nothing. */
3464 if (!head || !INSN_P (head))
3465 return;
3467 /* If this is a dummy insn, do nothing. */
3468 switch (recog_memoized (head))
3470 default:
3471 break;
3472 case CODE_FOR_nvptx_fork:
3473 case CODE_FOR_nvptx_forked:
3474 case CODE_FOR_nvptx_joining:
3475 case CODE_FOR_nvptx_join:
3476 return;
3479 if (cond_branch)
3481 /* If we're only doing vector single, there's no need to
3482 emit skip code because we'll not insert anything. */
3483 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3484 skip_mask = 0;
3486 else if (tail_branch)
3487 /* Block with only unconditional branch. Nothing to do. */
3488 return;
3491 /* Insert the vector test inside the worker test. */
3492 unsigned mode;
3493 rtx_insn *before = tail;
3494 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3495 if (GOMP_DIM_MASK (mode) & skip_mask)
3497 rtx_code_label *label = gen_label_rtx ();
3498 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3500 if (!pred)
3502 pred = gen_reg_rtx (BImode);
3503 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3506 rtx br;
3507 if (mode == GOMP_DIM_VECTOR)
3508 br = gen_br_true (pred, label);
3509 else
3510 br = gen_br_true_uni (pred, label);
3511 emit_insn_before (br, head);
3513 LABEL_NUSES (label)++;
3514 if (tail_branch)
3515 before = emit_label_before (label, before);
3516 else
3517 emit_label_after (label, tail);
3520 /* Now deal with propagating the branch condition. */
3521 if (cond_branch)
3523 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3525 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3527 /* Vector mode only, do a shuffle. */
3528 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3530 else
3532 /* Includes worker mode, do spill & fill. By construction
3533 we should never have worker mode only. */
3534 wcast_data_t data;
3536 data.base = worker_bcast_sym;
3537 data.ptr = 0;
3539 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3540 worker_bcast_size = GET_MODE_SIZE (SImode);
3542 data.offset = 0;
3543 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3544 before);
3545 /* Barrier so other workers can see the write. */
3546 emit_insn_before (nvptx_wsync (false), tail);
3547 data.offset = 0;
3548 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3549 /* This barrier is needed to avoid worker zero clobbering
3550 the broadcast buffer before all the other workers have
3551 had a chance to read this instance of it. */
3552 emit_insn_before (nvptx_wsync (true), tail);
3555 extract_insn (tail);
3556 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3557 UNSPEC_BR_UNIFIED);
3558 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3562 /* PAR is a parallel that is being skipped in its entirety according to
3563 MASK. Treat this as skipping a superblock starting at forked
3564 and ending at joining. */
3566 static void
3567 nvptx_skip_par (unsigned mask, parallel *par)
3569 basic_block tail = par->join_block;
3570 gcc_assert (tail->preds->length () == 1);
3572 basic_block pre_tail = (*tail->preds)[0]->src;
3573 gcc_assert (pre_tail->succs->length () == 1);
3575 nvptx_single (mask, par->forked_block, pre_tail);
3578 /* If PAR has a single inner parallel and PAR itself only contains
3579 empty entry and exit blocks, swallow the inner PAR. */
3581 static void
3582 nvptx_optimize_inner (parallel *par)
3584 parallel *inner = par->inner;
3586 /* We mustn't be the outer dummy par. */
3587 if (!par->mask)
3588 return;
3590 /* We must have a single inner par. */
3591 if (!inner || inner->next)
3592 return;
3594 /* We must only contain 2 blocks ourselves -- the head and tail of
3595 the inner par. */
3596 if (par->blocks.length () != 2)
3597 return;
3599 /* The two partitionings must be disjoint. As we only have vector
3600 and worker partitioning, this is sufficient to guarantee the
3601 pars have adjacent partitioning. */
3602 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3603 /* This indicates malformed code generation. */
3604 return;
3606 /* The outer forked insn should be immediately followed by the inner
3607 fork insn. */
3608 rtx_insn *forked = par->forked_insn;
3609 rtx_insn *fork = BB_END (par->forked_block);
3611 if (NEXT_INSN (forked) != fork)
3612 return;
3613 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3615 /* The outer joining insn must immediately follow the inner join
3616 insn. */
3617 rtx_insn *joining = par->joining_insn;
3618 rtx_insn *join = inner->join_insn;
3619 if (NEXT_INSN (join) != joining)
3620 return;
3622 /* Preconditions met. Swallow the inner par. */
3623 if (dump_file)
3624 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3625 inner->mask, inner->forked_block->index,
3626 inner->join_block->index,
3627 par->mask, par->forked_block->index, par->join_block->index);
3629 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3631 par->blocks.reserve (inner->blocks.length ());
3632 while (inner->blocks.length ())
3633 par->blocks.quick_push (inner->blocks.pop ());
3635 par->inner = inner->inner;
3636 inner->inner = NULL;
3638 delete inner;
3641 /* Process the parallel PAR and all its contained
3642 parallels. We do everything but the neutering. Return mask of
3643 partitioned modes used within this parallel. */
3645 static unsigned
3646 nvptx_process_pars (parallel *par)
3648 if (nvptx_optimize)
3649 nvptx_optimize_inner (par);
3651 unsigned inner_mask = par->mask;
3653 /* Do the inner parallels first. */
3654 if (par->inner)
3656 par->inner_mask = nvptx_process_pars (par->inner);
3657 inner_mask |= par->inner_mask;
3660 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3661 /* No propagation needed for a call. */;
3662 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3664 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3665 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3666 /* Insert begin and end synchronizations. */
3667 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3668 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3670 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3671 nvptx_vpropagate (par->forked_block, par->forked_insn);
3673 /* Now do siblings. */
3674 if (par->next)
3675 inner_mask |= nvptx_process_pars (par->next);
3676 return inner_mask;
3679 /* Neuter the parallel described by PAR. We recurse in depth-first
3680 order. MODES are the partitioning of the execution and OUTER is
3681 the partitioning of the parallels we are contained in. */
3683 static void
3684 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3686 unsigned me = (par->mask
3687 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3688 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3689 unsigned skip_mask = 0, neuter_mask = 0;
3691 if (par->inner)
3692 nvptx_neuter_pars (par->inner, modes, outer | me);
3694 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3696 if ((outer | me) & GOMP_DIM_MASK (mode))
3697 {} /* Mode is partitioned: no neutering. */
3698 else if (!(modes & GOMP_DIM_MASK (mode)))
3699 {} /* Mode is not used: nothing to do. */
3700 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3701 || !par->forked_insn)
3702 /* Partitioned in inner parallels, or we're not partitioned at
3703 all: neuter individual blocks. */
3704 neuter_mask |= GOMP_DIM_MASK (mode);
3705 else if (!par->parent || !par->parent->forked_insn
3706 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3707 /* Parent isn't a parallel, or already contains this partitioning:
3708 skip the parallel at this level. */
3709 skip_mask |= GOMP_DIM_MASK (mode);
3710 else
3711 {} /* Parent will skip this parallel itself. */
3714 if (neuter_mask)
3716 int ix, len;
3718 if (nvptx_optimize)
3720 /* Neuter whole SESE regions. */
3721 bb_pair_vec_t regions;
3723 nvptx_find_sese (par->blocks, regions);
3724 len = regions.length ();
3725 for (ix = 0; ix != len; ix++)
3727 basic_block from = regions[ix].first;
3728 basic_block to = regions[ix].second;
3730 if (from)
3731 nvptx_single (neuter_mask, from, to);
3732 else
3733 gcc_assert (!to);
3736 else
3738 /* Neuter each BB individually. */
3739 len = par->blocks.length ();
3740 for (ix = 0; ix != len; ix++)
3742 basic_block block = par->blocks[ix];
3744 nvptx_single (neuter_mask, block, block);
3749 if (skip_mask)
3750 nvptx_skip_par (skip_mask, par);
3752 if (par->next)
3753 nvptx_neuter_pars (par->next, modes, outer);
3756 /* PTX-specific reorganization
3757 - Split blocks at fork and join instructions
3758 - Compute live registers
3759 - Mark now-unused registers, so function begin doesn't declare
3760 unused registers.
3761 - Insert state propagation when entering partitioned mode
3762 - Insert neutering instructions when in single mode
3763 - Replace subregs with suitable sequences. */
3766 static void
3767 nvptx_reorg (void)
3769 /* We are freeing block_for_insn in the toplev to keep compatibility
3770 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3771 compute_bb_for_insn ();
3773 thread_prologue_and_epilogue_insns ();
3775 /* Split blocks and record interesting unspecs. */
3776 bb_insn_map_t bb_insn_map;
3778 nvptx_split_blocks (&bb_insn_map);
3780 /* Compute live regs. */
3781 df_clear_flags (DF_LR_RUN_DCE);
3782 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3783 df_live_add_problem ();
3784 df_live_set_all_dirty ();
3785 df_analyze ();
3786 regstat_init_n_sets_and_refs ();
3788 if (dump_file)
3789 df_dump (dump_file);
3791 /* Mark unused regs as unused. */
3792 int max_regs = max_reg_num ();
3793 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3794 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3795 regno_reg_rtx[i] = const0_rtx;
3797 /* Determine launch dimensions of the function. If it is not an
3798 offloaded function (i.e. this is a regular compilation), the
3799 function has no neutering. */
3800 tree attr = get_oacc_fn_attrib (current_function_decl);
3801 if (attr)
3803 /* If we determined this mask before RTL expansion, we could
3804 elide emission of some levels of forks and joins. */
3805 unsigned mask = 0;
3806 tree dims = TREE_VALUE (attr);
3807 unsigned ix;
3809 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3811 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3812 tree allowed = TREE_PURPOSE (dims);
3814 if (size != 1 && !(allowed && integer_zerop (allowed)))
3815 mask |= GOMP_DIM_MASK (ix);
3817 /* If there is worker neutering, there must be vector
3818 neutering. Otherwise the hardware will fail. */
3819 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3820 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3822 /* Discover & process partitioned regions. */
3823 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3824 nvptx_process_pars (pars);
3825 nvptx_neuter_pars (pars, mask, 0);
3826 delete pars;
3829 /* Replace subregs. */
3830 nvptx_reorg_subreg ();
3832 regstat_free_n_sets_and_refs ();
3834 df_finish_pass (true);
3837 /* Handle a "kernel" attribute; arguments as in
3838 struct attribute_spec.handler. */
3840 static tree
3841 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
3842 int ARG_UNUSED (flags), bool *no_add_attrs)
3844 tree decl = *node;
3846 if (TREE_CODE (decl) != FUNCTION_DECL)
3848 error ("%qE attribute only applies to functions", name);
3849 *no_add_attrs = true;
3851 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
3853 error ("%qE attribute requires a void return type", name);
3854 *no_add_attrs = true;
3857 return NULL_TREE;
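/* For example (a hypothetical user declaration):

	void my_kernel (int *data) __attribute__((kernel));

   Anything that is not a function, or does not return void, is
   rejected above. */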
3860 /* Table of valid machine attributes. */
3861 static const struct attribute_spec nvptx_attribute_table[] =
3863 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
3864 affects_type_identity } */
3865 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
3866 { NULL, 0, 0, false, false, false, NULL, false }
3869 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
3871 static HOST_WIDE_INT
3872 nvptx_vector_alignment (const_tree type)
3874 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
3876 return MIN (align, BIGGEST_ALIGNMENT);
3879 /* Indicate that INSN cannot be duplicated. */
3881 static bool
3882 nvptx_cannot_copy_insn_p (rtx_insn *insn)
3884 switch (recog_memoized (insn))
3886 case CODE_FOR_nvptx_shufflesi:
3887 case CODE_FOR_nvptx_shufflesf:
3888 case CODE_FOR_nvptx_barsync:
3889 case CODE_FOR_nvptx_fork:
3890 case CODE_FOR_nvptx_forked:
3891 case CODE_FOR_nvptx_joining:
3892 case CODE_FOR_nvptx_join:
3893 return true;
3894 default:
3895 return false;
3899 /* Section anchors do not work. Initialization for flag_section_anchor
3900 probes the existence of the anchoring target hooks and prevents
3901 anchoring if they don't exist. However, we may be used with a
3902 host-side compiler that does support anchoring, and hence see
3903 the anchor flag set (as it's not recalculated). So provide an
3904 implementation that denies anchoring. */
3906 static bool
3907 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
3909 return false;
3912 /* Record a symbol for mkoffload to enter into the mapping table. */
3914 static void
3915 nvptx_record_offload_symbol (tree decl)
3917 switch (TREE_CODE (decl))
3919 case VAR_DECL:
3920 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
3921 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3922 break;
3924 case FUNCTION_DECL:
3926 tree attr = get_oacc_fn_attrib (decl);
3927 tree dims = TREE_VALUE (attr);
3928 unsigned ix;
3930 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
3931 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3933 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3935 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3937 gcc_assert (!TREE_PURPOSE (dims));
3938 fprintf (asm_out_file, ", %#x", size);
3941 fprintf (asm_out_file, "\n");
3943 break;
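/* For instance, a function foo offloaded with dimensions (1, 32, 32)
   yields (a sketch; name and sizes illustrative):

	//:FUNC_MAP "foo", 0x1, 0x20, 0x20  */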
3945 default:
3946 gcc_unreachable ();
3950 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
3951 at the start of a file. */
3953 static void
3954 nvptx_file_start (void)
3956 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
3957 fputs ("\t.version\t3.1\n", asm_out_file);
3958 fputs ("\t.target\tsm_30\n", asm_out_file);
3959 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
3960 fputs ("// END PREAMBLE\n", asm_out_file);
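/* On a 64-bit target the preamble thus reads:

	// BEGIN PREAMBLE
	.version	3.1
	.target	sm_30
	.address_size 64
	// END PREAMBLE  */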
3963 /* Emit a declaration for a worker-level buffer in .shared memory. */
3965 static void
3966 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
3968 const char *name = XSTR (sym, 0);
3970 write_var_marker (file, true, false, name);
3971 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
3972 align, name, size);
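/* For instance, a 16-byte buffer with 8-byte alignment comes out as
   ".shared .align 8 .u8 NAME[16];", where NAME is the symbol's name. */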
3975 /* Write out the function declarations we've collected and declare storage
3976 for the broadcast buffer. */
3978 static void
3979 nvptx_file_end (void)
3981 hash_table<tree_hasher>::iterator iter;
3982 tree decl;
3983 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
3984 nvptx_record_fndecl (decl);
3985 fputs (func_decls.str().c_str(), asm_out_file);
3987 if (worker_bcast_size)
3988 write_worker_buffer (asm_out_file, worker_bcast_sym,
3989 worker_bcast_align, worker_bcast_size);
3991 if (worker_red_size)
3992 write_worker_buffer (asm_out_file, worker_red_sym,
3993 worker_red_align, worker_red_size);
3996 /* Expander for the shuffle builtins. */
3998 static rtx
3999 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4001 if (ignore)
4002 return target;
4004 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4005 NULL_RTX, mode, EXPAND_NORMAL);
4006 if (!REG_P (src))
4007 src = copy_to_mode_reg (mode, src);
4009 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4010 NULL_RTX, SImode, EXPAND_NORMAL);
4011 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4012 NULL_RTX, SImode, EXPAND_NORMAL);
4014 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4015 idx = copy_to_mode_reg (SImode, idx);
4017 rtx pat = nvptx_gen_shuffle (target, src, idx,
4018 (nvptx_shuffle_kind) INTVAL (op));
4019 if (pat)
4020 emit_insn (pat);
4022 return target;
4025 /* Worker reduction address expander. */
4027 static rtx
4028 nvptx_expand_worker_addr (tree exp, rtx target,
4029 machine_mode ARG_UNUSED (mode), int ignore)
4031 if (ignore)
4032 return target;
4034 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4035 if (align > worker_red_align)
4036 worker_red_align = align;
4038 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4039 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4040 if (size + offset > worker_red_size)
4041 worker_red_size = size + offset;
4043 rtx addr = worker_red_sym;
4044 if (offset)
4046 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4047 addr = gen_rtx_CONST (Pmode, addr);
4050 emit_move_insn (target, addr);
4052 return target;
4055 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4056 not require taking the address of any object, other than the memory
4057 cell being operated on. */
4059 static rtx
4060 nvptx_expand_cmp_swap (tree exp, rtx target,
4061 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4063 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4065 if (!target)
4066 target = gen_reg_rtx (mode);
4068 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4069 NULL_RTX, Pmode, EXPAND_NORMAL);
4070 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4071 NULL_RTX, mode, EXPAND_NORMAL);
4072 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4073 NULL_RTX, mode, EXPAND_NORMAL);
4074 rtx pat;
4076 mem = gen_rtx_MEM (mode, mem);
4077 if (!REG_P (cmp))
4078 cmp = copy_to_mode_reg (mode, cmp);
4079 if (!REG_P (src))
4080 src = copy_to_mode_reg (mode, src);
4082 if (mode == SImode)
4083 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4084 else
4085 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4087 emit_insn (pat);
4089 return target;

/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MAX
};

static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];

/* Return the NVPTX builtin for CODE.  */

static tree
nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
  if (code >= NVPTX_BUILTIN_MAX)
    return error_mark_node;

  return nvptx_builtin_decls[code];
}

/* Set up all builtin functions for this target.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)                                        \
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]                    \
   = add_builtin_function ("__builtin_nvptx_" NAME,             \
                           build_function_type_list T,          \
                           NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
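
/* In C terms, the DEFs above amount to roughly these prototypes
   (first type in each list is the return type):

     unsigned __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);
     unsigned long long __builtin_nvptx_shufflell (unsigned long long,
                                                   unsigned, unsigned);
     void *__builtin_nvptx_worker_addr (size_t, unsigned, unsigned);
     unsigned __builtin_nvptx_cmp_swap (void *, unsigned, unsigned);
     unsigned long long __builtin_nvptx_cmp_swapll (void *,
                                                    unsigned long long,
                                                    unsigned long long);  */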

/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
                      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_worker_addr (exp, target, mode, ignore);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}

/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32
#define PTX_GANG_DEFAULT 0 /* Defer to runtime.  */

/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  If
   DECL is null, we are validating the default dimensions.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
  bool changed = false;

  /* The vector size must be 32, unless this is a SEQ routine.  */
  if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
      && dims[GOMP_DIM_VECTOR] >= 0
      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
    {
      if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
        warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
                    dims[GOMP_DIM_VECTOR]
                    ? "using vector_length (%d), ignoring %d"
                    : "using vector_length (%d), ignoring runtime setting",
                    PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      changed = true;
    }

  /* Check that the number of workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    {
      warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
                  "using num_workers (%d), ignoring %d",
                  PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      changed = true;
    }

  if (!decl)
    {
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      if (dims[GOMP_DIM_WORKER] < 0)
        dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      if (dims[GOMP_DIM_GANG] < 0)
        dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
      changed = true;
    }

  return changed;
}
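
/* For example (illustrative): an offloaded region requesting
   num_workers (64) and vector_length (16) is warned about twice and
   clamped to 32 workers and a vector length of 32, while validating
   the default dimensions (DECL null) leaves an unspecified gang
   count at PTX_GANG_DEFAULT, i.e. deferred to the runtime.  */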

/* Return maximum dimension size, or zero for unbounded.  */

static int
nvptx_dim_limit (int axis)
{
  switch (axis)
    {
    case GOMP_DIM_WORKER:
      return PTX_WORKER_LENGTH;

    case GOMP_DIM_VECTOR:
      return PTX_VECTOR_LENGTH;

    default:
      break;
    }
  return 0;
}

/* Determine whether fork & joins are needed.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
                       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}
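
/* For example (hypothetical dims): with {gang, worker, vector} of
   {32, 1, 32}, a fork/join on GOMP_DIM_WORKER is dropped because a
   single worker needs no partitioning, one on GOMP_DIM_VECTOR is
   kept, and gang-level forks are never emitted here since gangs are
   already separate at launch.  */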

/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  */

static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
                              GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}
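
/* For a 'double' at buffer offset OFF this builds, schematically,

     (double *) __builtin_nvptx_worker_addr (OFF, 8, 8)

   with the size and alignment arguments derived from DFmode.  */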

/* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
                               tree dest_var, tree var, unsigned shift,
                               gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}
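
/* For a 'float' VAR and a SHIFT of 1, the expression built above is,
   schematically,

     dest = VIEW_CONVERT_EXPR <float>
              (__builtin_nvptx_shuffle (VIEW_CONVERT_EXPR <unsigned> (var),
                                        1, SHUFFLE_DOWN));

   a complex value is handled as two such calls, one per part.  */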

/* Lazily generate the global lock var decl and return its address.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      tree type = build_qualified_type (unsigned_type_node,
                                        TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}

/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval (OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
     return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
                       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
                                        ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
                                   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}
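
/* The control flow constructed above is, schematically:

     pre_bb:  init_var = initval (OP);
     loop_bb: expect_var = PHI <init_var (pre_bb), actual_var (loop_bb)>;
              write_var = expect_var OP var;
              actual_var = cmp&swap (ptr, expect_var, write_var);
              if (actual_var == expect_var) goto post_bb; else goto loop_bb;
     post_bb: ...

   with loop_bb registered as a single-block loop of its own.  */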

/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
                       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
                                   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
                                   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
                                     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}

/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
                        tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}

/* NVPTX implementation of GOACC_REDUCTION_SETUP.  */

static void
nvptx_goacc_reduction_setup (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
        var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER)
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_INIT.  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
                                     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize the vector's non-zero lanes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
                                                     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
                                             NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
        {
          /* If there's no receiver object, propagate the incoming VAR.  */
          tree ref_to_res = gimple_call_arg (call, 1);
          if (integer_zerop (ref_to_res))
            init = var;
        }

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_FINI.  */

static void
nvptx_goacc_reduction_fini (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree ref_to_res = gimple_call_arg (call, 1);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code op
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  gimple_seq seq = NULL;
  tree r = NULL_TREE;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
         but that requires a method of emitting a unified jump at the
         gimple level.  */
      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
        {
          tree other_var = make_ssa_name (TREE_TYPE (var));
          nvptx_generate_vector_shuffle (gimple_location (call),
                                         other_var, var, shfl, &seq);

          r = make_ssa_name (TREE_TYPE (var));
          gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
                                           var, other_var), &seq);
          var = r;
        }
    }
  else
    {
      tree accum = NULL_TREE;

      if (level == GOMP_DIM_WORKER)
        {
          /* Get reduction buffer address.  */
          tree offset = gimple_call_arg (call, 5);
          tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
          tree ptr = make_ssa_name (TREE_TYPE (call));

          gimplify_assign (ptr, call, &seq);
          accum = ptr;
        }
      else if (integer_zerop (ref_to_res))
        r = var;
      else
        accum = ref_to_res;

      if (accum)
        {
          /* UPDATE the accumulator.  */
          gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
          seq = NULL;
          r = nvptx_reduction_update (gimple_location (call), &gsi,
                                      accum, var, op);
        }
    }

  if (lhs)
    gimplify_assign (lhs, r, &seq);
  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
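
/* At the vector level the loop above emits a log2 (32) == 5 step
   shuffle tree; for a sum reduction it is, schematically:

     var += shfl.down (var, 16);
     var += shfl.down (var, 8);
     var += shfl.down (var, 4);
     var += shfl.down (var, 2);
     var += shfl.down (var, 1);

   after which lane 0 of the warp holds the combined result.  */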

/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER)
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
        gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX reduction expander.  */

static void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"