1 /* Target code for NVPTX.
2 Copyright (C) 2014-2016 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include <sstream>
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "expmed.h"
33 #include "optabs.h"
34 #include "regs.h"
35 #include "emit-rtl.h"
36 #include "recog.h"
37 #include "diagnostic.h"
38 #include "alias.h"
39 #include "insn-flags.h"
40 #include "output.h"
41 #include "insn-attr.h"
42 #include "flags.h"
43 #include "dojump.h"
44 #include "explow.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "stmt.h"
48 #include "expr.h"
49 #include "tm-preds.h"
50 #include "tm-constrs.h"
51 #include "langhooks.h"
52 #include "dbxout.h"
53 #include "cfgrtl.h"
54 #include "gimple.h"
55 #include "stor-layout.h"
56 #include "builtins.h"
57 #include "omp-low.h"
58 #include "gomp-constants.h"
59 #include "dumpfile.h"
60 #include "internal-fn.h"
61 #include "gimple-iterator.h"
62 #include "stringpool.h"
63 #include "tree-ssa-operands.h"
64 #include "tree-ssanames.h"
65 #include "gimplify.h"
66 #include "tree-phinodes.h"
67 #include "cfgloop.h"
68 #include "fold-const.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* The kind of shuffle instruction. */
74 enum nvptx_shuffle_kind
76 SHUFFLE_UP,
77 SHUFFLE_DOWN,
78 SHUFFLE_BFLY,
79 SHUFFLE_IDX,
80 SHUFFLE_MAX
83 /* The various PTX memory areas an object might reside in. */
84 enum nvptx_data_area
86 DATA_AREA_GENERIC,
87 DATA_AREA_GLOBAL,
88 DATA_AREA_SHARED,
89 DATA_AREA_LOCAL,
90 DATA_AREA_CONST,
91 DATA_AREA_PARAM,
92 DATA_AREA_MAX
95 /* We record the data area in the target symbol flags. */
96 #define SYMBOL_DATA_AREA(SYM) \
97 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
98 & 7)
99 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
100 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
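/* Three bits (the mask of 7) are reserved above SYMBOL_FLAG_MACH_DEP_SHIFT,
   which is enough to encode every nvptx_data_area value above
   (DATA_AREA_MAX is 6).  */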
102 /* Record the function decls we've written, and the libfuncs and function
103 decls corresponding to them. */
104 static std::stringstream func_decls;
106 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
108 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
109 static bool equal (rtx a, rtx b) { return a == b; }
112 static GTY((cache))
113 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
115 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
117 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
118 static bool equal (tree a, tree b) { return a == b; }
121 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
122 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
124 /* Buffer needed to broadcast across workers. This is used for both
125 worker-neutering and worker broadcasting. It is shared by all
126 functions emitted. The buffer is placed in shared memory. It'd be
127 nice if PTX supported common blocks, because then this could be
128 shared across TUs (taking the largest size). */
129 static unsigned worker_bcast_size;
130 static unsigned worker_bcast_align;
131 static GTY(()) rtx worker_bcast_sym;
133 /* Buffer needed for worker reductions. This has to be distinct from
134 the worker broadcast array, as both may be live concurrently. */
135 static unsigned worker_red_size;
136 static unsigned worker_red_align;
137 static GTY(()) rtx worker_red_sym;
139 /* Global lock variable, needed for 128bit worker & gang reductions. */
140 static GTY(()) tree global_lock_var;
142 /* Allocate a new, cleared machine_function structure. */
144 static struct machine_function *
145 nvptx_init_machine_status (void)
147 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
148 p->return_mode = VOIDmode;
149 return p;
152 /* Implement TARGET_OPTION_OVERRIDE. */
154 static void
155 nvptx_option_override (void)
157 init_machine_status = nvptx_init_machine_status;
158 /* Gives us a predictable order, which we need especially for variables. */
159 flag_toplevel_reorder = 1;
160 /* Assumes that it will see only hard registers. */
161 flag_var_tracking = 0;
163 if (nvptx_optimize < 0)
164 nvptx_optimize = optimize > 0;
166 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
167 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
168 declared_libfuncs_htab
169 = hash_table<declared_libfunc_hasher>::create_ggc (17);
171 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
172 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
173 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
175 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
176 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
177 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
180 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
181 deal with ptx idiosyncrasies. */
183 const char *
184 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
186 switch (mode)
188 case BLKmode:
189 return ".b8";
190 case BImode:
191 return ".pred";
192 case QImode:
193 if (promote)
194 return ".u32";
195 else
196 return ".u8";
197 case HImode:
198 return ".u16";
199 case SImode:
200 return ".u32";
201 case DImode:
202 return ".u64";
204 case SFmode:
205 return ".f32";
206 case DFmode:
207 return ".f64";
209 default:
210 gcc_unreachable ();
214 /* Encode the PTX data area that DECL (which might not actually be a
215 _DECL) should reside in. */
217 static void
218 nvptx_encode_section_info (tree decl, rtx rtl, int first)
220 default_encode_section_info (decl, rtl, first);
221 if (first && MEM_P (rtl))
223 nvptx_data_area area = DATA_AREA_GENERIC;
225 if (TREE_CONSTANT (decl))
226 area = DATA_AREA_CONST;
227 else if (TREE_CODE (decl) == VAR_DECL)
228 /* TODO: This would be a good place to check for a .shared or
229 other section name. */
230 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
232 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
236 /* Return the PTX name of the data area in which SYM should be
237 placed. The symbol must have already been processed by
238 nvptx_encode_section_info, or equivalent. */
240 static const char *
241 section_for_sym (rtx sym)
243 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
244 /* Same order as nvptx_data_area enum. */
245 static char const *const areas[] =
246 {"", ".global", ".shared", ".local", ".const", ".param"};
248 return areas[area];
251 /* Similarly for a decl. */
253 static const char *
254 section_for_decl (const_tree decl)
256 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
259 /* Check NAME for special function names and redirect them by returning a
260 replacement. This applies to malloc, free and realloc, for which we
261 want to use libgcc wrappers, and call, which triggers a bug in ptxas. */
263 static const char *
264 nvptx_name_replacement (const char *name)
266 if (strcmp (name, "call") == 0)
267 return "__nvptx_call";
268 if (strcmp (name, "malloc") == 0)
269 return "__nvptx_malloc";
270 if (strcmp (name, "free") == 0)
271 return "__nvptx_free";
272 if (strcmp (name, "realloc") == 0)
273 return "__nvptx_realloc";
274 return name;
277 /* If MODE should be treated as two registers of an inner mode, return
278 that inner mode. Otherwise return VOIDmode. */
280 static machine_mode
281 maybe_split_mode (machine_mode mode)
283 if (COMPLEX_MODE_P (mode))
284 return GET_MODE_INNER (mode);
286 if (mode == TImode)
287 return DImode;
289 return VOIDmode;
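/* For example, TImode values are handled as two DImode registers, and a
   complex mode such as DCmode as two DFmode parts.  */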
292 /* Output a register, subreg, or register pair (with optional
293 enclosing braces). */
295 static void
296 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
297 int subreg_offset = -1)
299 if (inner_mode == VOIDmode)
301 if (HARD_REGISTER_NUM_P (regno))
302 fprintf (file, "%s", reg_names[regno]);
303 else
304 fprintf (file, "%%r%d", regno);
306 else if (subreg_offset >= 0)
308 output_reg (file, regno, VOIDmode);
309 fprintf (file, "$%d", subreg_offset);
311 else
313 if (subreg_offset == -1)
314 fprintf (file, "{");
315 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
316 fprintf (file, ",");
317 output_reg (file, regno, inner_mode, 0);
318 if (subreg_offset == -1)
319 fprintf (file, "}");
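/* For instance, a TImode pseudo 35 with DImode halves is printed as the
   pair "{%r35$8,%r35$0}"; with a SUBREG_OFFSET of -2 the enclosing braces
   are omitted, as used when declaring the registers.  */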
323 /* Emit forking instructions for MASK. */
325 static void
326 nvptx_emit_forking (unsigned mask, bool is_call)
328 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
329 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
330 if (mask)
332 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
334 /* Emit fork at all levels. This helps form SESE regions, as
335 it creates a block with a single successor before entering a
336 partitioned region. That is a good candidate for the end of
337 an SESE region. */
338 if (!is_call)
339 emit_insn (gen_nvptx_fork (op));
340 emit_insn (gen_nvptx_forked (op));
344 /* Emit joining instructions for MASK. */
346 static void
347 nvptx_emit_joining (unsigned mask, bool is_call)
349 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
350 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
351 if (mask)
353 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
355 /* Emit joining for all non-call pars to ensure there's a single
356 predecessor for the block the join insn ends up in. This is
357 needed for skipping entire loops. */
358 if (!is_call)
359 emit_insn (gen_nvptx_joining (op));
360 emit_insn (gen_nvptx_join (op));
365 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
366 returned in memory. Integer and floating types supported by the
367 machine are passed in registers, everything else is passed in
368 memory. Complex types are split. */
370 static bool
371 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
373 if (type)
375 if (AGGREGATE_TYPE_P (type))
376 return true;
377 if (TREE_CODE (type) == VECTOR_TYPE)
378 return true;
381 if (!for_return && COMPLEX_MODE_P (mode))
382 /* Complex types are passed as two underlying args. */
383 mode = GET_MODE_INNER (mode);
385 if (GET_MODE_CLASS (mode) != MODE_INT
386 && GET_MODE_CLASS (mode) != MODE_FLOAT)
387 return true;
389 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
390 return true;
392 return false;
395 /* A non-memory argument of mode MODE is being passed, determine the mode it
396 should be promoted to. This is also used for determining return
397 type promotion. */
399 static machine_mode
400 promote_arg (machine_mode mode, bool prototyped)
402 if (!prototyped && mode == SFmode)
403 /* K&R float promotion for unprototyped functions. */
404 mode = DFmode;
405 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
406 mode = SImode;
408 return mode;
411 /* A non-memory return type of MODE is being returned. Determine the
412 mode it should be promoted to. */
414 static machine_mode
415 promote_return (machine_mode mode)
417 return promote_arg (mode, true);
420 /* Implement TARGET_FUNCTION_ARG. */
422 static rtx
423 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
424 const_tree, bool named)
426 if (mode == VOIDmode || !named)
427 return NULL_RTX;
429 return gen_reg_rtx (mode);
432 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
434 static rtx
435 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
436 const_tree, bool named)
438 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
440 if (mode == VOIDmode || !named)
441 return NULL_RTX;
443 /* No need to deal with split modes here, the only case that can
444 happen is complex modes and those are dealt with by
445 TARGET_SPLIT_COMPLEX_ARG. */
446 return gen_rtx_UNSPEC (mode,
447 gen_rtvec (1, GEN_INT (cum->count)),
448 UNSPEC_ARG_REG);
451 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
453 static void
454 nvptx_function_arg_advance (cumulative_args_t cum_v,
455 machine_mode ARG_UNUSED (mode),
456 const_tree ARG_UNUSED (type),
457 bool ARG_UNUSED (named))
459 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
461 cum->count++;
464 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
466 For nvptx, we know how to handle functions declared as stdarg: by
467 passing an extra pointer to the unnamed arguments. However, the
468 Fortran frontend can produce a different situation, where a
469 function pointer is declared with no arguments, but the actual
470 function and calls to it take more arguments. In that case, we
471 want to ensure the call matches the definition of the function. */
473 static bool
474 nvptx_strict_argument_naming (cumulative_args_t cum_v)
476 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
478 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
481 /* Implement TARGET_LIBCALL_VALUE. */
483 static rtx
484 nvptx_libcall_value (machine_mode mode, const_rtx)
486 if (!cfun || !cfun->machine->doing_call)
487 /* Pretend to return in a hard reg for early uses before pseudos can be
488 generated. */
489 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
491 return gen_reg_rtx (mode);
494 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
495 where function FUNC returns or receives a value of data type TYPE. */
497 static rtx
498 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
499 bool outgoing)
501 machine_mode mode = promote_return (TYPE_MODE (type));
503 if (outgoing)
505 gcc_assert (cfun);
506 cfun->machine->return_mode = mode;
507 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
510 return nvptx_libcall_value (mode, NULL_RTX);
513 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
515 static bool
516 nvptx_function_value_regno_p (const unsigned int regno)
518 return regno == NVPTX_RETURN_REGNUM;
521 /* Types with a mode other than those supported by the machine are passed by
522 reference in memory. */
524 static bool
525 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
526 machine_mode mode, const_tree type,
527 bool ARG_UNUSED (named))
529 return pass_in_memory (mode, type, false);
532 /* Implement TARGET_RETURN_IN_MEMORY. */
534 static bool
535 nvptx_return_in_memory (const_tree type, const_tree)
537 return pass_in_memory (TYPE_MODE (type), type, true);
540 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
542 static machine_mode
543 nvptx_promote_function_mode (const_tree type, machine_mode mode,
544 int *ARG_UNUSED (punsignedp),
545 const_tree funtype, int for_return)
547 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
550 /* Helper for write_arg. Emit a single PTX argument of MODE, either
551 in a prototype, or as copy in a function prologue. ARGNO is the
552 index of this argument in the PTX function. FOR_REG is negative,
553 if we're emitting the PTX prototype. It is zero if we're copying
554 to an argument register and it is greater than zero if we're
555 copying to a specific hard register. */
557 static int
558 write_arg_mode (std::stringstream &s, int for_reg, int argno,
559 machine_mode mode)
561 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
563 if (for_reg < 0)
565 /* Writing PTX prototype. */
566 s << (argno ? ", " : " (");
567 s << ".param" << ptx_type << " %in_ar" << argno;
569 else
571 s << "\t.reg" << ptx_type << " ";
572 if (for_reg)
573 s << reg_names[for_reg];
574 else
575 s << "%ar" << argno;
576 s << ";\n";
577 if (argno >= 0)
579 s << "\tld.param" << ptx_type << " ";
580 if (for_reg)
581 s << reg_names[for_reg];
582 else
583 s << "%ar" << argno;
584 s << ", [%in_ar" << argno << "];\n";
587 return argno + 1;
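/* For example, with MODE == SImode and ARGNO == 0 this contributes
   " (.param.u32 %in_ar0" to a prototype (FOR_REG < 0), or
   "\t.reg.u32 %ar0;" followed by "\tld.param.u32 %ar0, [%in_ar0];"
   when copying into argument register 0 (FOR_REG == 0).  */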
590 /* Process function parameter TYPE to emit one or more PTX
591 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
592 is true, if this is a prototyped function, rather than an old-style
593 C declaration. Returns the next argument number to use.
595 The promotion behavior here must match the regular GCC function
596 parameter marshalling machinery. */
598 static int
599 write_arg_type (std::stringstream &s, int for_reg, int argno,
600 tree type, bool prototyped)
602 machine_mode mode = TYPE_MODE (type);
604 if (mode == VOIDmode)
605 return argno;
607 if (pass_in_memory (mode, type, false))
608 mode = Pmode;
609 else
611 bool split = TREE_CODE (type) == COMPLEX_TYPE;
613 if (split)
615 /* Complex types are sent as two separate args. */
616 type = TREE_TYPE (type);
617 mode = TYPE_MODE (type);
618 prototyped = true;
621 mode = promote_arg (mode, prototyped);
622 if (split)
623 argno = write_arg_mode (s, for_reg, argno, mode);
626 return write_arg_mode (s, for_reg, argno, mode);
629 /* Emit a PTX return as a prototype or function prologue declaration
630 for MODE. */
632 static void
633 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
635 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
636 const char *pfx = "\t.reg";
637 const char *sfx = ";\n";
639 if (for_proto)
640 pfx = "(.param", sfx = "_out) ";
642 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
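/* E.g. for an SImode return this writes "(.param.u32 R_out) " into a
   prototype, or "\t.reg.u32 R;" into a prologue, where R stands for the
   PTX name of NVPTX_RETURN_REGNUM.  */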
645 /* Process a function return TYPE to emit a PTX return as a prototype
646 or function prologue declaration. Returns true if return is via an
647 additional pointer parameter. The promotion behavior here must
648 match the regular GCC function return marshalling. */
650 static bool
651 write_return_type (std::stringstream &s, bool for_proto, tree type)
653 machine_mode mode = TYPE_MODE (type);
655 if (mode == VOIDmode)
656 return false;
658 bool return_in_mem = pass_in_memory (mode, type, true);
660 if (return_in_mem)
662 if (for_proto)
663 return return_in_mem;
665 /* Named return values can cause us to return a pointer as well
666 as expect an argument for the return location. This is
667 optimization-level specific, so no caller can make use of
668 this data, but more importantly for us, we must ensure it
669 doesn't change the PTX prototype. */
670 mode = (machine_mode) cfun->machine->return_mode;
672 if (mode == VOIDmode)
673 return return_in_mem;
675 /* Clear return_mode to inhibit copy of retval to non-existent
676 retval parameter. */
677 cfun->machine->return_mode = VOIDmode;
679 else
680 mode = promote_return (mode);
682 write_return_mode (s, for_proto, mode);
684 return return_in_mem;
687 /* Look for attributes in ATTRS that would indicate we must write a function
688 as a .entry kernel rather than a .func. Return true if one is found. */
690 static bool
691 write_as_kernel (tree attrs)
693 return (lookup_attribute ("kernel", attrs) != NULL_TREE
694 || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
697 /* Emit a linker marker for a function decl or defn. */
699 static void
700 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
701 const char *name)
703 s << "\n// BEGIN";
704 if (globalize)
705 s << " GLOBAL";
706 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
707 s << name << "\n";
710 /* Emit a linker marker for a variable decl or defn. */
712 static void
713 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
715 fprintf (file, "\n// BEGIN%s VAR %s: ",
716 globalize ? " GLOBAL" : "",
717 is_defn ? "DEF" : "DECL");
718 assemble_name_raw (file, name);
719 fputs ("\n", file);
722 /* Write a .func or .kernel declaration or definition along with
723 a helper comment for use by ld. S is the stream to write to, DECL
724 the decl for the function with name NAME. For definitions, emit
725 a declaration too. */
727 static const char *
728 write_fn_proto (std::stringstream &s, bool is_defn,
729 const char *name, const_tree decl)
731 if (is_defn)
732 /* Emit a declaration. The PTX assembler gets upset without it. */
733 name = write_fn_proto (s, false, name, decl);
734 else
736 /* Avoid repeating the name replacement. */
737 name = nvptx_name_replacement (name);
738 if (name[0] == '*')
739 name++;
742 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
744 /* PTX declaration. */
745 if (DECL_EXTERNAL (decl))
746 s << ".extern ";
747 else if (TREE_PUBLIC (decl))
748 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
749 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
751 tree fntype = TREE_TYPE (decl);
752 tree result_type = TREE_TYPE (fntype);
754 /* atomic_compare_exchange_$n builtins have an exceptional calling
755 convention. */
756 int not_atomic_weak_arg = -1;
757 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
758 switch (DECL_FUNCTION_CODE (decl))
760 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
761 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
762 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
763 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
764 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
765 /* These atomics skip the 'weak' parm in an actual library
766 call. We must skip it in the prototype too. */
767 not_atomic_weak_arg = 3;
768 break;
770 default:
771 break;
774 /* Declare the result. */
775 bool return_in_mem = write_return_type (s, true, result_type);
777 s << name;
779 int argno = 0;
781 /* Emit argument list. */
782 if (return_in_mem)
783 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
785 /* We get:
786 NULL in TYPE_ARG_TYPES, for old-style functions
787 NULL in DECL_ARGUMENTS, for builtin functions without another
788 declaration.
789 So we have to pick the best one we have. */
790 tree args = TYPE_ARG_TYPES (fntype);
791 bool prototyped = true;
792 if (!args)
794 args = DECL_ARGUMENTS (decl);
795 prototyped = false;
798 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
800 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
802 if (not_atomic_weak_arg)
803 argno = write_arg_type (s, -1, argno, type, prototyped);
804 else
805 gcc_assert (type == boolean_type_node);
808 if (stdarg_p (fntype))
809 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
811 if (DECL_STATIC_CHAIN (decl))
812 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
814 if (!argno && strcmp (name, "main") == 0)
816 argno = write_arg_type (s, -1, argno, integer_type_node, true);
817 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
820 if (argno)
821 s << ")";
823 s << (is_defn ? "\n" : ";\n");
825 return name;
828 /* Construct a function declaration from a call insn. This can be
829 necessary for two reasons - either we have an indirect call which
830 requires a .callprototype declaration, or we have a libcall
831 generated by emit_library_call for which no decl exists. */
833 static void
834 write_fn_proto_from_insn (std::stringstream &s, const char *name,
835 rtx result, rtx pat)
837 if (!name)
839 s << "\t.callprototype ";
840 name = "_";
842 else
844 name = nvptx_name_replacement (name);
845 write_fn_marker (s, false, true, name);
846 s << "\t.extern .func ";
849 if (result != NULL_RTX)
850 write_return_mode (s, true, GET_MODE (result));
852 s << name;
854 int arg_end = XVECLEN (pat, 0);
855 for (int i = 1; i < arg_end; i++)
857 /* We don't have to deal with mode splitting & promotion here,
858 as that was already done when generating the call
859 sequence. */
860 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
862 write_arg_mode (s, -1, i - 1, mode);
864 if (arg_end != 1)
865 s << ")";
866 s << ";\n";
869 /* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
870 table and write a ptx prototype. These are emitted at end of
871 compilation. */
873 static void
874 nvptx_record_fndecl (tree decl)
876 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
877 if (*slot == NULL)
879 *slot = decl;
880 const char *name = get_fnname_from_decl (decl);
881 write_fn_proto (func_decls, false, name, decl);
885 /* Record a libcall or unprototyped external function. CALLEE is the
886 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
887 declaration for it. */
889 static void
890 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
892 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
893 if (*slot == NULL)
895 *slot = callee;
897 const char *name = XSTR (callee, 0);
898 write_fn_proto_from_insn (func_decls, name, retval, pat);
902 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
903 is prototyped, record it now. Otherwise record it as needed at end
904 of compilation, when we might have more information about it. */
906 void
907 nvptx_record_needed_fndecl (tree decl)
909 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
911 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
912 if (*slot == NULL)
913 *slot = decl;
915 else
916 nvptx_record_fndecl (decl);
919 /* SYM is a SYMBOL_REF. If it refers to an external function, record
920 it as needed. */
922 static void
923 nvptx_maybe_record_fnsym (rtx sym)
925 tree decl = SYMBOL_REF_DECL (sym);
927 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
928 nvptx_record_needed_fndecl (decl);
931 /* Emit a local array to hold some part of a conventional stack frame
932 and initialize REGNO to point to it. If the size is zero, it'll
933 never be valid to dereference, so we can simply initialize to
934 zero. */
936 static void
937 init_frame (FILE *file, int regno, unsigned align, unsigned size)
939 if (size)
940 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
941 align, reg_names[regno], size);
942 fprintf (file, "\t.reg.u%d %s;\n",
943 POINTER_SIZE, reg_names[regno]);
944 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
945 : "\tmov.u%d %s, 0;\n"),
946 POINTER_SIZE, reg_names[regno], reg_names[regno]);
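/* For instance, with a 16-byte frame, 8-byte alignment and 64-bit
   pointers this emits (R being the register's PTX name):
     .local .align 8 .b8 R_ar[16];
     .reg.u64 R;
     cvta.local.u64 R, R_ar;  */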
949 /* Emit code to initialize the REGNO predicate register to indicate
950 whether we are not lane zero on the NAME axis. */
952 static void
953 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
955 fprintf (file, "\t{\n");
956 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
957 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
958 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
959 fprintf (file, "\t}\n");
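/* For the "x" axis and REGNO 99 this emits PTX of the form:
     {
       .reg.u32 %x;
       mov.u32 %x, %tid.x;
       setp.ne.u32 %r99, %x, 0;
     }  */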
962 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
963 function, including local var decls and copies from the arguments to
964 local regs. */
966 void
967 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
969 tree fntype = TREE_TYPE (decl);
970 tree result_type = TREE_TYPE (fntype);
971 int argno = 0;
973 /* We construct the initial part of the function into a string
974 stream, in order to share the prototype writing code. */
975 std::stringstream s;
976 write_fn_proto (s, true, name, decl);
977 s << "{\n";
979 bool return_in_mem = write_return_type (s, false, result_type);
980 if (return_in_mem)
981 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
983 /* Declare and initialize incoming arguments. */
984 tree args = TYPE_ARG_TYPES (fntype);
985 bool prototyped = true;
986 if (!args)
988 args = DECL_ARGUMENTS (decl);
989 prototyped = false;
992 for (; args != NULL_TREE; args = TREE_CHAIN (args))
994 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
996 argno = write_arg_type (s, 0, argno, type, prototyped);
999 if (stdarg_p (fntype))
1000 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1001 true);
1003 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1004 write_arg_type (s, STATIC_CHAIN_REGNUM,
1005 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1006 true);
1008 fprintf (file, "%s", s.str().c_str());
1010 /* Declare a local var for outgoing varargs. */
1011 if (cfun->machine->has_varadic)
1012 init_frame (file, STACK_POINTER_REGNUM,
1013 UNITS_PER_WORD, crtl->outgoing_args_size);
1015 /* Declare a local variable for the frame. */
1016 HOST_WIDE_INT sz = get_frame_size ();
1017 if (sz || cfun->machine->has_chain)
1018 init_frame (file, FRAME_POINTER_REGNUM,
1019 crtl->stack_alignment_needed / BITS_PER_UNIT, sz);
1021 /* Declare the pseudos we have as ptx registers. */
1022 int maxregs = max_reg_num ();
1023 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1025 if (regno_reg_rtx[i] != const0_rtx)
1027 machine_mode mode = PSEUDO_REGNO_MODE (i);
1028 machine_mode split = maybe_split_mode (mode);
1030 if (split != VOIDmode)
1031 mode = split;
1032 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1033 output_reg (file, i, split, -2);
1034 fprintf (file, ";\n");
1038 /* Emit axis predicates. */
1039 if (cfun->machine->axis_predicate[0])
1040 nvptx_init_axis_predicate (file,
1041 REGNO (cfun->machine->axis_predicate[0]), "y");
1042 if (cfun->machine->axis_predicate[1])
1043 nvptx_init_axis_predicate (file,
1044 REGNO (cfun->machine->axis_predicate[1]), "x");
1047 /* Output a return instruction. Also copy the return value to its outgoing
1048 location. */
1050 const char *
1051 nvptx_output_return (void)
1053 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1055 if (mode != VOIDmode)
1056 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1057 nvptx_ptx_type_from_mode (mode, false),
1058 reg_names[NVPTX_RETURN_REGNUM],
1059 reg_names[NVPTX_RETURN_REGNUM]);
1061 return "ret;";
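/* E.g. for an SImode return value this emits
     st.param.u32 [R_out], R;
     ret;
   where R is the PTX name of NVPTX_RETURN_REGNUM.  */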
1064 /* Terminate a function by writing a closing brace to FILE. */
1066 void
1067 nvptx_function_end (FILE *file)
1069 fprintf (file, "}\n");
1072 /* Decide whether we can make a sibling call to a function. For ptx, we
1073 can't. */
1075 static bool
1076 nvptx_function_ok_for_sibcall (tree, tree)
1078 return false;
1081 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1083 static rtx
1084 nvptx_get_drap_rtx (void)
1086 return NULL_RTX;
1089 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1090 argument to the next call. */
1092 static void
1093 nvptx_call_args (rtx arg, tree fntype)
1095 if (!cfun->machine->doing_call)
1097 cfun->machine->doing_call = true;
1098 cfun->machine->is_varadic = false;
1099 cfun->machine->num_args = 0;
1101 if (fntype && stdarg_p (fntype))
1103 cfun->machine->is_varadic = true;
1104 cfun->machine->has_varadic = true;
1105 cfun->machine->num_args++;
1109 if (REG_P (arg) && arg != pc_rtx)
1111 cfun->machine->num_args++;
1112 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1113 cfun->machine->call_args);
1117 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1118 information we recorded. */
1120 static void
1121 nvptx_end_call_args (void)
1123 cfun->machine->doing_call = false;
1124 free_EXPR_LIST_list (&cfun->machine->call_args);
1127 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1128 track of whether calls involving static chains or varargs were seen
1129 in the current function.
1130 For libcalls, maintain a hash table of decls we have seen, and
1131 record a function decl for later when encountering a new one. */
1133 void
1134 nvptx_expand_call (rtx retval, rtx address)
1136 rtx callee = XEXP (address, 0);
1137 rtx varargs = NULL_RTX;
1138 unsigned parallel = 0;
1140 if (!call_insn_operand (callee, Pmode))
1142 callee = force_reg (Pmode, callee);
1143 address = change_address (address, QImode, callee);
1146 if (GET_CODE (callee) == SYMBOL_REF)
1148 tree decl = SYMBOL_REF_DECL (callee);
1149 if (decl != NULL_TREE)
1151 if (DECL_STATIC_CHAIN (decl))
1152 cfun->machine->has_chain = true;
1154 tree attr = get_oacc_fn_attrib (decl);
1155 if (attr)
1157 tree dims = TREE_VALUE (attr);
1159 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1160 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1162 if (TREE_PURPOSE (dims)
1163 && !integer_zerop (TREE_PURPOSE (dims)))
1164 break;
1165 /* Not on this axis. */
1166 parallel ^= GOMP_DIM_MASK (ix);
1167 dims = TREE_CHAIN (dims);
1173 unsigned nargs = cfun->machine->num_args;
1174 if (cfun->machine->is_varadic)
1176 varargs = gen_reg_rtx (Pmode);
1177 emit_move_insn (varargs, stack_pointer_rtx);
1180 rtvec vec = rtvec_alloc (nargs + 1);
1181 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1182 int vec_pos = 0;
1184 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1185 rtx tmp_retval = retval;
1186 if (retval)
1188 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1189 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1190 call = gen_rtx_SET (tmp_retval, call);
1192 XVECEXP (pat, 0, vec_pos++) = call;
1194 /* Construct the call insn, including a USE for each argument pseudo
1195 register. These will be used when printing the insn. */
1196 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1197 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1199 if (varargs)
1200 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1202 gcc_assert (vec_pos == XVECLEN (pat, 0));
1204 nvptx_emit_forking (parallel, true);
1205 emit_call_insn (pat);
1206 nvptx_emit_joining (parallel, true);
1208 if (tmp_retval != retval)
1209 emit_move_insn (retval, tmp_retval);
1212 /* Emit a comparison COMPARE, and return the new test to be used in the
1213 jump. */
1215 rtx
1216 nvptx_expand_compare (rtx compare)
1218 rtx pred = gen_reg_rtx (BImode);
1219 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1220 XEXP (compare, 0), XEXP (compare, 1));
1221 emit_insn (gen_rtx_SET (pred, cmp));
1222 return gen_rtx_NE (BImode, pred, const0_rtx);
1225 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1227 void
1228 nvptx_expand_oacc_fork (unsigned mode)
1230 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1233 void
1234 nvptx_expand_oacc_join (unsigned mode)
1236 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1239 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1240 objects. */
1242 static rtx
1243 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1245 rtx res;
1247 switch (GET_MODE (src))
1249 case DImode:
1250 res = gen_unpackdisi2 (dst0, dst1, src);
1251 break;
1252 case DFmode:
1253 res = gen_unpackdfsi2 (dst0, dst1, src);
1254 break;
1255 default: gcc_unreachable ();
1257 return res;
1260 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1261 object. */
1263 static rtx
1264 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1266 rtx res;
1268 switch (GET_MODE (dst))
1270 case DImode:
1271 res = gen_packsidi2 (dst, src0, src1);
1272 break;
1273 case DFmode:
1274 res = gen_packsidf2 (dst, src0, src1);
1275 break;
1276 default: gcc_unreachable ();
1278 return res;
1281 /* Generate an instruction or sequence to broadcast register REG
1282 across the vectors of a single warp. */
1284 static rtx
1285 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1287 rtx res;
1289 switch (GET_MODE (dst))
1291 case SImode:
1292 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1293 break;
1294 case SFmode:
1295 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1296 break;
1297 case DImode:
1298 case DFmode:
1300 rtx tmp0 = gen_reg_rtx (SImode);
1301 rtx tmp1 = gen_reg_rtx (SImode);
1303 start_sequence ();
1304 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1305 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1306 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1307 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1308 res = get_insns ();
1309 end_sequence ();
1311 break;
1312 case BImode:
1314 rtx tmp = gen_reg_rtx (SImode);
1316 start_sequence ();
1317 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1318 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1319 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1320 res = get_insns ();
1321 end_sequence ();
1323 break;
1324 case QImode:
1325 case HImode:
1327 rtx tmp = gen_reg_rtx (SImode);
1329 start_sequence ();
1330 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1331 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1332 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1333 tmp)));
1334 res = get_insns ();
1335 end_sequence ();
1337 break;
1339 default:
1340 gcc_unreachable ();
1342 return res;
1345 /* Generate an instruction or sequence to broadcast register REG
1346 across the vectors of a single warp. */
1348 static rtx
1349 nvptx_gen_vcast (rtx reg)
1351 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1354 /* Structure used when generating a worker-level spill or fill. */
1356 struct wcast_data_t
1358 rtx base; /* Register holding base addr of buffer. */
1359 rtx ptr; /* Iteration var, if needed. */
1360 unsigned offset; /* Offset into worker buffer. */
1363 /* Direction of the spill/fill and looping setup/teardown indicator. */
1365 enum propagate_mask
1367 PM_read = 1 << 0,
1368 PM_write = 1 << 1,
1369 PM_loop_begin = 1 << 2,
1370 PM_loop_end = 1 << 3,
1372 PM_read_write = PM_read | PM_write
1375 /* Generate instruction(s) to spill or fill register REG to/from the
1376 worker broadcast array. PM indicates what is to be done, REP
1377 how many loop iterations will be executed (0 for not a loop). */
1379 static rtx
1380 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1382 rtx res;
1383 machine_mode mode = GET_MODE (reg);
1385 switch (mode)
1387 case BImode:
1389 rtx tmp = gen_reg_rtx (SImode);
1391 start_sequence ();
1392 if (pm & PM_read)
1393 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1394 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1395 if (pm & PM_write)
1396 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1397 res = get_insns ();
1398 end_sequence ();
1400 break;
1402 default:
1404 rtx addr = data->ptr;
1406 if (!addr)
1408 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1410 if (align > worker_bcast_align)
1411 worker_bcast_align = align;
1412 data->offset = (data->offset + align - 1) & ~(align - 1);
1413 addr = data->base;
1414 if (data->offset)
1415 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1418 addr = gen_rtx_MEM (mode, addr);
1419 if (pm == PM_read)
1420 res = gen_rtx_SET (addr, reg);
1421 else if (pm == PM_write)
1422 res = gen_rtx_SET (reg, addr);
1423 else
1424 gcc_unreachable ();
1426 if (data->ptr)
1428 /* We're using a ptr, increment it. */
1429 start_sequence ();
1431 emit_insn (res);
1432 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1433 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1434 res = get_insns ();
1435 end_sequence ();
1437 else
1438 rep = 1;
1439 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1441 break;
1443 return res;
1446 /* Returns true if X is a valid address for use in a memory reference. */
1448 static bool
1449 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1451 enum rtx_code code = GET_CODE (x);
1453 switch (code)
1455 case REG:
1456 return true;
1458 case PLUS:
1459 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1460 return true;
1461 return false;
1463 case CONST:
1464 case SYMBOL_REF:
1465 case LABEL_REF:
1466 return true;
1468 default:
1469 return false;
1473 /* Machinery to output constant initializers. When beginning an
1474 initializer, we decide on a fragment size (which is visible in ptx
1475 in the type used), and then all initializer data is buffered until
1476 a fragment is filled and ready to be written out. */
1478 static struct
1480 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1481 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1482 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1483 out. */
1484 unsigned size; /* Fragment size to accumulate. */
1485 unsigned offset; /* Offset within current fragment. */
1486 bool started; /* Whether we've output any initializer. */
1487 } init_frag;
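/* For example, a public "int x[2] = { 1, 2 };" selects a 4-byte fragment
   size and is emitted roughly as
     .visible .global .align 4 .u32 x[2] = { 1, 2 };  */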
1489 /* The current fragment is full, write it out. SYM may provide a
1490 symbolic reference we should output, in which case the fragment
1491 value is the addend. */
1493 static void
1494 output_init_frag (rtx sym)
1496 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1497 unsigned HOST_WIDE_INT val = init_frag.val;
1499 init_frag.started = true;
1500 init_frag.val = 0;
1501 init_frag.offset = 0;
1502 init_frag.remaining--;
1504 if (sym)
1506 fprintf (asm_out_file, "generic(");
1507 output_address (VOIDmode, sym);
1508 fprintf (asm_out_file, val ? ") + " : ")");
1511 if (!sym || val)
1512 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1515 /* Add value VAL of size SIZE to the data we're emitting, and keep
1516 writing out chunks as they fill up. */
1518 static void
1519 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1521 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1523 for (unsigned part = 0; size; size -= part)
1525 val >>= part * BITS_PER_UNIT;
1526 part = init_frag.size - init_frag.offset;
1527 if (part > size)
1528 part = size;
1530 unsigned HOST_WIDE_INT partial
1531 = val << (init_frag.offset * BITS_PER_UNIT);
1532 init_frag.val |= partial & init_frag.mask;
1533 init_frag.offset += part;
1535 if (init_frag.offset == init_frag.size)
1536 output_init_frag (NULL);
1540 /* Target hook for assembling integer object X of size SIZE. */
1542 static bool
1543 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1545 HOST_WIDE_INT val = 0;
1547 switch (GET_CODE (x))
1549 default:
1550 /* Let the generic machinery figure it out, usually for a
1551 CONST_WIDE_INT. */
1552 return false;
1554 case CONST_INT:
1555 nvptx_assemble_value (INTVAL (x), size);
1556 break;
1558 case CONST:
1559 x = XEXP (x, 0);
1560 gcc_assert (GET_CODE (x) == PLUS);
1561 val = INTVAL (XEXP (x, 1));
1562 x = XEXP (x, 0);
1563 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1564 /* FALLTHROUGH */
1566 case SYMBOL_REF:
1567 gcc_assert (size == init_frag.size);
1568 if (init_frag.offset)
1569 sorry ("cannot emit unaligned pointers in ptx assembly");
1571 nvptx_maybe_record_fnsym (x);
1572 init_frag.val = val;
1573 output_init_frag (x);
1574 break;
1577 return true;
1580 /* Output SIZE zero bytes. We ignore the FILE argument since the
1581 functions we're calling to perform the output just use
1582 asm_out_file. */
1584 void
1585 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1587 /* Finish the current fragment, if it's started. */
1588 if (init_frag.offset)
1590 unsigned part = init_frag.size - init_frag.offset;
1591 if (part > size)
1592 part = (unsigned) size;
1593 size -= part;
1594 nvptx_assemble_value (0, part);
1597 /* If this skip doesn't terminate the initializer, write as many
1598 remaining pieces as possible directly. */
1599 if (size < init_frag.remaining * init_frag.size)
1601 while (size >= init_frag.size)
1603 size -= init_frag.size;
1604 output_init_frag (NULL_RTX);
1606 if (size)
1607 nvptx_assemble_value (0, size);
1611 /* Output a string STR with length SIZE. As in nvptx_output_skip we
1612 ignore the FILE arg. */
1614 void
1615 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
1617 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
1618 nvptx_assemble_value (str[i], 1);
1621 /* Emit a PTX variable decl and prepare for emission of its
1622 initializer. NAME is the symbol name and SECTION the PTX data
1623 area. The type is TYPE, object size SIZE and alignment is ALIGN.
1624 The caller has already emitted any indentation and linkage
1625 specifier. It is responsible for any initializer, terminating ;
1626 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
1627 this is the opposite way round from how PTX wants them! */
1629 static void
1630 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
1631 const_tree type, HOST_WIDE_INT size, unsigned align)
1633 while (TREE_CODE (type) == ARRAY_TYPE)
1634 type = TREE_TYPE (type);
1636 if (TREE_CODE (type) == VECTOR_TYPE
1637 || TREE_CODE (type) == COMPLEX_TYPE)
1638 /* Neither vector nor complex types can contain the other. */
1639 type = TREE_TYPE (type);
1641 unsigned elt_size = int_size_in_bytes (type);
1643 /* Largest mode we're prepared to accept. For BLKmode types we
1644 don't know if it'll contain pointer constants, so have to choose
1645 pointer size, otherwise we can choose DImode. */
1646 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
1648 elt_size |= GET_MODE_SIZE (elt_mode);
1649 elt_size &= -elt_size; /* Extract LSB set. */
1651 init_frag.size = elt_size;
1652 /* Avoid undefined shift behavior by using '2'. */
1653 init_frag.mask = ((unsigned HOST_WIDE_INT)2
1654 << (elt_size * BITS_PER_UNIT - 1)) - 1;
1655 init_frag.val = 0;
1656 init_frag.offset = 0;
1657 init_frag.started = false;
1658 /* Size might not be a multiple of elt size, if there's an
1659 initialized trailing struct array with smaller type than
1660 elt_size. */
1661 init_frag.remaining = (size + elt_size - 1) / elt_size;
1663 fprintf (file, "%s .align %d .u%d ",
1664 section, align / BITS_PER_UNIT,
1665 elt_size * BITS_PER_UNIT);
1666 assemble_name (file, name);
1668 if (size)
1669 /* We make everything an array, to simplify any initialization
1670 emission. */
1671 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
1674 /* Called when the initializer for a decl has been completely output through
1675 combinations of the three functions above. */
1677 static void
1678 nvptx_assemble_decl_end (void)
1680 if (init_frag.offset)
1681 /* This can happen with a packed struct with trailing array member. */
1682 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
1683 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
1686 /* Output an uninitialized common or file-scope variable. */
1688 void
1689 nvptx_output_aligned_decl (FILE *file, const char *name,
1690 const_tree decl, HOST_WIDE_INT size, unsigned align)
1692 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1694 /* If this is public, it is common. The nearest thing we have to
1695 common is weak. */
1696 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
1698 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1699 TREE_TYPE (decl), size, align);
1700 nvptx_assemble_decl_end ();
1703 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
1704 writing a constant variable EXP with NAME and SIZE and its
1705 initializer to FILE. */
1707 static void
1708 nvptx_asm_declare_constant_name (FILE *file, const char *name,
1709 const_tree exp, HOST_WIDE_INT obj_size)
1711 write_var_marker (file, true, false, name);
1713 fprintf (file, "\t");
1715 tree type = TREE_TYPE (exp);
1716 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
1717 TYPE_ALIGN (type));
1720 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
1721 a variable DECL with NAME to FILE. */
1723 void
1724 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
1726 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1728 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
1729 : DECL_WEAK (decl) ? ".weak " : ".visible "));
1731 tree type = TREE_TYPE (decl);
1732 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
1733 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1734 type, obj_size, DECL_ALIGN (decl));
1737 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
1739 static void
1740 nvptx_globalize_label (FILE *, const char *)
1744 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
1745 declaration only for variable DECL with NAME to FILE. */
1747 static void
1748 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
1750 /* The middle end can place constant pool decls into the varpool as
1751 undefined. Until that is fixed, catch the problem here. */
1752 if (DECL_IN_CONSTANT_POOL (decl))
1753 return;
1755 write_var_marker (file, false, TREE_PUBLIC (decl), name);
1757 fprintf (file, "\t.extern ");
1758 tree size = DECL_SIZE_UNIT (decl);
1759 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1760 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
1761 DECL_ALIGN (decl));
1762 nvptx_assemble_decl_end ();
1765 /* Output a pattern for a move instruction. */
1767 const char *
1768 nvptx_output_mov_insn (rtx dst, rtx src)
1770 machine_mode dst_mode = GET_MODE (dst);
1771 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
1772 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
1773 machine_mode src_inner = (GET_CODE (src) == SUBREG
1774 ? GET_MODE (XEXP (src, 0)) : dst_mode);
1776 rtx sym = src;
1777 if (GET_CODE (sym) == CONST)
1778 sym = XEXP (XEXP (sym, 0), 0);
1779 if (SYMBOL_REF_P (sym))
1781 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
1782 return "%.\tcvta%D1%t0\t%0, %1;";
1783 nvptx_maybe_record_fnsym (sym);
1786 if (src_inner == dst_inner)
1787 return "%.\tmov%t0\t%0, %1;";
1789 if (CONSTANT_P (src))
1790 return (GET_MODE_CLASS (dst_inner) == MODE_INT
1791 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
1792 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
1794 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
1795 return "%.\tmov.b%T0\t%0, %1;";
1797 return "%.\tcvt%t0%t1\t%0, %1;";
1800 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
1801 involves writing .param declarations and in/out copies into them. For
1802 indirect calls, also write the .callprototype. */
1804 const char *
1805 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
1807 char buf[16];
1808 static int labelno;
1809 bool needs_tgt = register_operand (callee, Pmode);
1810 rtx pat = PATTERN (insn);
1811 int arg_end = XVECLEN (pat, 0);
1812 tree decl = NULL_TREE;
1814 fprintf (asm_out_file, "\t{\n");
1815 if (result != NULL)
1816 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
1817 nvptx_ptx_type_from_mode (GET_MODE (result), false),
1818 reg_names[NVPTX_RETURN_REGNUM]);
1820 /* Ensure we have a ptx declaration in the output if necessary. */
1821 if (GET_CODE (callee) == SYMBOL_REF)
1823 decl = SYMBOL_REF_DECL (callee);
1824 if (!decl
1825 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
1826 nvptx_record_libfunc (callee, result, pat);
1827 else if (DECL_EXTERNAL (decl))
1828 nvptx_record_fndecl (decl);
1831 if (needs_tgt)
1833 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
1834 labelno++;
1835 ASM_OUTPUT_LABEL (asm_out_file, buf);
1836 std::stringstream s;
1837 write_fn_proto_from_insn (s, NULL, result, pat);
1838 fputs (s.str().c_str(), asm_out_file);
1841 for (int argno = 1; argno < arg_end; argno++)
1843 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
1844 machine_mode mode = GET_MODE (t);
1845 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
1847 /* Mode splitting has already been done. */
1848 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
1849 "\t\tst.param%s [%%out_arg%d], ",
1850 ptx_type, argno, ptx_type, argno);
1851 output_reg (asm_out_file, REGNO (t), VOIDmode);
1852 fprintf (asm_out_file, ";\n");
1855 fprintf (asm_out_file, "\t\tcall ");
1856 if (result != NULL_RTX)
1857 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
1859 if (decl)
1861 const char *name = get_fnname_from_decl (decl);
1862 name = nvptx_name_replacement (name);
1863 assemble_name (asm_out_file, name);
1865 else
1866 output_address (VOIDmode, callee);
1868 const char *open = "(";
1869 for (int argno = 1; argno < arg_end; argno++)
1871 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
1872 open = "";
1874 if (decl && DECL_STATIC_CHAIN (decl))
1876 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
1877 open = "";
1879 if (!open[0])
1880 fprintf (asm_out_file, ")");
1882 if (needs_tgt)
1884 fprintf (asm_out_file, ", ");
1885 assemble_name (asm_out_file, buf);
1887 fprintf (asm_out_file, ";\n");
1889 if (find_reg_note (insn, REG_NORETURN, NULL))
1890 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
1891 the flow control barrier they imply. It can seg fault if it
1892 encounters what looks like an unexitable loop. Emit a trailing
1893 trap, which it does grok. */
1894 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
1896 if (result)
1898 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
1900 if (!rval[0])
1901 /* We must escape the '%' that starts RETURN_REGNUM. */
1902 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
1903 reg_names[NVPTX_RETURN_REGNUM]);
1904 return rval;
1907 return "}";
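/* The sequence emitted for a direct call to "foo" with one DImode
   argument and an SImode result looks roughly like this (pseudo register
   numbers arbitrary, R the PTX name of NVPTX_RETURN_REGNUM):
     {
       .param.u32 R_in;
       .param.u64 %out_arg1;
       st.param.u64 [%out_arg1], %r25;
       call (R_in), foo, (%out_arg1);
       ld.param.u32 %r26, [R_in];
     }  */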
1910 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
1912 static bool
1913 nvptx_print_operand_punct_valid_p (unsigned char c)
1915 return c == '.' || c == '#';
1918 static void nvptx_print_operand (FILE *, rtx, int);
1920 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1922 static void
1923 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
1925 rtx off;
1926 if (GET_CODE (x) == CONST)
1927 x = XEXP (x, 0);
1928 switch (GET_CODE (x))
1930 case PLUS:
1931 off = XEXP (x, 1);
1932 output_address (VOIDmode, XEXP (x, 0));
1933 fprintf (file, "+");
1934 output_address (VOIDmode, off);
1935 break;
1937 case SYMBOL_REF:
1938 case LABEL_REF:
1939 output_addr_const (file, x);
1940 break;
1942 default:
1943 gcc_assert (GET_CODE (x) != MEM);
1944 nvptx_print_operand (file, x, 0);
1945 break;
1949 /* Write assembly language output for the address ADDR to FILE. */
1951 static void
1952 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
1954 nvptx_print_address_operand (file, addr, mode);
1957 /* Print an operand, X, to FILE, with an optional modifier in CODE.
1959 Meaning of CODE:
1960 . -- print the predicate for the instruction or an empty string for an
1961 unconditional one.
1962 # -- print a rounding mode for the instruction
1964 A -- print a data area for a MEM
1965 c -- print an opcode suffix for a comparison operator, including a type code
1966 D -- print a data area for a MEM operand
1967 S -- print a shuffle kind specified by CONST_INT
1968 t -- print a type opcode suffix, promoting QImode to 32 bits
1969 T -- print a type size in bits
1970 u -- print a type opcode suffix without promotions. */
1972 static void
1973 nvptx_print_operand (FILE *file, rtx x, int code)
1975 if (code == '.')
1977 x = current_insn_predicate;
1978 if (x)
1980 unsigned int regno = REGNO (XEXP (x, 0));
1981 fputs ("[", file);
1982 if (GET_CODE (x) == EQ)
1983 fputs ("!", file);
1984 fputs (reg_names [regno], file);
1985 fputs ("]", file);
1987 return;
1989 else if (code == '#')
1991 fputs (".rn", file);
1992 return;
1995 enum rtx_code x_code = GET_CODE (x);
1996 machine_mode mode = GET_MODE (x);
1998 switch (code)
2000 case 'A':
2001 x = XEXP (x, 0);
2002 /* FALLTHROUGH. */
2004 case 'D':
2005 if (GET_CODE (x) == CONST)
2006 x = XEXP (x, 0);
2007 if (GET_CODE (x) == PLUS)
2008 x = XEXP (x, 0);
2010 if (GET_CODE (x) == SYMBOL_REF)
2011 fputs (section_for_sym (x), file);
2012 break;
2014 case 't':
2015 case 'u':
2016 if (x_code == SUBREG)
2018 mode = GET_MODE (SUBREG_REG (x));
2019 if (mode == TImode)
2020 mode = DImode;
2021 else if (COMPLEX_MODE_P (mode))
2022 mode = GET_MODE_INNER (mode);
2024 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2025 break;
2027 case 'S':
2029 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2030 /* Same order as nvptx_shuffle_kind. */
2031 static const char *const kinds[] =
2032 {".up", ".down", ".bfly", ".idx"};
2033 fputs (kinds[kind], file);
2035 break;
2037 case 'T':
2038 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2039 break;
2041 case 'j':
2042 fprintf (file, "@");
2043 goto common;
2045 case 'J':
2046 fprintf (file, "@!");
2047 goto common;
2049 case 'c':
2050 mode = GET_MODE (XEXP (x, 0));
2051 switch (x_code)
2053 case EQ:
2054 fputs (".eq", file);
2055 break;
2056 case NE:
2057 if (FLOAT_MODE_P (mode))
2058 fputs (".neu", file);
2059 else
2060 fputs (".ne", file);
2061 break;
2062 case LE:
2063 case LEU:
2064 fputs (".le", file);
2065 break;
2066 case GE:
2067 case GEU:
2068 fputs (".ge", file);
2069 break;
2070 case LT:
2071 case LTU:
2072 fputs (".lt", file);
2073 break;
2074 case GT:
2075 case GTU:
2076 fputs (".gt", file);
2077 break;
2078 case LTGT:
2079 fputs (".ne", file);
2080 break;
2081 case UNEQ:
2082 fputs (".equ", file);
2083 break;
2084 case UNLE:
2085 fputs (".leu", file);
2086 break;
2087 case UNGE:
2088 fputs (".geu", file);
2089 break;
2090 case UNLT:
2091 fputs (".ltu", file);
2092 break;
2093 case UNGT:
2094 fputs (".gtu", file);
2095 break;
2096 case UNORDERED:
2097 fputs (".nan", file);
2098 break;
2099 case ORDERED:
2100 fputs (".num", file);
2101 break;
2102 default:
2103 gcc_unreachable ();
2105 if (FLOAT_MODE_P (mode)
2106 || x_code == EQ || x_code == NE
2107 || x_code == GEU || x_code == GTU
2108 || x_code == LEU || x_code == LTU)
2109 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2110 else
2111 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2112 break;
2113 default:
2114 common:
2115 switch (x_code)
2117 case SUBREG:
2119 rtx inner_x = SUBREG_REG (x);
2120 machine_mode inner_mode = GET_MODE (inner_x);
2121 machine_mode split = maybe_split_mode (inner_mode);
2123 if (split != VOIDmode
2124 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2125 output_reg (file, REGNO (inner_x), split);
2126 else
2127 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2129 break;
2131 case REG:
2132 output_reg (file, REGNO (x), maybe_split_mode (mode));
2133 break;
2135 case MEM:
2136 fputc ('[', file);
2137 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2138 fputc (']', file);
2139 break;
2141 case CONST_INT:
2142 output_addr_const (file, x);
2143 break;
2145 case CONST:
2146 case SYMBOL_REF:
2147 case LABEL_REF:
2148 /* We could use output_addr_const, but that can print things like
2149 "x-8", which breaks ptxas. Need to ensure it is output as
2150 "x+-8". */
2151 nvptx_print_address_operand (file, x, VOIDmode);
2152 break;
2154 case CONST_DOUBLE:
2155 long vals[2];
2156 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2157 vals[0] &= 0xffffffff;
2158 vals[1] &= 0xffffffff;
2159 if (mode == SFmode)
2160 fprintf (file, "0f%08lx", vals[0]);
2161 else
2162 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2163 break;
2165 default:
2166 output_addr_const (file, x);
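/* Editorial illustration, not part of the original source: a sketch of what
   some of the modifiers documented above produce.  For a float comparison
   (ne:BI (reg:SF a) (reg:SF b)), "%c" prints ".neu.f32", so a template along
   the lines of "setp%c\t%0, %1, %2;" would emit something like

	setp.neu.f32	%p1, %f1, %f2;

   "%t" on an SImode operand prints ".u32" and "%T" prints "32".  Register
   names and the insn template shown here are hypothetical.  */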
2171 /* Record replacement regs used to deal with subreg operands. */
2172 struct reg_replace
2174 rtx replacement[MAX_RECOG_OPERANDS];
2175 machine_mode mode;
2176 int n_allocated;
2177 int n_in_use;
2180 /* Allocate or reuse a replacement in R and return the rtx. */
2182 static rtx
2183 get_replacement (struct reg_replace *r)
2185 if (r->n_allocated == r->n_in_use)
2186 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2187 return r->replacement[r->n_in_use++];
2190 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2191 the presence of subregs would break the rules for most instructions.
2192 Replace them with a suitable new register of the right size, plus
2193 conversion copyin/copyout instructions. */
2195 static void
2196 nvptx_reorg_subreg (void)
2198 struct reg_replace qiregs, hiregs, siregs, diregs;
2199 rtx_insn *insn, *next;
2201 qiregs.n_allocated = 0;
2202 hiregs.n_allocated = 0;
2203 siregs.n_allocated = 0;
2204 diregs.n_allocated = 0;
2205 qiregs.mode = QImode;
2206 hiregs.mode = HImode;
2207 siregs.mode = SImode;
2208 diregs.mode = DImode;
2210 for (insn = get_insns (); insn; insn = next)
2212 next = NEXT_INSN (insn);
2213 if (!NONDEBUG_INSN_P (insn)
2214 || asm_noperands (PATTERN (insn)) >= 0
2215 || GET_CODE (PATTERN (insn)) == USE
2216 || GET_CODE (PATTERN (insn)) == CLOBBER)
2217 continue;
2219 qiregs.n_in_use = 0;
2220 hiregs.n_in_use = 0;
2221 siregs.n_in_use = 0;
2222 diregs.n_in_use = 0;
2223 extract_insn (insn);
2224 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2226 for (int i = 0; i < recog_data.n_operands; i++)
2228 rtx op = recog_data.operand[i];
2229 if (GET_CODE (op) != SUBREG)
2230 continue;
2232 rtx inner = SUBREG_REG (op);
2234 machine_mode outer_mode = GET_MODE (op);
2235 machine_mode inner_mode = GET_MODE (inner);
2236 gcc_assert (s_ok);
2237 if (s_ok
2238 && (GET_MODE_PRECISION (inner_mode)
2239 >= GET_MODE_PRECISION (outer_mode)))
2240 continue;
2241 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2242 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2243 : outer_mode == HImode ? &hiregs
2244 : outer_mode == SImode ? &siregs
2245 : &diregs);
2246 rtx new_reg = get_replacement (r);
2248 if (recog_data.operand_type[i] != OP_OUT)
2250 enum rtx_code code;
2251 if (GET_MODE_PRECISION (inner_mode)
2252 < GET_MODE_PRECISION (outer_mode))
2253 code = ZERO_EXTEND;
2254 else
2255 code = TRUNCATE;
2257 rtx pat = gen_rtx_SET (new_reg,
2258 gen_rtx_fmt_e (code, outer_mode, inner));
2259 emit_insn_before (pat, insn);
2262 if (recog_data.operand_type[i] != OP_IN)
2264 enum rtx_code code;
2265 if (GET_MODE_PRECISION (inner_mode)
2266 < GET_MODE_PRECISION (outer_mode))
2267 code = TRUNCATE;
2268 else
2269 code = ZERO_EXTEND;
2271 rtx pat = gen_rtx_SET (inner,
2272 gen_rtx_fmt_e (code, inner_mode, new_reg));
2273 emit_insn_after (pat, insn);
2275 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
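/* Editorial illustration, not part of the original source: the effect of the
   pass above on a single paradoxical-subreg input operand.  If an insn reads
   (subreg:SI (reg:QI 100) 0), that operand is replaced by a fresh SImode
   register and a widening copy is emitted before the insn:

	(set (reg:SI 200) (zero_extend:SI (reg:QI 100)))
	... the original insn, now using (reg:SI 200) ...

   For an output operand the copy runs the other way: a truncating set back
   into (reg:QI 100) is emitted after the insn.  Register numbers are
   hypothetical.  */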
2280 /* Loop structure of the function. The entire function is described as
2281 a NULL loop. */
2283 struct parallel
2285 /* Parent parallel. */
2286 parallel *parent;
2288 /* Next sibling parallel. */
2289 parallel *next;
2291 /* First child parallel. */
2292 parallel *inner;
2294 /* Partitioning mask of the parallel. */
2295 unsigned mask;
2297 /* Partitioning used within inner parallels. */
2298 unsigned inner_mask;
2300 /* Location of parallel forked and join. The forked is the first
2301 block in the parallel and the join is the first block after
2302 the partition. */
2303 basic_block forked_block;
2304 basic_block join_block;
2306 rtx_insn *forked_insn;
2307 rtx_insn *join_insn;
2309 rtx_insn *fork_insn;
2310 rtx_insn *joining_insn;
2312 /* Basic blocks in this parallel, but not in child parallels. The
2313 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2314 blocks are not. */
2315 auto_vec<basic_block> blocks;
2317 public:
2318 parallel (parallel *parent, unsigned mode);
2319 ~parallel ();
2322 /* Constructor links the new parallel into its parent's chain of
2323 children. */
2325 parallel::parallel (parallel *parent_, unsigned mask_)
2326 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2328 forked_block = join_block = 0;
2329 forked_insn = join_insn = 0;
2330 fork_insn = joining_insn = 0;
2332 if (parent)
2334 next = parent->inner;
2335 parent->inner = this;
2339 parallel::~parallel ()
2341 delete inner;
2342 delete next;
2345 /* Map of basic blocks to insns. */
2346 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2348 /* A tuple of an insn of interest and the BB in which it resides. */
2349 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2350 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2352 /* Split basic blocks such that the forked and join unspecs are each at
2353 the start of their basic blocks. Thus afterwards each block will
2354 have a single partitioning mode. We also do the same for return
2355 insns, as they are executed by every thread. Return the
2356 partitioning mode of the function as a whole. Populate MAP with
2357 head and tail blocks. We also clear the BB visited flag, which is
2358 used when finding partitions. */
2360 static void
2361 nvptx_split_blocks (bb_insn_map_t *map)
2363 insn_bb_vec_t worklist;
2364 basic_block block;
2365 rtx_insn *insn;
2367 /* Locate all the reorg instructions of interest. */
2368 FOR_ALL_BB_FN (block, cfun)
2370 bool seen_insn = false;
2372 /* Clear visited flag, for use by parallel locator. */
2373 block->flags &= ~BB_VISITED;
2375 FOR_BB_INSNS (block, insn)
2377 if (!INSN_P (insn))
2378 continue;
2379 switch (recog_memoized (insn))
2381 default:
2382 seen_insn = true;
2383 continue;
2384 case CODE_FOR_nvptx_forked:
2385 case CODE_FOR_nvptx_join:
2386 break;
2388 case CODE_FOR_return:
2389 /* We also need to split just before return insns, as
2390 that insn needs executing by all threads, but the
2391 block it is in probably does not. */
2392 break;
2395 if (seen_insn)
2396 /* We've found an instruction that must be at the start of
2397 a block, but isn't. Add it to the worklist. */
2398 worklist.safe_push (insn_bb_t (insn, block));
2399 else
2400 /* It was already the first instruction. Just add it to
2401 the map. */
2402 map->get_or_insert (block) = insn;
2403 seen_insn = true;
2407 /* Split blocks on the worklist. */
2408 unsigned ix;
2409 insn_bb_t *elt;
2410 basic_block remap = 0;
2411 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2413 if (remap != elt->second)
2415 block = elt->second;
2416 remap = block;
2419 /* Split block before insn. The insn is now in the new block. */
2420 edge e = split_block (block, PREV_INSN (elt->first));
2422 block = e->dest;
2423 map->get_or_insert (block) = elt->first;
2427 /* BLOCK is a basic block containing a head or tail instruction.
2428 Locate the associated prehead or pretail instruction, which must be
2429 in the single predecessor block. */
2431 static rtx_insn *
2432 nvptx_discover_pre (basic_block block, int expected)
2434 gcc_assert (block->preds->length () == 1);
2435 basic_block pre_block = (*block->preds)[0]->src;
2436 rtx_insn *pre_insn;
2438 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2439 pre_insn = PREV_INSN (pre_insn))
2440 gcc_assert (pre_insn != BB_HEAD (pre_block));
2442 gcc_assert (recog_memoized (pre_insn) == expected);
2443 return pre_insn;
2446 /* Dump this parallel and all its inner parallels. */
2448 static void
2449 nvptx_dump_pars (parallel *par, unsigned depth)
2451 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2452 depth, par->mask,
2453 par->forked_block ? par->forked_block->index : -1,
2454 par->join_block ? par->join_block->index : -1);
2456 fprintf (dump_file, " blocks:");
2458 basic_block block;
2459 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2460 fprintf (dump_file, " %d", block->index);
2461 fprintf (dump_file, "\n");
2462 if (par->inner)
2463 nvptx_dump_pars (par->inner, depth + 1);
2465 if (par->next)
2466 nvptx_dump_pars (par->next, depth);
2469 /* If BLOCK contains a fork/join marker, process it to create or
2470 terminate a loop structure. Add this block to the current loop,
2471 and then walk successor blocks. */
2473 static parallel *
2474 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2476 if (block->flags & BB_VISITED)
2477 return par;
2478 block->flags |= BB_VISITED;
2480 if (rtx_insn **endp = map->get (block))
2482 rtx_insn *end = *endp;
2484 /* This is a block head or tail, or return instruction. */
2485 switch (recog_memoized (end))
2487 case CODE_FOR_return:
2488 /* Return instructions are in their own block, and we
2489 don't need to do anything more. */
2490 return par;
2492 case CODE_FOR_nvptx_forked:
2493 /* Loop head, create a new inner loop and add it into
2494 our parent's child list. */
2496 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2498 gcc_assert (mask);
2499 par = new parallel (par, mask);
2500 par->forked_block = block;
2501 par->forked_insn = end;
2502 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2503 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2504 par->fork_insn
2505 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2507 break;
2509 case CODE_FOR_nvptx_join:
2510 /* A loop tail. Finish the current loop and return to
2511 parent. */
2513 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2515 gcc_assert (par->mask == mask);
2516 par->join_block = block;
2517 par->join_insn = end;
2518 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2519 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2520 par->joining_insn
2521 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2522 par = par->parent;
2524 break;
2526 default:
2527 gcc_unreachable ();
2531 if (par)
2532 /* Add this block onto the current loop's list of blocks. */
2533 par->blocks.safe_push (block);
2534 else
2535 /* This must be the entry block. Create a NULL parallel. */
2536 par = new parallel (0, 0);
2538 /* Walk successor blocks. */
2539 edge e;
2540 edge_iterator ei;
2542 FOR_EACH_EDGE (e, ei, block->succs)
2543 nvptx_find_par (map, par, e->dest);
2545 return par;
2548 /* DFS walk the CFG looking for fork & join markers. Construct
2549 loop structures as we go. MAP is a mapping of basic blocks
2550 to head & tail markers, discovered when splitting blocks. This
2551 speeds up the discovery. We rely on the BB visited flag having
2552 been cleared when splitting blocks. */
2554 static parallel *
2555 nvptx_discover_pars (bb_insn_map_t *map)
2557 basic_block block;
2559 /* Mark exit blocks as visited. */
2560 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2561 block->flags |= BB_VISITED;
2563 /* And entry block as not. */
2564 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2565 block->flags &= ~BB_VISITED;
2567 parallel *par = nvptx_find_par (map, 0, block);
2569 if (dump_file)
2571 fprintf (dump_file, "\nLoops\n");
2572 nvptx_dump_pars (par, 0);
2573 fprintf (dump_file, "\n");
2576 return par;
2579 /* Analyse a group of BBs within a partitioned region and create N
2580 Single-Entry-Single-Exit regions. Some of those regions will be
2581 trivial ones consisting of a single BB. The blocks of a
2582 partitioned region might form a set of disjoint graphs -- because
2583 the region encloses a differently partitioned sub-region.
2585 We use the linear time algorithm described in 'Finding Regions Fast:
2586 Single Entry Single Exit and control Regions in Linear Time'
2587 Johnson, Pearson & Pingali. That algorithm deals with complete
2588 CFGs, where a back edge is inserted from END to START, and thus the
2589 problem becomes one of finding equivalent loops.
2591 In this case we have a partial CFG. We complete it by redirecting
2592 any incoming edge to the graph to be from an arbitrary external BB,
2593 and similarly redirecting any outgoing edge to be to that BB.
2594 Thus we end up with a closed graph.
2596 The algorithm works by building a spanning tree of an undirected
2597 graph and keeping track of back edges from nodes further from the
2598 root in the tree to nodes nearer to the root in the tree. In the
2599 description below, the root is up and the tree grows downwards.
2601 We avoid having to deal with degenerate back-edges to the same
2602 block, by splitting each BB into 3 -- one for input edges, one for
2603 the node itself and one for the output edges. Such back edges are
2604 referred to as 'Brackets'. Cycle equivalent nodes will have the
2605 same set of brackets.
2607 Determining bracket equivalency is done by maintaining a list of
2608 brackets in such a manner that the list length and final bracket
2609 uniquely identify the set.
2611 We use coloring to mark all BBs with cycle equivalency with the
2612 same color. This is the output of the 'Finding Regions Fast'
2613 algorithm. Notice it doesn't actually find the set of nodes within
2614 a particular region, just unordered sets of nodes that are the
2615 entries and exits of SESE regions.
2617 After determining cycle equivalency, we need to find the minimal
2618 set of SESE regions. Do this with a DFS coloring walk of the
2619 complete graph. We're either 'looking' or 'coloring'. When
2620 looking, and we're in the subgraph, we start coloring the color of
2621 the current node, and remember that node as the start of the
2622 current color's SESE region. Every time we go to a new node, we
2623 decrement the count of nodes with that color. If it reaches zero,
2624 we remember that node as the end of the current color's SESE region
2625 and return to 'looking'. Otherwise we color the node the current
2626 color.
2628 This way we end up with coloring the inside of non-trivial SESE
2629 regions with the color of that region. */
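/* Editorial illustration, not part of the original source: a minimal worked
   example of the coloring described above.  For a diamond

	  A
	 / \
	B   C
	 \ /
	  D

   completed with a back edge from D to A, blocks A and D receive the same
   cycle-equivalence color (every cycle through A also passes through D), so
   they become the entry and exit of one non-trivial SESE region, while B and
   C each end up as trivial single-block regions.  */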
2631 /* A pair of BBs. We use this to represent SESE regions. */
2632 typedef std::pair<basic_block, basic_block> bb_pair_t;
2633 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2635 /* A node in the undirected CFG. The discriminator SECOND indicates just
2636 above or just below the BB indicated by FIRST. */
2637 typedef std::pair<basic_block, int> pseudo_node_t;
2639 /* A bracket indicates an edge towards the root of the spanning tree of the
2640 undirected graph. Each bracket has a color, determined
2641 from the current set of brackets. */
2642 struct bracket
2644 pseudo_node_t back; /* Back target */
2646 /* Current color and size of set. */
2647 unsigned color;
2648 unsigned size;
2650 bracket (pseudo_node_t back_)
2651 : back (back_), color (~0u), size (~0u)
2655 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2657 if (length != size)
2659 size = length;
2660 color = color_counts.length ();
2661 color_counts.quick_push (0);
2663 color_counts[color]++;
2664 return color;
2668 typedef auto_vec<bracket> bracket_vec_t;
2670 /* Basic block info for finding SESE regions. */
2672 struct bb_sese
2674 int node; /* Node number in spanning tree. */
2675 int parent; /* Parent node number. */
2677 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2678 edges arrive at pseudo-node Ai and the outgoing edges leave at
2679 pseudo-node Ao. We have to remember which way we arrived at a
2680 particular node when generating the spanning tree. dir > 0 means
2681 we arrived at Ai, dir < 0 means we arrived at Ao. */
2682 int dir;
2684 /* Lowest numbered pseudo-node reached via a backedge from this
2685 node, or any descendant. */
2686 pseudo_node_t high;
2688 int color; /* Cycle-equivalence color */
2690 /* Stack of brackets for this node. */
2691 bracket_vec_t brackets;
2693 bb_sese (unsigned node_, unsigned p, int dir_)
2694 :node (node_), parent (p), dir (dir_)
2697 ~bb_sese ();
2699 /* Push a bracket ending at BACK. */
2700 void push (const pseudo_node_t &back)
2702 if (dump_file)
2703 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2704 back.first ? back.first->index : 0, back.second);
2705 brackets.safe_push (bracket (back));
2708 void append (bb_sese *child);
2709 void remove (const pseudo_node_t &);
2711 /* Set node's color. */
2712 void set_color (auto_vec<unsigned> &color_counts)
2714 color = brackets.last ().get_color (color_counts, brackets.length ());
2718 bb_sese::~bb_sese ()
2722 /* Destructively append CHILD's brackets. */
2724 void
2725 bb_sese::append (bb_sese *child)
2727 if (int len = child->brackets.length ())
2729 int ix;
2731 if (dump_file)
2733 for (ix = 0; ix < len; ix++)
2735 const pseudo_node_t &pseudo = child->brackets[ix].back;
2736 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2737 child->node, pseudo.first ? pseudo.first->index : 0,
2738 pseudo.second);
2741 if (!brackets.length ())
2742 std::swap (brackets, child->brackets);
2743 else
2745 brackets.reserve (len);
2746 for (ix = 0; ix < len; ix++)
2747 brackets.quick_push (child->brackets[ix]);
2752 /* Remove brackets that terminate at PSEUDO. */
2754 void
2755 bb_sese::remove (const pseudo_node_t &pseudo)
2757 unsigned removed = 0;
2758 int len = brackets.length ();
2760 for (int ix = 0; ix < len; ix++)
2762 if (brackets[ix].back == pseudo)
2764 if (dump_file)
2765 fprintf (dump_file, "Removing backedge %d:%+d\n",
2766 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2767 removed++;
2769 else if (removed)
2770 brackets[ix-removed] = brackets[ix];
2772 while (removed--)
2773 brackets.pop ();
2776 /* Accessors for BB's aux pointer. */
2777 #define BB_SET_SESE(B, S) ((B)->aux = (S))
2778 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2780 /* DFS walk creating SESE data structures. Only cover nodes with
2781 BB_VISITED set. Append discovered blocks to LIST. We number in
2782 increments of 3 so that the above and below pseudo nodes can be
2783 implicitly numbered too. */
2785 static int
2786 nvptx_sese_number (int n, int p, int dir, basic_block b,
2787 auto_vec<basic_block> *list)
2789 if (BB_GET_SESE (b))
2790 return n;
2792 if (dump_file)
2793 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2794 b->index, n, p, dir);
2796 BB_SET_SESE (b, new bb_sese (n, p, dir));
2797 p = n;
2799 n += 3;
2800 list->quick_push (b);
2802 /* First walk the nodes on the 'other side' of this node, then walk
2803 the nodes on the same side. */
2804 for (unsigned ix = 2; ix; ix--)
2806 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2807 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2808 : offsetof (edge_def, src));
2809 edge e;
2810 edge_iterator (ei);
2812 FOR_EACH_EDGE (e, ei, edges)
2814 basic_block target = *(basic_block *)((char *)e + offset);
2816 if (target->flags & BB_VISITED)
2817 n = nvptx_sese_number (n, p, dir, target, list);
2819 dir = -dir;
2821 return n;
2824 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2825 EDGES are the outgoing edges and OFFSET is the offset to the src
2826 or dst block on the edges. */
2828 static void
2829 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2830 vec<edge, va_gc> *edges, size_t offset)
2832 edge e;
2833 edge_iterator (ei);
2834 int hi_back = depth;
2835 pseudo_node_t node_back (0, depth);
2836 int hi_child = depth;
2837 pseudo_node_t node_child (0, depth);
2838 basic_block child = NULL;
2839 unsigned num_children = 0;
2840 int usd = -dir * sese->dir;
2842 if (dump_file)
2843 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2844 me->index, sese->node, dir);
2846 if (dir < 0)
2848 /* This is the above pseudo-child. It has the BB itself as an
2849 additional child node. */
2850 node_child = sese->high;
2851 hi_child = node_child.second;
2852 if (node_child.first)
2853 hi_child += BB_GET_SESE (node_child.first)->node;
2854 num_children++;
2857 /* Examine each edge.
2858 - if it is a child (a) append its bracket list and (b) record
2859 whether it is the child with the highest reaching bracket.
2860 - if it is an edge to ancestor, record whether it's the highest
2861 reaching backlink. */
2862 FOR_EACH_EDGE (e, ei, edges)
2864 basic_block target = *(basic_block *)((char *)e + offset);
2866 if (bb_sese *t_sese = BB_GET_SESE (target))
2868 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2870 /* Child node. Append its bracket list. */
2871 num_children++;
2872 sese->append (t_sese);
2874 /* Compare its hi value. */
2875 int t_hi = t_sese->high.second;
2877 if (basic_block child_hi_block = t_sese->high.first)
2878 t_hi += BB_GET_SESE (child_hi_block)->node;
2880 if (hi_child > t_hi)
2882 hi_child = t_hi;
2883 node_child = t_sese->high;
2884 child = target;
2887 else if (t_sese->node < sese->node + dir
2888 && !(dir < 0 && sese->parent == t_sese->node))
2890 /* Non-parental ancestor node -- a backlink. */
2891 int d = usd * t_sese->dir;
2892 int back = t_sese->node + d;
2894 if (hi_back > back)
2896 hi_back = back;
2897 node_back = pseudo_node_t (target, d);
2901 else
2902 { /* Fallen off graph, backlink to entry node. */
2903 hi_back = 0;
2904 node_back = pseudo_node_t (0, 0);
2908 /* Remove any brackets that terminate at this pseudo node. */
2909 sese->remove (pseudo_node_t (me, dir));
2911 /* Now push any backlinks from this pseudo node. */
2912 FOR_EACH_EDGE (e, ei, edges)
2914 basic_block target = *(basic_block *)((char *)e + offset);
2915 if (bb_sese *t_sese = BB_GET_SESE (target))
2917 if (t_sese->node < sese->node + dir
2918 && !(dir < 0 && sese->parent == t_sese->node))
2919 /* Non-parental ancestor node - backedge from me. */
2920 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2922 else
2924 /* back edge to entry node */
2925 sese->push (pseudo_node_t (0, 0));
2929 /* If this node leads directly or indirectly to a no-return region of
2930 the graph, then fake a backedge to entry node. */
2931 if (!sese->brackets.length () || !edges || !edges->length ())
2933 hi_back = 0;
2934 node_back = pseudo_node_t (0, 0);
2935 sese->push (node_back);
2938 /* Record the highest reaching backedge from us or a descendant. */
2939 sese->high = hi_back < hi_child ? node_back : node_child;
2941 if (num_children > 1)
2943 /* There is more than one child -- this is a Y shaped piece of
2944 spanning tree. We have to insert a fake backedge from this
2945 node to the highest ancestor reached by not-the-highest
2946 reaching child. Note that there may be multiple children
2947 with backedges to the same highest node. That's ok and we
2948 insert the edge to that highest node. */
2949 hi_child = depth;
2950 if (dir < 0 && child)
2952 node_child = sese->high;
2953 hi_child = node_child.second;
2954 if (node_child.first)
2955 hi_child += BB_GET_SESE (node_child.first)->node;
2958 FOR_EACH_EDGE (e, ei, edges)
2960 basic_block target = *(basic_block *)((char *)e + offset);
2962 if (target == child)
2963 /* Ignore the highest child. */
2964 continue;
2966 bb_sese *t_sese = BB_GET_SESE (target);
2967 if (!t_sese)
2968 continue;
2969 if (t_sese->parent != sese->node)
2970 /* Not a child. */
2971 continue;
2973 /* Compare its hi value. */
2974 int t_hi = t_sese->high.second;
2976 if (basic_block child_hi_block = t_sese->high.first)
2977 t_hi += BB_GET_SESE (child_hi_block)->node;
2979 if (hi_child > t_hi)
2981 hi_child = t_hi;
2982 node_child = t_sese->high;
2986 sese->push (node_child);
2991 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
2992 proceed to successors. Set SESE entry and exit nodes of
2993 REGIONS. */
2995 static void
2996 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
2997 basic_block block, int coloring)
2999 bb_sese *sese = BB_GET_SESE (block);
3001 if (block->flags & BB_VISITED)
3003 /* If we've already encountered this block, either we must not
3004 be coloring, or it must have been colored the current color. */
3005 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3006 return;
3009 block->flags |= BB_VISITED;
3011 if (sese)
3013 if (coloring < 0)
3015 /* Start coloring a region. */
3016 regions[sese->color].first = block;
3017 coloring = sese->color;
3020 if (!--color_counts[sese->color] && sese->color == coloring)
3022 /* Found final block of SESE region. */
3023 regions[sese->color].second = block;
3024 coloring = -1;
3026 else
3027 /* Color the node, so we can assert on revisiting the node
3028 that the graph is indeed SESE. */
3029 sese->color = coloring;
3031 else
3032 /* Fallen off the subgraph, we cannot be coloring. */
3033 gcc_assert (coloring < 0);
3035 /* Walk each successor block. */
3036 if (block->succs && block->succs->length ())
3038 edge e;
3039 edge_iterator ei;
3041 FOR_EACH_EDGE (e, ei, block->succs)
3042 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3044 else
3045 gcc_assert (coloring < 0);
3048 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3049 end up with NULL entries in it. */
3051 static void
3052 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3054 basic_block block;
3055 int ix;
3057 /* First clear each BB of the whole function. */
3058 FOR_EACH_BB_FN (block, cfun)
3060 block->flags &= ~BB_VISITED;
3061 BB_SET_SESE (block, 0);
3063 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3064 block->flags &= ~BB_VISITED;
3065 BB_SET_SESE (block, 0);
3066 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3067 block->flags &= ~BB_VISITED;
3068 BB_SET_SESE (block, 0);
3070 /* Mark blocks in the function that are in this graph. */
3071 for (ix = 0; blocks.iterate (ix, &block); ix++)
3072 block->flags |= BB_VISITED;
3074 /* Counts of nodes assigned to each color. There cannot be more
3075 colors than blocks (and hopefully there will be fewer). */
3076 auto_vec<unsigned> color_counts;
3077 color_counts.reserve (blocks.length ());
3079 /* Worklist of nodes in the spanning tree. Again, there cannot be
3080 more nodes in the tree than blocks (there will be fewer if the
3081 CFG of blocks is disjoint). */
3082 auto_vec<basic_block> spanlist;
3083 spanlist.reserve (blocks.length ());
3085 /* Make sure every block has its cycle class determined. */
3086 for (ix = 0; blocks.iterate (ix, &block); ix++)
3088 if (BB_GET_SESE (block))
3089 /* We already met this block in an earlier graph solve. */
3090 continue;
3092 if (dump_file)
3093 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3095 /* Number the nodes reachable from block in initial DFS order. */
3096 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3098 /* Now walk in reverse DFS order to find cycle equivalents. */
3099 while (spanlist.length ())
3101 block = spanlist.pop ();
3102 bb_sese *sese = BB_GET_SESE (block);
3104 /* Do the pseudo node below. */
3105 nvptx_sese_pseudo (block, sese, depth, +1,
3106 sese->dir > 0 ? block->succs : block->preds,
3107 (sese->dir > 0 ? offsetof (edge_def, dest)
3108 : offsetof (edge_def, src)));
3109 sese->set_color (color_counts);
3110 /* Do the pseudo node above. */
3111 nvptx_sese_pseudo (block, sese, depth, -1,
3112 sese->dir < 0 ? block->succs : block->preds,
3113 (sese->dir < 0 ? offsetof (edge_def, dest)
3114 : offsetof (edge_def, src)));
3116 if (dump_file)
3117 fprintf (dump_file, "\n");
3120 if (dump_file)
3122 unsigned count;
3123 const char *comma = "";
3125 fprintf (dump_file, "Found %d cycle equivalents\n",
3126 color_counts.length ());
3127 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3129 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3131 comma = "";
3132 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3133 if (BB_GET_SESE (block)->color == ix)
3135 block->flags |= BB_VISITED;
3136 fprintf (dump_file, "%s%d", comma, block->index);
3137 comma=",";
3139 fprintf (dump_file, "}");
3140 comma = ", ";
3142 fprintf (dump_file, "\n");
3145 /* Now we've colored every block in the subgraph. We now need to
3146 determine the minimal set of SESE regions that cover that
3147 subgraph. Do this with a DFS walk of the complete function.
3148 During the walk we're either 'looking' or 'coloring'. When we
3149 reach the last node of a particular color, we stop coloring and
3150 return to looking. */
3152 /* There cannot be more SESE regions than colors. */
3153 regions.reserve (color_counts.length ());
3154 for (ix = color_counts.length (); ix--;)
3155 regions.quick_push (bb_pair_t (0, 0));
3157 for (ix = 0; blocks.iterate (ix, &block); ix++)
3158 block->flags &= ~BB_VISITED;
3160 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3162 if (dump_file)
3164 const char *comma = "";
3165 int len = regions.length ();
3167 fprintf (dump_file, "SESE regions:");
3168 for (ix = 0; ix != len; ix++)
3170 basic_block from = regions[ix].first;
3171 basic_block to = regions[ix].second;
3173 if (from)
3175 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3176 if (to != from)
3177 fprintf (dump_file, "->%d", to->index);
3179 int color = BB_GET_SESE (from)->color;
3181 /* Print the blocks within the region (excluding ends). */
3182 FOR_EACH_BB_FN (block, cfun)
3184 bb_sese *sese = BB_GET_SESE (block);
3186 if (sese && sese->color == color
3187 && block != from && block != to)
3188 fprintf (dump_file, ".%d", block->index);
3190 fprintf (dump_file, "}");
3192 comma = ",";
3194 fprintf (dump_file, "\n\n");
3197 for (ix = 0; blocks.iterate (ix, &block); ix++)
3198 delete BB_GET_SESE (block);
3201 #undef BB_SET_SESE
3202 #undef BB_GET_SESE
3204 /* Propagate live state at the start of a partitioned region. BLOCK
3205 provides the live register information, and might not contain
3206 INSN. Propagation is inserted just after INSN. RW indicates whether
3207 we are reading and/or writing state. This
3208 separation is needed for worker-level propagation where we
3209 essentially do a spill & fill. FN is the underlying worker
3210 function to generate the propagation instructions for a single
3211 register. DATA is user data.
3213 We propagate the live register set and the entire frame. We could
3214 do better by (a) propagating just the live set that is used within
3215 the partitioned regions and (b) only propagating stack entries that
3216 are used. The latter might be quite hard to determine. */
3218 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3220 static void
3221 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3222 propagator_fn fn, void *data)
3224 bitmap live = DF_LIVE_IN (block);
3225 bitmap_iterator iterator;
3226 unsigned ix;
3228 /* Copy the frame array. */
3229 HOST_WIDE_INT fs = get_frame_size ();
3230 if (fs)
3232 rtx tmp = gen_reg_rtx (DImode);
3233 rtx idx = NULL_RTX;
3234 rtx ptr = gen_reg_rtx (Pmode);
3235 rtx pred = NULL_RTX;
3236 rtx_code_label *label = NULL;
3238 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3239 fs /= GET_MODE_SIZE (DImode);
3240 /* Detect single iteration loop. */
3241 if (fs == 1)
3242 fs = 0;
3244 start_sequence ();
3245 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3246 if (fs)
3248 idx = gen_reg_rtx (SImode);
3249 pred = gen_reg_rtx (BImode);
3250 label = gen_label_rtx ();
3252 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3253 /* Allow worker function to initialize anything needed. */
3254 rtx init = fn (tmp, PM_loop_begin, fs, data);
3255 if (init)
3256 emit_insn (init);
3257 emit_label (label);
3258 LABEL_NUSES (label)++;
3259 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3261 if (rw & PM_read)
3262 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3263 emit_insn (fn (tmp, rw, fs, data));
3264 if (rw & PM_write)
3265 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3266 if (fs)
3268 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3269 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3270 emit_insn (gen_br_true_uni (pred, label));
3271 rtx fini = fn (tmp, PM_loop_end, fs, data);
3272 if (fini)
3273 emit_insn (fini);
3274 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3276 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3277 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3278 rtx cpy = get_insns ();
3279 end_sequence ();
3280 insn = emit_insn_after (cpy, insn);
3283 /* Copy live registers. */
3284 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3286 rtx reg = regno_reg_rtx[ix];
3288 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3290 rtx bcast = fn (reg, rw, 0, data);
3292 insn = emit_insn_after (bcast, insn);
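/* Editorial illustration, not part of the original source: the shape of the
   frame-copy sequence emitted above, written as pseudo-code for a frame of
   FS DImode slots (FS > 1):

	ptr = frame_pointer;
	idx = FS;
     loop:
	idx -= 1;
	tmp = *ptr;			// only if RW includes PM_read
	emit FN (tmp, RW, FS, DATA);
	*ptr = tmp;			// only if RW includes PM_write
	pred = (idx != 0);
	ptr += 8;
	if (pred) goto loop;		// uniform branch

   followed by one FN-generated propagation insn per live pseudo register.  */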
3297 /* Worker for nvptx_vpropagate. */
3299 static rtx
3300 vprop_gen (rtx reg, propagate_mask pm,
3301 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3303 if (!(pm & PM_read_write))
3304 return 0;
3306 return nvptx_gen_vcast (reg);
3309 /* Propagate state that is live at start of BLOCK across the vectors
3310 of a single warp. Propagation is inserted just after INSN. */
3312 static void
3313 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3315 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3318 /* Worker for nvptx_wpropagate. */
3320 static rtx
3321 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3323 wcast_data_t *data = (wcast_data_t *)data_;
3325 if (pm & PM_loop_begin)
3327 /* Starting a loop, initialize pointer. */
3328 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3330 if (align > worker_bcast_align)
3331 worker_bcast_align = align;
3332 data->offset = (data->offset + align - 1) & ~(align - 1);
3334 data->ptr = gen_reg_rtx (Pmode);
3336 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3338 else if (pm & PM_loop_end)
3340 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3341 data->ptr = NULL_RTX;
3342 return clobber;
3344 else
3345 return nvptx_gen_wcast (reg, pm, rep, data);
3348 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3349 indicates if this is just before partitioned mode (do spill), or
3350 just after it starts (do fill). Sequence is inserted just after
3351 INSN. */
3353 static void
3354 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3356 wcast_data_t data;
3358 data.base = gen_reg_rtx (Pmode);
3359 data.offset = 0;
3360 data.ptr = NULL_RTX;
3362 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3363 if (data.offset)
3365 /* Stuff was emitted, initialize the base pointer now. */
3366 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3367 emit_insn_after (init, insn);
3369 if (worker_bcast_size < data.offset)
3370 worker_bcast_size = data.offset;
3374 /* Emit a worker-level synchronization barrier. We use different
3375 markers for before and after synchronizations. */
3377 static rtx
3378 nvptx_wsync (bool after)
3380 return gen_nvptx_barsync (GEN_INT (after));
3383 /* Single neutering according to MASK. FROM is the incoming block and
3384 TO is the outgoing block. These may be the same block. Insert at
3385 start of FROM:
3387 if (tid.<axis>) goto end.
3389 and insert before ending branch of TO (if there is such an insn):
3391 end:
3392 <possibly-broadcast-cond>
3393 <branch>
3395 We currently only use different FROM and TO when skipping an entire
3396 loop. We could do more if we detected superblocks. */
3398 static void
3399 nvptx_single (unsigned mask, basic_block from, basic_block to)
3401 rtx_insn *head = BB_HEAD (from);
3402 rtx_insn *tail = BB_END (to);
3403 unsigned skip_mask = mask;
3405 /* Find first insn of from block. */
3406 while (head != BB_END (from) && !INSN_P (head))
3407 head = NEXT_INSN (head);
3409 /* Find last insn of to block. */
3410 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3411 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3412 tail = PREV_INSN (tail);
3414 /* Detect if tail is a branch. */
3415 rtx tail_branch = NULL_RTX;
3416 rtx cond_branch = NULL_RTX;
3417 if (tail && INSN_P (tail))
3419 tail_branch = PATTERN (tail);
3420 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3421 tail_branch = NULL_RTX;
3422 else
3424 cond_branch = SET_SRC (tail_branch);
3425 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3426 cond_branch = NULL_RTX;
3430 if (tail == head)
3432 /* If this is empty, do nothing. */
3433 if (!head || !INSN_P (head))
3434 return;
3436 /* If this is a dummy insn, do nothing. */
3437 switch (recog_memoized (head))
3439 default:
3440 break;
3441 case CODE_FOR_nvptx_fork:
3442 case CODE_FOR_nvptx_forked:
3443 case CODE_FOR_nvptx_joining:
3444 case CODE_FOR_nvptx_join:
3445 return;
3448 if (cond_branch)
3450 /* If we're only doing vector single, there's no need to
3451 emit skip code because we'll not insert anything. */
3452 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3453 skip_mask = 0;
3455 else if (tail_branch)
3456 /* Block with only unconditional branch. Nothing to do. */
3457 return;
3460 /* Insert the vector test inside the worker test. */
3461 unsigned mode;
3462 rtx_insn *before = tail;
3463 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3464 if (GOMP_DIM_MASK (mode) & skip_mask)
3466 rtx_code_label *label = gen_label_rtx ();
3467 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3469 if (!pred)
3471 pred = gen_reg_rtx (BImode);
3472 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3475 rtx br;
3476 if (mode == GOMP_DIM_VECTOR)
3477 br = gen_br_true (pred, label);
3478 else
3479 br = gen_br_true_uni (pred, label);
3480 emit_insn_before (br, head);
3482 LABEL_NUSES (label)++;
3483 if (tail_branch)
3484 before = emit_label_before (label, before);
3485 else
3486 emit_label_after (label, tail);
3489 /* Now deal with propagating the branch condition. */
3490 if (cond_branch)
3492 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3494 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3496 /* Vector mode only, do a shuffle. */
3497 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3499 else
3501 /* Includes worker mode, do spill & fill. By construction
3502 we should never have worker mode only. */
3503 wcast_data_t data;
3505 data.base = worker_bcast_sym;
3506 data.ptr = 0;
3508 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3509 worker_bcast_size = GET_MODE_SIZE (SImode);
3511 data.offset = 0;
3512 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3513 before);
3514 /* Barrier so other workers can see the write. */
3515 emit_insn_before (nvptx_wsync (false), tail);
3516 data.offset = 0;
3517 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3518 /* This barrier is needed to avoid worker zero clobbering
3519 the broadcast buffer before all the other workers have
3520 had a chance to read this instance of it. */
3521 emit_insn_before (nvptx_wsync (true), tail);
3524 extract_insn (tail);
3525 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3526 UNSPEC_BR_UNIFIED);
3527 validate_change (tail, recog_data.operand_loc[0], unsp, false);
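/* Editorial illustration, not part of the original source: the rough PTX
   shape produced for a block neutered for both the worker and vector axes
   whose tail is a conditional branch.  Predicate and label names are
   hypothetical, the code actually emits one label per axis (collapsed here),
   and the condition-broadcast code is elided.

	@ %pw	bra.uni	$Lskip;		// all but worker 0 skip the body
	@ %pv	bra	$Lskip;		// all but vector lane 0 skip the body
	... single-threaded body ...
     $Lskip:
	<broadcast of the branch condition, then the original branch>
*/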
3531 /* PAR is a parallel that is being skipped in its entirety according to
3532 MASK. Treat this as skipping a superblock starting at forked
3533 and ending at joining. */
3535 static void
3536 nvptx_skip_par (unsigned mask, parallel *par)
3538 basic_block tail = par->join_block;
3539 gcc_assert (tail->preds->length () == 1);
3541 basic_block pre_tail = (*tail->preds)[0]->src;
3542 gcc_assert (pre_tail->succs->length () == 1);
3544 nvptx_single (mask, par->forked_block, pre_tail);
3547 /* If PAR has a single inner parallel and PAR itself only contains
3548 empty entry and exit blocks, swallow the inner PAR. */
3550 static void
3551 nvptx_optimize_inner (parallel *par)
3553 parallel *inner = par->inner;
3555 /* We mustn't be the outer dummy par. */
3556 if (!par->mask)
3557 return;
3559 /* We must have a single inner par. */
3560 if (!inner || inner->next)
3561 return;
3563 /* We must only contain 2 blocks ourselves -- the head and tail of
3564 the inner par. */
3565 if (par->blocks.length () != 2)
3566 return;
3568 /* We must be disjoint partitioning. As we only have vector and
3569 worker partitioning, this is sufficient to guarantee the pars
3570 have adjacent partitioning. */
3571 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3572 /* This indicates malformed code generation. */
3573 return;
3575 /* The outer forked insn should be immediately followed by the inner
3576 fork insn. */
3577 rtx_insn *forked = par->forked_insn;
3578 rtx_insn *fork = BB_END (par->forked_block);
3580 if (NEXT_INSN (forked) != fork)
3581 return;
3582 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3584 /* The outer joining insn must immediately follow the inner join
3585 insn. */
3586 rtx_insn *joining = par->joining_insn;
3587 rtx_insn *join = inner->join_insn;
3588 if (NEXT_INSN (join) != joining)
3589 return;
3591 /* Preconditions met. Swallow the inner par. */
3592 if (dump_file)
3593 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3594 inner->mask, inner->forked_block->index,
3595 inner->join_block->index,
3596 par->mask, par->forked_block->index, par->join_block->index);
3598 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3600 par->blocks.reserve (inner->blocks.length ());
3601 while (inner->blocks.length ())
3602 par->blocks.quick_push (inner->blocks.pop ());
3604 par->inner = inner->inner;
3605 inner->inner = NULL;
3607 delete inner;
3610 /* Process the parallel PAR and all its contained
3611 parallels. We do everything but the neutering. Return mask of
3612 partitioned modes used within this parallel. */
3614 static unsigned
3615 nvptx_process_pars (parallel *par)
3617 if (nvptx_optimize)
3618 nvptx_optimize_inner (par);
3620 unsigned inner_mask = par->mask;
3622 /* Do the inner parallels first. */
3623 if (par->inner)
3625 par->inner_mask = nvptx_process_pars (par->inner);
3626 inner_mask |= par->inner_mask;
3629 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3630 /* No propagation needed for a call. */;
3631 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3633 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3634 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3635 /* Insert begin and end synchronizations. */
3636 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3637 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3639 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3640 nvptx_vpropagate (par->forked_block, par->forked_insn);
3642 /* Now do siblings. */
3643 if (par->next)
3644 inner_mask |= nvptx_process_pars (par->next);
3645 return inner_mask;
3648 /* Neuter the parallel described by PAR. We recurse in depth-first
3649 order. MODES are the partitioning of the execution and OUTER is
3650 the partitioning of the parallels we are contained in. */
3652 static void
3653 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3655 unsigned me = (par->mask
3656 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3657 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3658 unsigned skip_mask = 0, neuter_mask = 0;
3660 if (par->inner)
3661 nvptx_neuter_pars (par->inner, modes, outer | me);
3663 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3665 if ((outer | me) & GOMP_DIM_MASK (mode))
3666 {} /* Mode is partitioned: no neutering. */
3667 else if (!(modes & GOMP_DIM_MASK (mode)))
3668 {} /* Mode is not used: nothing to do. */
3669 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3670 || !par->forked_insn)
3671 /* Partitioned in inner parallels, or we're not partitioned
3672 at all: neuter individual blocks. */
3673 neuter_mask |= GOMP_DIM_MASK (mode);
3674 else if (!par->parent || !par->parent->forked_insn
3675 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3676 /* Parent isn't a parallel or contains this partitioning: skip
3677 parallel at this level. */
3678 skip_mask |= GOMP_DIM_MASK (mode);
3679 else
3680 {} /* Parent will skip this parallel itself. */
3683 if (neuter_mask)
3685 int ix, len;
3687 if (nvptx_optimize)
3689 /* Neuter whole SESE regions. */
3690 bb_pair_vec_t regions;
3692 nvptx_find_sese (par->blocks, regions);
3693 len = regions.length ();
3694 for (ix = 0; ix != len; ix++)
3696 basic_block from = regions[ix].first;
3697 basic_block to = regions[ix].second;
3699 if (from)
3700 nvptx_single (neuter_mask, from, to);
3701 else
3702 gcc_assert (!to);
3705 else
3707 /* Neuter each BB individually. */
3708 len = par->blocks.length ();
3709 for (ix = 0; ix != len; ix++)
3711 basic_block block = par->blocks[ix];
3713 nvptx_single (neuter_mask, block, block);
3718 if (skip_mask)
3719 nvptx_skip_par (skip_mask, par);
3721 if (par->next)
3722 nvptx_neuter_pars (par->next, modes, outer);
3725 /* PTX-specific reorganization
3726 - Split blocks at fork and join instructions
3727 - Compute live registers
3728 - Mark now-unused registers, so function begin doesn't declare
3729 unused registers.
3730 - Insert state propagation when entering partitioned mode
3731 - Insert neutering instructions when in single mode
3732 - Replace subregs with suitable sequences.
3735 static void
3736 nvptx_reorg (void)
3738 /* We are freeing block_for_insn in the toplev to keep compatibility
3739 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3740 compute_bb_for_insn ();
3742 thread_prologue_and_epilogue_insns ();
3744 /* Split blocks and record interesting unspecs. */
3745 bb_insn_map_t bb_insn_map;
3747 nvptx_split_blocks (&bb_insn_map);
3749 /* Compute live regs */
3750 df_clear_flags (DF_LR_RUN_DCE);
3751 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3752 df_live_add_problem ();
3753 df_live_set_all_dirty ();
3754 df_analyze ();
3755 regstat_init_n_sets_and_refs ();
3757 if (dump_file)
3758 df_dump (dump_file);
3760 /* Mark unused regs as unused. */
3761 int max_regs = max_reg_num ();
3762 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3763 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3764 regno_reg_rtx[i] = const0_rtx;
3766 /* Determine launch dimensions of the function. If it is not an
3767 offloaded function (i.e. this is a regular compiler), the
3768 function has no neutering. */
3769 tree attr = get_oacc_fn_attrib (current_function_decl);
3770 if (attr)
3772 /* If we determined this mask before RTL expansion, we could
3773 elide emission of some levels of forks and joins. */
3774 unsigned mask = 0;
3775 tree dims = TREE_VALUE (attr);
3776 unsigned ix;
3778 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3780 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3781 tree allowed = TREE_PURPOSE (dims);
3783 if (size != 1 && !(allowed && integer_zerop (allowed)))
3784 mask |= GOMP_DIM_MASK (ix);
3786 /* If there is worker neutering, there must be vector
3787 neutering. Otherwise the hardware will fail. */
3788 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3789 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3791 /* Discover & process partitioned regions. */
3792 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3793 nvptx_process_pars (pars);
3794 nvptx_neuter_pars (pars, mask, 0);
3795 delete pars;
3798 /* Replace subregs. */
3799 nvptx_reorg_subreg ();
3801 regstat_free_n_sets_and_refs ();
3803 df_finish_pass (true);
3806 /* Handle a "kernel" attribute; arguments as in
3807 struct attribute_spec.handler. */
3809 static tree
3810 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
3811 int ARG_UNUSED (flags), bool *no_add_attrs)
3813 tree decl = *node;
3815 if (TREE_CODE (decl) != FUNCTION_DECL)
3817 error ("%qE attribute only applies to functions", name);
3818 *no_add_attrs = true;
3820 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
3822 error ("%qE attribute requires a void return type", name);
3823 *no_add_attrs = true;
3826 return NULL_TREE;
3829 /* Table of valid machine attributes. */
3830 static const struct attribute_spec nvptx_attribute_table[] =
3832 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
3833 affects_type_identity } */
3834 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
3835 { NULL, 0, 0, false, false, false, NULL, false }
3838 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
3840 static HOST_WIDE_INT
3841 nvptx_vector_alignment (const_tree type)
3843 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
3845 return MIN (align, BIGGEST_ALIGNMENT);
3848 /* Indicate that INSN cannot be duplicated. */
3850 static bool
3851 nvptx_cannot_copy_insn_p (rtx_insn *insn)
3853 switch (recog_memoized (insn))
3855 case CODE_FOR_nvptx_shufflesi:
3856 case CODE_FOR_nvptx_shufflesf:
3857 case CODE_FOR_nvptx_barsync:
3858 case CODE_FOR_nvptx_fork:
3859 case CODE_FOR_nvptx_forked:
3860 case CODE_FOR_nvptx_joining:
3861 case CODE_FOR_nvptx_join:
3862 return true;
3863 default:
3864 return false;
3868 /* Section anchors do not work. Initialization for flag_section_anchor
3869 probes the existence of the anchoring target hooks and prevents
3870 anchoring if they don't exist. However, we may be being used with
3871 a host-side compiler that does support anchoring, and hence see
3872 the anchor flag set (as it's not recalculated). So provide an
3873 implementation denying anchoring. */
3875 static bool
3876 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
3878 return false;
3881 /* Record a symbol for mkoffload to enter into the mapping table. */
3883 static void
3884 nvptx_record_offload_symbol (tree decl)
3886 switch (TREE_CODE (decl))
3888 case VAR_DECL:
3889 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
3890 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3891 break;
3893 case FUNCTION_DECL:
3895 tree attr = get_oacc_fn_attrib (decl);
3896 tree dims = TREE_VALUE (attr);
3897 unsigned ix;
3899 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
3900 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3902 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3904 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3906 gcc_assert (!TREE_PURPOSE (dims));
3907 fprintf (asm_out_file, ", %#x", size);
3910 fprintf (asm_out_file, "\n");
3912 break;
3914 default:
3915 gcc_unreachable ();
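/* Editorial illustration, not part of the original source: example lines
   written above for mkoffload to pick up (symbol names hypothetical):

	//:VAR_MAP "some_global"
	//:FUNC_MAP "foo$_omp_fn$0", 0x20, 0x20, 0x20

   The hex values are the gang, worker and vector launch dimensions taken
   from the function's OpenACC dimension attribute.  */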
3919 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
3920 at the start of a file. */
3922 static void
3923 nvptx_file_start (void)
3925 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
3926 fputs ("\t.version\t3.1\n", asm_out_file);
3927 fputs ("\t.target\tsm_30\n", asm_out_file);
3928 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
3929 fputs ("// END PREAMBLE\n", asm_out_file);
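/* Editorial illustration, not part of the original source: for a 64-bit
   target the preamble written above comes out as

	// BEGIN PREAMBLE
		.version	3.1
		.target	sm_30
		.address_size 64
	// END PREAMBLE
*/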
3932 /* Emit a declaration for a worker-level buffer in .shared memory. */
3934 static void
3935 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
3937 const char *name = XSTR (sym, 0);
3939 write_var_marker (file, true, false, name);
3940 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
3941 align, name, size);
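/* Editorial illustration, not part of the original source: for an 8-byte
   buffer aligned to 8 the fprintf above emits (symbol name hypothetical)

	.shared .align 8 .u8 __worker_bcast[8];

   in addition to the marker written by write_var_marker.  */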
3944 /* Write out the function declarations we've collected and declare storage
3945 for the broadcast buffer. */
3947 static void
3948 nvptx_file_end (void)
3950 hash_table<tree_hasher>::iterator iter;
3951 tree decl;
3952 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
3953 nvptx_record_fndecl (decl);
3954 fputs (func_decls.str().c_str(), asm_out_file);
3956 if (worker_bcast_size)
3957 write_worker_buffer (asm_out_file, worker_bcast_sym,
3958 worker_bcast_align, worker_bcast_size);
3960 if (worker_red_size)
3961 write_worker_buffer (asm_out_file, worker_red_sym,
3962 worker_red_align, worker_red_size);
3965 /* Expander for the shuffle builtins. */
3967 static rtx
3968 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
3970 if (ignore)
3971 return target;
3973 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
3974 NULL_RTX, mode, EXPAND_NORMAL);
3975 if (!REG_P (src))
3976 src = copy_to_mode_reg (mode, src);
3978 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
3979 NULL_RTX, SImode, EXPAND_NORMAL);
3980 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
3981 NULL_RTX, SImode, EXPAND_NORMAL);
3983 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
3984 idx = copy_to_mode_reg (SImode, idx);
3986 rtx pat = nvptx_gen_shuffle (target, src, idx,
3987 (nvptx_shuffle_kind) INTVAL (op));
3988 if (pat)
3989 emit_insn (pat);
3991 return target;
3994 /* Worker reduction address expander. */
3996 static rtx
3997 nvptx_expand_worker_addr (tree exp, rtx target,
3998 machine_mode ARG_UNUSED (mode), int ignore)
4000 if (ignore)
4001 return target;
4003 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4004 if (align > worker_red_align)
4005 worker_red_align = align;
4007 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4008 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4009 if (size + offset > worker_red_size)
4010 worker_red_size = size + offset;
4012 rtx addr = worker_red_sym;
4013 if (offset)
4015 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4016 addr = gen_rtx_CONST (Pmode, addr);
4019 emit_move_insn (target, addr);
4021 return target;
4024 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4025 not require taking the address of any object, other than the memory
4026 cell being operated on. */
4028 static rtx
4029 nvptx_expand_cmp_swap (tree exp, rtx target,
4030 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4032 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4034 if (!target)
4035 target = gen_reg_rtx (mode);
4037 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4038 NULL_RTX, Pmode, EXPAND_NORMAL);
4039 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4040 NULL_RTX, mode, EXPAND_NORMAL);
4041 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4042 NULL_RTX, mode, EXPAND_NORMAL);
4043 rtx pat;
4045 mem = gen_rtx_MEM (mode, mem);
4046 if (!REG_P (cmp))
4047 cmp = copy_to_mode_reg (mode, cmp);
4048 if (!REG_P (src))
4049 src = copy_to_mode_reg (mode, src);
4051 if (mode == SImode)
4052 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4053 else
4054 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4056 emit_insn (pat);
4058 return target;
4062 /* Codes for all the NVPTX builtins. */
4063 enum nvptx_builtins
4065 NVPTX_BUILTIN_SHUFFLE,
4066 NVPTX_BUILTIN_SHUFFLELL,
4067 NVPTX_BUILTIN_WORKER_ADDR,
4068 NVPTX_BUILTIN_CMP_SWAP,
4069 NVPTX_BUILTIN_CMP_SWAPLL,
4070 NVPTX_BUILTIN_MAX
4073 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4075 /* Return the NVPTX builtin for CODE. */
4077 static tree
4078 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4080 if (code >= NVPTX_BUILTIN_MAX)
4081 return error_mark_node;
4083 return nvptx_builtin_decls[code];
4086 /* Set up all builtin functions for this target. */
4088 static void
4089 nvptx_init_builtins (void)
4091 #define DEF(ID, NAME, T) \
4092 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4093 = add_builtin_function ("__builtin_nvptx_" NAME, \
4094 build_function_type_list T, \
4095 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4096 #define ST sizetype
4097 #define UINT unsigned_type_node
4098 #define LLUINT long_long_unsigned_type_node
4099 #define PTRVOID ptr_type_node
4101 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4102 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4103 DEF (WORKER_ADDR, "worker_addr",
4104 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4105 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4106 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4108 #undef DEF
4109 #undef ST
4110 #undef UINT
4111 #undef LLUINT
4112 #undef PTRVOID
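/* Editorial illustration, not part of the original source: how offloaded
   code might use the builtins declared above.  The value 1 passed as the
   shuffle kind corresponds to a "down" shuffle in nvptx_shuffle_kind, and
   the variable names are hypothetical.

	unsigned v = per_lane_value;
	for (unsigned i = 16; i > 0; i >>= 1)	// warp-wide sum of V
	  v += __builtin_nvptx_shuffle (v, i, 1);

	unsigned old = __builtin_nvptx_cmp_swap (&cell, expected, desired);
*/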
4115 /* Expand an expression EXP that calls a built-in function,
4116 with result going to TARGET if that's convenient
4117 (and in mode MODE if that's convenient).
4118 SUBTARGET may be used as the target for computing one of EXP's operands.
4119 IGNORE is nonzero if the value is to be ignored. */
4121 static rtx
4122 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4123 machine_mode mode, int ignore)
4125 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4126 switch (DECL_FUNCTION_CODE (fndecl))
4128 case NVPTX_BUILTIN_SHUFFLE:
4129 case NVPTX_BUILTIN_SHUFFLELL:
4130 return nvptx_expand_shuffle (exp, target, mode, ignore);
4132 case NVPTX_BUILTIN_WORKER_ADDR:
4133 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4135 case NVPTX_BUILTIN_CMP_SWAP:
4136 case NVPTX_BUILTIN_CMP_SWAPLL:
4137 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4139 default: gcc_unreachable ();
4143 /* Define dimension sizes for known hardware. */
4144 #define PTX_VECTOR_LENGTH 32
4145 #define PTX_WORKER_LENGTH 32
4146 #define PTX_GANG_DEFAULT 32
4148 /* Validate compute dimensions of an OpenACC offload or routine, fill
4149 in non-unity defaults. FN_LEVEL indicates the level at which a
4150 routine might spawn a loop. It is negative for non-routines. If
4151 DECL is null, we are validating the default dimensions. */
4153 static bool
4154 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4156 bool changed = false;
4158 /* The vector size must be 32, unless this is a SEQ routine. */
4159 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4160 && dims[GOMP_DIM_VECTOR] >= 0
4161 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4163 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4164 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4165 dims[GOMP_DIM_VECTOR]
4166 ? "using vector_length (%d), ignoring %d"
4167 : "using vector_length (%d), ignoring runtime setting",
4168 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4169 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4170 changed = true;
4173 /* Check that the number of workers is not too large. */
4174 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4176 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4177 "using num_workers (%d), ignoring %d",
4178 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4179 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4180 changed = true;
4183 if (!decl)
4185 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4186 if (dims[GOMP_DIM_WORKER] < 0)
4187 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4188 if (dims[GOMP_DIM_GANG] < 0)
4189 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4190 changed = true;
4193 return changed;
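/* For example, an offloaded region compiled with vector_length (128) and
   num_workers (64) is diagnosed and clamped to vector_length (32) and
   num_workers (32).  When validating the default dimensions (DECL is
   null), any dimension still unset after the checks above ends up as 32
   as well.  */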
4196 /* Return maximum dimension size, or zero for unbounded. */
4198 static int
4199 nvptx_dim_limit (int axis)
4201 switch (axis)
4203 case GOMP_DIM_WORKER:
4204 return PTX_WORKER_LENGTH;
4206 case GOMP_DIM_VECTOR:
4207 return PTX_VECTOR_LENGTH;
4209 default:
4210 break;
4212 return 0;
4215 /* Determine whether fork & joins are needed. */
4217 static bool
4218 nvptx_goacc_fork_join (gcall *call, const int dims[],
4219 bool ARG_UNUSED (is_fork))
4221 tree arg = gimple_call_arg (call, 2);
4222 unsigned axis = TREE_INT_CST_LOW (arg);
4224 /* We only care about worker and vector partitioning. */
4225 if (axis < GOMP_DIM_WORKER)
4226 return false;
4228 /* If the size is 1, there's no partitioning. */
4229 if (dims[axis] == 1)
4230 return false;
4232 return true;
4235 /* Generate a PTX builtin function call that returns the address in
4236 the worker reduction buffer at OFFSET. TYPE is the type of the
4237 data at that location. */
4239 static tree
4240 nvptx_get_worker_red_addr (tree type, tree offset)
4242 machine_mode mode = TYPE_MODE (type);
4243 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4244 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4245 tree align = build_int_cst (unsigned_type_node,
4246 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4247 tree call = build_call_expr (fndecl, 3, offset, size, align);
4249 return fold_convert (build_pointer_type (type), call);
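/* For a 'double' reduction entry at buffer offset OFF, the call built
   above is therefore roughly equivalent to the source expression

     (double *) __builtin_nvptx_worker_addr (OFF, sizeof (double),
					     __alignof__ (double));

   assuming GET_MODE_SIZE and GET_MODE_ALIGNMENT agree with sizeof and
   __alignof__ for the type in question.  */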
4252 /* Emit a SHFL.DOWN using shift amount SHIFT of VAR into DEST_VAR. This function
4253 will cast the variable if necessary. */
4255 static void
4256 nvptx_generate_vector_shuffle (location_t loc,
4257 tree dest_var, tree var, unsigned shift,
4258 gimple_seq *seq)
4260 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4261 tree_code code = NOP_EXPR;
4262 tree arg_type = unsigned_type_node;
4263 tree var_type = TREE_TYPE (var);
4264 tree dest_type = var_type;
4266 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4267 var_type = TREE_TYPE (var_type);
4269 if (TREE_CODE (var_type) == REAL_TYPE)
4270 code = VIEW_CONVERT_EXPR;
4272 if (TYPE_SIZE (var_type)
4273 == TYPE_SIZE (long_long_unsigned_type_node))
4275 fn = NVPTX_BUILTIN_SHUFFLELL;
4276 arg_type = long_long_unsigned_type_node;
4279 tree call = nvptx_builtin_decl (fn, true);
4280 tree bits = build_int_cst (unsigned_type_node, shift);
4281 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4282 tree expr;
4284 if (var_type != dest_type)
4286 /* Do real and imaginary parts separately. */
4287 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4288 real = fold_build1 (code, arg_type, real);
4289 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4290 real = fold_build1 (code, var_type, real);
4292 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4293 imag = fold_build1 (code, arg_type, imag);
4294 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4295 imag = fold_build1 (code, var_type, imag);
4297 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4299 else
4301 expr = fold_build1 (code, arg_type, var);
4302 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4303 expr = fold_build1 (code, dest_type, expr);
4306 gimplify_assign (dest_var, expr, seq);
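/* As an illustration, for a 'float' VAR and a SHIFT of 16 the sequence
   gimplified above amounts roughly to:

     tmp1 = VIEW_CONVERT_EXPR<unsigned int> (VAR);
     tmp2 = __builtin_nvptx_shuffle (tmp1, 16, SHUFFLE_DOWN);
     DEST_VAR = VIEW_CONVERT_EXPR<float> (tmp2);

   Complex values are handled by shuffling the real and imaginary parts
   separately and recombining them with a COMPLEX_EXPR; 64-bit payloads
   use the shufflell builtin instead.  */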
4309 /* Lazily generate the global lock var decl and return its address. */
4311 static tree
4312 nvptx_global_lock_addr ()
4314 tree v = global_lock_var;
4316 if (!v)
4318 tree name = get_identifier ("__reduction_lock");
4319 tree type = build_qualified_type (unsigned_type_node,
4320 TYPE_QUAL_VOLATILE);
4321 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4322 global_lock_var = v;
4323 DECL_ARTIFICIAL (v) = 1;
4324 DECL_EXTERNAL (v) = 1;
4325 TREE_STATIC (v) = 1;
4326 TREE_PUBLIC (v) = 1;
4327 TREE_USED (v) = 1;
4328 mark_addressable (v);
4329 mark_decl_referenced (v);
4332 return build_fold_addr_expr (v);
4335 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4336 GSI. We use a lockless scheme for nearly all cases, which looks
4337 like:
4338 actual = initval(OP);
4339 do {
4340 guess = actual;
4341 write = guess OP myval;
4342 actual = cmp&swap (ptr, guess, write)
4343 } while (actual bit-different-to guess);
4344 return write;
4346 This relies on a cmp&swap instruction, which is available for 32-
4347 and 64-bit types. Larger types must use a locking scheme. */
4349 static tree
4350 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
4351 tree ptr, tree var, tree_code op)
4353 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
4354 tree_code code = NOP_EXPR;
4355 tree arg_type = unsigned_type_node;
4356 tree var_type = TREE_TYPE (var);
4358 if (TREE_CODE (var_type) == COMPLEX_TYPE
4359 || TREE_CODE (var_type) == REAL_TYPE)
4360 code = VIEW_CONVERT_EXPR;
4362 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
4364 arg_type = long_long_unsigned_type_node;
4365 fn = NVPTX_BUILTIN_CMP_SWAPLL;
4368 tree swap_fn = nvptx_builtin_decl (fn, true);
4370 gimple_seq init_seq = NULL;
4371 tree init_var = make_ssa_name (arg_type);
4372 tree init_expr = omp_reduction_init_op (loc, op, var_type);
4373 init_expr = fold_build1 (code, arg_type, init_expr);
4374 gimplify_assign (init_var, init_expr, &init_seq);
4375 gimple *init_end = gimple_seq_last (init_seq);
4377 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
4379 /* Split the block just after the init stmts. */
4380 basic_block pre_bb = gsi_bb (*gsi);
4381 edge pre_edge = split_block (pre_bb, init_end);
4382 basic_block loop_bb = pre_edge->dest;
4383 pre_bb = pre_edge->src;
4384 /* Reset the iterator. */
4385 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4387 tree expect_var = make_ssa_name (arg_type);
4388 tree actual_var = make_ssa_name (arg_type);
4389 tree write_var = make_ssa_name (arg_type);
4391 /* Build and insert the reduction calculation. */
4392 gimple_seq red_seq = NULL;
4393 tree write_expr = fold_build1 (code, var_type, expect_var);
4394 write_expr = fold_build2 (op, var_type, write_expr, var);
4395 write_expr = fold_build1 (code, arg_type, write_expr);
4396 gimplify_assign (write_var, write_expr, &red_seq);
4398 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4400 /* Build & insert the cmp&swap sequence. */
4401 gimple_seq latch_seq = NULL;
4402 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
4403 ptr, expect_var, write_var);
4404 gimplify_assign (actual_var, swap_expr, &latch_seq);
4406 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
4407 NULL_TREE, NULL_TREE);
4408 gimple_seq_add_stmt (&latch_seq, cond);
4410 gimple *latch_end = gimple_seq_last (latch_seq);
4411 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
4413 /* Split the block just after the latch stmts. */
4414 edge post_edge = split_block (loop_bb, latch_end);
4415 basic_block post_bb = post_edge->dest;
4416 loop_bb = post_edge->src;
4417 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4419 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4420 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
4421 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
4422 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
4424 gphi *phi = create_phi_node (expect_var, loop_bb);
4425 add_phi_arg (phi, init_var, pre_edge, loc);
4426 add_phi_arg (phi, actual_var, loop_edge, loc);
4428 loop *loop = alloc_loop ();
4429 loop->header = loop_bb;
4430 loop->latch = loop_bb;
4431 add_loop (loop, loop_bb->loop_father);
4433 return fold_build1 (code, var_type, write_var);
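/* As a concrete sketch, for OP == PLUS_EXPR on a 'float' the loop built
   above behaves roughly like:

     unsigned expect, actual = as_unsigned (0.0f);   /* initval (PLUS)  */
     float write;
     do
       {
	 expect = actual;
	 write = as_float (expect) + var;
	 actual = __builtin_nvptx_cmp_swap (ptr, expect, as_unsigned (write));
       }
     while (actual != expect);
     return write;

   where as_unsigned and as_float stand for the VIEW_CONVERT_EXPRs that
   the code above inserts.  */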
4436 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
4437 GSI. This is necessary for types larger than 64 bits, where there
4438 is no cmp&swap instruction to implement a lockless scheme. We use
4439 a lock variable in global memory.
4441 while (cmp&swap (&lock_var, 0, 1))
4442 continue;
4443 T accum = *ptr;
4444 accum = accum OP var;
4445 *ptr = accum;
4446 cmp&swap (&lock_var, 1, 0);
4447 return accum;
4449 A lock in global memory is necessary to force execution engine
4450 descheduling and avoid resource starvation that can occur if the
4451 lock is in .shared memory. */
4453 static tree
4454 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
4455 tree ptr, tree var, tree_code op)
4457 tree var_type = TREE_TYPE (var);
4458 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
4459 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
4460 tree uns_locked = build_int_cst (unsigned_type_node, 1);
4462 /* Split the block just before the gsi. Insert a gimple nop to make
4463 this easier. */
4464 gimple *nop = gimple_build_nop ();
4465 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
4466 basic_block entry_bb = gsi_bb (*gsi);
4467 edge entry_edge = split_block (entry_bb, nop);
4468 basic_block lock_bb = entry_edge->dest;
4469 /* Reset the iterator. */
4470 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4472 /* Build and insert the locking sequence. */
4473 gimple_seq lock_seq = NULL;
4474 tree lock_var = make_ssa_name (unsigned_type_node);
4475 tree lock_expr = nvptx_global_lock_addr ();
4476 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
4477 uns_unlocked, uns_locked);
4478 gimplify_assign (lock_var, lock_expr, &lock_seq);
4479 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
4480 NULL_TREE, NULL_TREE);
4481 gimple_seq_add_stmt (&lock_seq, cond);
4482 gimple *lock_end = gimple_seq_last (lock_seq);
4483 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
4485 /* Split the block just after the lock sequence. */
4486 edge locked_edge = split_block (lock_bb, lock_end);
4487 basic_block update_bb = locked_edge->dest;
4488 lock_bb = locked_edge->src;
4489 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4491 /* Create the lock loop ... */
4492 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4493 make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
4494 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
4495 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
4497 /* ... and the loop structure. */
4498 loop *lock_loop = alloc_loop ();
4499 lock_loop->header = lock_bb;
4500 lock_loop->latch = lock_bb;
4501 lock_loop->nb_iterations_estimate = 1;
4502 lock_loop->any_estimate = true;
4503 add_loop (lock_loop, entry_bb->loop_father);
4505 /* Build and insert the reduction calculation. */
4506 gimple_seq red_seq = NULL;
4507 tree acc_in = make_ssa_name (var_type);
4508 tree ref_in = build_simple_mem_ref (ptr);
4509 TREE_THIS_VOLATILE (ref_in) = 1;
4510 gimplify_assign (acc_in, ref_in, &red_seq);
4512 tree acc_out = make_ssa_name (var_type);
4513 tree update_expr = fold_build2 (op, var_type, ref_in, var);
4514 gimplify_assign (acc_out, update_expr, &red_seq);
4516 tree ref_out = build_simple_mem_ref (ptr);
4517 TREE_THIS_VOLATILE (ref_out) = 1;
4518 gimplify_assign (ref_out, acc_out, &red_seq);
4520 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4522 /* Build & insert the unlock sequence. */
4523 gimple_seq unlock_seq = NULL;
4524 tree unlock_expr = nvptx_global_lock_addr ();
4525 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
4526 uns_locked, uns_unlocked);
4527 gimplify_and_add (unlock_expr, &unlock_seq);
4528 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
4530 return acc_out;
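/* Expressed with the builtins defined earlier, the protocol built above
   is approximately:

     while (__builtin_nvptx_cmp_swap (&__reduction_lock, 0, 1) != 0)
       continue;				/* Spin until acquired.  */
     T accum = *(volatile T *) ptr;
     accum = accum OP var;
     *(volatile T *) ptr = accum;
     __builtin_nvptx_cmp_swap (&__reduction_lock, 1, 0);  /* Release.  */

   with T and OP standing for the reduction type and operator.  */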
4533 /* Emit a sequence to update a reduction accumulator at *PTR with the
4534 value held in VAR using operator OP. Return the updated value.
4536 TODO: optimize for atomic ops and independent complex ops. */
4538 static tree
4539 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
4540 tree ptr, tree var, tree_code op)
4542 tree type = TREE_TYPE (var);
4543 tree size = TYPE_SIZE (type);
4545 if (size == TYPE_SIZE (unsigned_type_node)
4546 || size == TYPE_SIZE (long_long_unsigned_type_node))
4547 return nvptx_lockless_update (loc, gsi, ptr, var, op);
4548 else
4549 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
4552 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
4554 static void
4555 nvptx_goacc_reduction_setup (gcall *call)
4557 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4558 tree lhs = gimple_call_lhs (call);
4559 tree var = gimple_call_arg (call, 2);
4560 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4561 gimple_seq seq = NULL;
4563 push_gimplify_context (true);
4565 if (level != GOMP_DIM_GANG)
4567 /* Copy the receiver object. */
4568 tree ref_to_res = gimple_call_arg (call, 1);
4570 if (!integer_zerop (ref_to_res))
4571 var = build_simple_mem_ref (ref_to_res);
4574 if (level == GOMP_DIM_WORKER)
4576 /* Store incoming value to worker reduction buffer. */
4577 tree offset = gimple_call_arg (call, 5);
4578 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4579 tree ptr = make_ssa_name (TREE_TYPE (call));
4581 gimplify_assign (ptr, call, &seq);
4582 tree ref = build_simple_mem_ref (ptr);
4583 TREE_THIS_VOLATILE (ref) = 1;
4584 gimplify_assign (ref, var, &seq);
4587 if (lhs)
4588 gimplify_assign (lhs, var, &seq);
4590 pop_gimplify_context (NULL);
4591 gsi_replace_with_seq (&gsi, seq, true);
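/* For the worker level the setup sequence gimplified above is roughly:

     ptr = (T *) __builtin_nvptx_worker_addr (offset, sizeof (T),
					      __alignof__ (T));
     *(volatile T *) ptr = var;
     lhs = var;

   where VAR has first been replaced by *ref_to_res if a receiver object
   was supplied.  */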
4594 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
4596 static void
4597 nvptx_goacc_reduction_init (gcall *call)
4599 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4600 tree lhs = gimple_call_lhs (call);
4601 tree var = gimple_call_arg (call, 2);
4602 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4603 enum tree_code rcode
4604 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4605 tree init = omp_reduction_init_op (gimple_location (call), rcode,
4606 TREE_TYPE (var));
4607 gimple_seq seq = NULL;
4609 push_gimplify_context (true);
4611 if (level == GOMP_DIM_VECTOR)
4613 /* Initialize non-zero vector lanes to INIT_VAL (OP). */
4614 tree tid = make_ssa_name (integer_type_node);
4615 tree dim_vector = gimple_call_arg (call, 3);
4616 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
4617 dim_vector);
4618 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
4619 NULL_TREE, NULL_TREE);
4621 gimple_call_set_lhs (tid_call, tid);
4622 gimple_seq_add_stmt (&seq, tid_call);
4623 gimple_seq_add_stmt (&seq, cond_stmt);
4625 /* Split the block just after the call. */
4626 edge init_edge = split_block (gsi_bb (gsi), call);
4627 basic_block init_bb = init_edge->dest;
4628 basic_block call_bb = init_edge->src;
4630 /* Fixup flags from call_bb to init_bb. */
4631 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
4633 /* Set the initialization stmts. */
4634 gimple_seq init_seq = NULL;
4635 tree init_var = make_ssa_name (TREE_TYPE (var));
4636 gimplify_assign (init_var, init, &init_seq);
4637 gsi = gsi_start_bb (init_bb);
4638 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
4640 /* Split block just after the init stmt. */
4641 gsi_prev (&gsi);
4642 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
4643 basic_block dst_bb = inited_edge->dest;
4645 /* Create false edge from call_bb to dst_bb. */
4646 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
4648 /* Create phi node in dst block. */
4649 gphi *phi = create_phi_node (lhs, dst_bb);
4650 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
4651 add_phi_arg (phi, var, nop_edge, gimple_location (call));
4653 /* Reset dominator of dst bb. */
4654 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
4656 /* Reset the gsi. */
4657 gsi = gsi_for_stmt (call);
4659 else
4661 if (level == GOMP_DIM_GANG)
4663 /* If there's no receiver object, propagate the incoming VAR. */
4664 tree ref_to_res = gimple_call_arg (call, 1);
4665 if (integer_zerop (ref_to_res))
4666 init = var;
4669 gimplify_assign (lhs, init, &seq);
4672 pop_gimplify_context (NULL);
4673 gsi_replace_with_seq (&gsi, seq, true);
4676 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
4678 static void
4679 nvptx_goacc_reduction_fini (gcall *call)
4681 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4682 tree lhs = gimple_call_lhs (call);
4683 tree ref_to_res = gimple_call_arg (call, 1);
4684 tree var = gimple_call_arg (call, 2);
4685 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4686 enum tree_code op
4687 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4688 gimple_seq seq = NULL;
4689 tree r = NULL_TREE;
4691 push_gimplify_context (true);
4693 if (level == GOMP_DIM_VECTOR)
4695 /* Emit binary shuffle tree. TODO: emit this as an actual loop,
4696 but that requires a method of emitting a unified jump at the
4697 gimple level. */
4698 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
4700 tree other_var = make_ssa_name (TREE_TYPE (var));
4701 nvptx_generate_vector_shuffle (gimple_location (call),
4702 other_var, var, shfl, &seq);
4704 r = make_ssa_name (TREE_TYPE (var));
4705 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
4706 var, other_var), &seq);
4707 var = r;
4710 else
4712 tree accum = NULL_TREE;
4714 if (level == GOMP_DIM_WORKER)
4716 /* Get reduction buffer address. */
4717 tree offset = gimple_call_arg (call, 5);
4718 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4719 tree ptr = make_ssa_name (TREE_TYPE (call));
4721 gimplify_assign (ptr, call, &seq);
4722 accum = ptr;
4724 else if (integer_zerop (ref_to_res))
4725 r = var;
4726 else
4727 accum = ref_to_res;
4729 if (accum)
4731 /* UPDATE the accumulator. */
4732 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4733 seq = NULL;
4734 r = nvptx_reduction_update (gimple_location (call), &gsi,
4735 accum, var, op);
4739 if (lhs)
4740 gimplify_assign (lhs, r, &seq);
4741 pop_gimplify_context (NULL);
4743 gsi_replace_with_seq (&gsi, seq, true);
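/* For the vector level, the shuffle loop above therefore performs a
   log2 reduction across the 32 lanes; for OP == PLUS_EXPR it is
   roughly:

     var += shfl_down (var, 16);
     var += shfl_down (var, 8);
     var += shfl_down (var, 4);
     var += shfl_down (var, 2);
     var += shfl_down (var, 1);

   after which lane 0 holds the combined value of all 32 lanes
   (shfl_down stands for nvptx_generate_vector_shuffle above).  */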
4746 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
4748 static void
4749 nvptx_goacc_reduction_teardown (gcall *call)
4751 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4752 tree lhs = gimple_call_lhs (call);
4753 tree var = gimple_call_arg (call, 2);
4754 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4755 gimple_seq seq = NULL;
4757 push_gimplify_context (true);
4758 if (level == GOMP_DIM_WORKER)
4760 /* Read the worker reduction buffer. */
4761 tree offset = gimple_call_arg (call, 5);
4762 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4763 tree ptr = make_ssa_name (TREE_TYPE (call));
4765 gimplify_assign (ptr, call, &seq);
4766 var = build_simple_mem_ref (ptr);
4767 TREE_THIS_VOLATILE (var) = 1;
4770 if (level != GOMP_DIM_GANG)
4772 /* Write to the receiver object. */
4773 tree ref_to_res = gimple_call_arg (call, 1);
4775 if (!integer_zerop (ref_to_res))
4776 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
4779 if (lhs)
4780 gimplify_assign (lhs, var, &seq);
4782 pop_gimplify_context (NULL);
4784 gsi_replace_with_seq (&gsi, seq, true);
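/* Again for the worker level, the teardown sequence above is roughly:

     ptr = (T *) __builtin_nvptx_worker_addr (offset, sizeof (T),
					      __alignof__ (T));
     var = *(volatile T *) ptr;
     if (ref_to_res was supplied)
       *ref_to_res = var;
     lhs = var;

   i.e. the value accumulated in the worker reduction buffer is read back
   and propagated to the receiver object and the result.  */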
4787 /* NVPTX reduction expander. */
4789 static void
4790 nvptx_goacc_reduction (gcall *call)
4792 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
4794 switch (code)
4796 case IFN_GOACC_REDUCTION_SETUP:
4797 nvptx_goacc_reduction_setup (call);
4798 break;
4800 case IFN_GOACC_REDUCTION_INIT:
4801 nvptx_goacc_reduction_init (call);
4802 break;
4804 case IFN_GOACC_REDUCTION_FINI:
4805 nvptx_goacc_reduction_fini (call);
4806 break;
4808 case IFN_GOACC_REDUCTION_TEARDOWN:
4809 nvptx_goacc_reduction_teardown (call);
4810 break;
4812 default:
4813 gcc_unreachable ();
4817 #undef TARGET_OPTION_OVERRIDE
4818 #define TARGET_OPTION_OVERRIDE nvptx_option_override
4820 #undef TARGET_ATTRIBUTE_TABLE
4821 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
4823 #undef TARGET_LEGITIMATE_ADDRESS_P
4824 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
4826 #undef TARGET_PROMOTE_FUNCTION_MODE
4827 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
4829 #undef TARGET_FUNCTION_ARG
4830 #define TARGET_FUNCTION_ARG nvptx_function_arg
4831 #undef TARGET_FUNCTION_INCOMING_ARG
4832 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
4833 #undef TARGET_FUNCTION_ARG_ADVANCE
4834 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
4835 #undef TARGET_PASS_BY_REFERENCE
4836 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
4837 #undef TARGET_FUNCTION_VALUE_REGNO_P
4838 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
4839 #undef TARGET_FUNCTION_VALUE
4840 #define TARGET_FUNCTION_VALUE nvptx_function_value
4841 #undef TARGET_LIBCALL_VALUE
4842 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
4843 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
4844 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
4845 #undef TARGET_GET_DRAP_RTX
4846 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
4847 #undef TARGET_SPLIT_COMPLEX_ARG
4848 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
4849 #undef TARGET_RETURN_IN_MEMORY
4850 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
4851 #undef TARGET_OMIT_STRUCT_RETURN_REG
4852 #define TARGET_OMIT_STRUCT_RETURN_REG true
4853 #undef TARGET_STRICT_ARGUMENT_NAMING
4854 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
4855 #undef TARGET_CALL_ARGS
4856 #define TARGET_CALL_ARGS nvptx_call_args
4857 #undef TARGET_END_CALL_ARGS
4858 #define TARGET_END_CALL_ARGS nvptx_end_call_args
4860 #undef TARGET_ASM_FILE_START
4861 #define TARGET_ASM_FILE_START nvptx_file_start
4862 #undef TARGET_ASM_FILE_END
4863 #define TARGET_ASM_FILE_END nvptx_file_end
4864 #undef TARGET_ASM_GLOBALIZE_LABEL
4865 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
4866 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
4867 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
4868 #undef TARGET_PRINT_OPERAND
4869 #define TARGET_PRINT_OPERAND nvptx_print_operand
4870 #undef TARGET_PRINT_OPERAND_ADDRESS
4871 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
4872 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
4873 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
4874 #undef TARGET_ASM_INTEGER
4875 #define TARGET_ASM_INTEGER nvptx_assemble_integer
4876 #undef TARGET_ASM_DECL_END
4877 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
4878 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
4879 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
4880 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
4881 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
4882 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
4883 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
4885 #undef TARGET_MACHINE_DEPENDENT_REORG
4886 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
4887 #undef TARGET_NO_REGISTER_ALLOCATION
4888 #define TARGET_NO_REGISTER_ALLOCATION true
4890 #undef TARGET_ENCODE_SECTION_INFO
4891 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
4892 #undef TARGET_RECORD_OFFLOAD_SYMBOL
4893 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
4895 #undef TARGET_VECTOR_ALIGNMENT
4896 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
4898 #undef TARGET_CANNOT_COPY_INSN_P
4899 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
4901 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
4902 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
4904 #undef TARGET_INIT_BUILTINS
4905 #define TARGET_INIT_BUILTINS nvptx_init_builtins
4906 #undef TARGET_EXPAND_BUILTIN
4907 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
4908 #undef TARGET_BUILTIN_DECL
4909 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
4911 #undef TARGET_GOACC_VALIDATE_DIMS
4912 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
4914 #undef TARGET_GOACC_DIM_LIMIT
4915 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
4917 #undef TARGET_GOACC_FORK_JOIN
4918 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
4920 #undef TARGET_GOACC_REDUCTION
4921 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
4923 struct gcc_target targetm = TARGET_INITIALIZER;
4925 #include "gt-nvptx.h"