gcc/config/nvptx/nvptx.c
1 /* Target code for NVPTX.
2 Copyright (C) 2014-2015 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include <sstream>
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "expmed.h"
33 #include "optabs.h"
34 #include "regs.h"
35 #include "emit-rtl.h"
36 #include "recog.h"
37 #include "diagnostic.h"
38 #include "alias.h"
39 #include "insn-flags.h"
40 #include "output.h"
41 #include "insn-attr.h"
42 #include "flags.h"
43 #include "dojump.h"
44 #include "explow.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "stmt.h"
48 #include "expr.h"
49 #include "tm-preds.h"
50 #include "tm-constrs.h"
51 #include "langhooks.h"
52 #include "dbxout.h"
53 #include "cfgrtl.h"
54 #include "gimple.h"
55 #include "stor-layout.h"
56 #include "builtins.h"
57 #include "omp-low.h"
58 #include "gomp-constants.h"
59 #include "dumpfile.h"
60 #include "internal-fn.h"
61 #include "gimple-iterator.h"
62 #include "stringpool.h"
63 #include "tree-ssa-operands.h"
64 #include "tree-ssanames.h"
65 #include "gimplify.h"
66 #include "tree-phinodes.h"
67 #include "cfgloop.h"
68 #include "fold-const.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* The kind of shuffle instruction.  */
74 enum nvptx_shuffle_kind
76 SHUFFLE_UP,
77 SHUFFLE_DOWN,
78 SHUFFLE_BFLY,
79 SHUFFLE_IDX,
80 SHUFFLE_MAX
83 /* The various PTX memory areas an object might reside in. */
84 enum nvptx_data_area
86 DATA_AREA_GENERIC,
87 DATA_AREA_GLOBAL,
88 DATA_AREA_SHARED,
89 DATA_AREA_LOCAL,
90 DATA_AREA_CONST,
91 DATA_AREA_PARAM,
92 DATA_AREA_MAX
95 /* We record the data area in the target symbol flags. */
96 #define SYMBOL_DATA_AREA(SYM) \
97 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
98 & 7)
99 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
100 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
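/* A minimal round-trip sketch (hypothetical symbol SYM): after
   SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED), SYMBOL_DATA_AREA (sym)
   yields DATA_AREA_SHARED again.  The area lives in the flag bits
   starting at SYMBOL_FLAG_MACH_DEP_SHIFT, which is why the reader
   masks with 7: DATA_AREA_MAX is 6, so three bits suffice.  */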
102 /* Record the function decls we've written, and the libfuncs and function
103 decls corresponding to them. */
104 static std::stringstream func_decls;
106 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
108 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
109 static bool equal (rtx a, rtx b) { return a == b; }
112 static GTY((cache))
113 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
115 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
117 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
118 static bool equal (tree a, tree b) { return a == b; }
121 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
122 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
124 /* Buffer needed to broadcast across workers. This is used for both
125 worker-neutering and worker broadcasting. It is shared by all
126 functions emitted. The buffer is placed in shared memory. It'd be
127 nice if PTX supported common blocks, because then this could be
128 shared across TUs (taking the largest size). */
129 static unsigned worker_bcast_size;
130 static unsigned worker_bcast_align;
131 static GTY(()) rtx worker_bcast_sym;
133 /* Buffer needed for worker reductions. This has to be distinct from
134 the worker broadcast array, as both may be live concurrently. */
135 static unsigned worker_red_size;
136 static unsigned worker_red_align;
137 static GTY(()) rtx worker_red_sym;
139 /* Global lock variable, needed for 128bit worker & gang reductions. */
140 static GTY(()) tree global_lock_var;
142 /* Allocate a new, cleared machine_function structure. */
144 static struct machine_function *
145 nvptx_init_machine_status (void)
147 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
148 p->return_mode = VOIDmode;
149 return p;
152 /* Implement TARGET_OPTION_OVERRIDE. */
154 static void
155 nvptx_option_override (void)
157 init_machine_status = nvptx_init_machine_status;
158 /* Gives us a predictable order, which we need especially for variables. */
159 flag_toplevel_reorder = 1;
160 /* Assumes that it will see only hard registers. */
161 flag_var_tracking = 0;
163 if (write_symbols == DBX_DEBUG)
164 /* The stabs testcases want to know stabs isn't supported. */
165 sorry ("stabs debug format not supported");
167 /* Actually we don't have any debug format, but don't be
168 unnecessarily noisy. */
169 write_symbols = NO_DEBUG;
170 debug_info_level = DINFO_LEVEL_NONE;
172 if (nvptx_optimize < 0)
173 nvptx_optimize = optimize > 0;
175 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
176 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
177 declared_libfuncs_htab
178 = hash_table<declared_libfunc_hasher>::create_ggc (17);
180 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
181 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
182 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
184 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
185 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
186 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
189 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
190 deal with ptx idiosyncrasies. */
192 const char *
193 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
195 switch (mode)
197 case BLKmode:
198 return ".b8";
199 case BImode:
200 return ".pred";
201 case QImode:
202 if (promote)
203 return ".u32";
204 else
205 return ".u8";
206 case HImode:
207 return ".u16";
208 case SImode:
209 return ".u32";
210 case DImode:
211 return ".u64";
213 case SFmode:
214 return ".f32";
215 case DFmode:
216 return ".f64";
218 default:
219 gcc_unreachable ();
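/* A few values traced from the switch above:

     nvptx_ptx_type_from_mode (QImode, true)   => ".u32"
     nvptx_ptx_type_from_mode (QImode, false)  => ".u8"
     nvptx_ptx_type_from_mode (SFmode, false)  => ".f32"  */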
223 /* Encode the PTX data area that DECL (which might not actually be a
224 _DECL) should reside in. */
226 static void
227 nvptx_encode_section_info (tree decl, rtx rtl, int first)
229 default_encode_section_info (decl, rtl, first);
230 if (first && MEM_P (rtl))
232 nvptx_data_area area = DATA_AREA_GENERIC;
234 if (TREE_CONSTANT (decl))
235 area = DATA_AREA_CONST;
236 else if (TREE_CODE (decl) == VAR_DECL)
237 /* TODO: This would be a good place to check for a .shared or
238 other section name. */
239 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
241 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
245 /* Return the PTX name of the data area in which SYM should be
246 placed. The symbol must have already been processed by
247 nvptx_encode_section_info, or equivalent. */
249 static const char *
250 section_for_sym (rtx sym)
252 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
253 /* Same order as nvptx_data_area enum. */
254 static char const *const areas[] =
255 {"", ".global", ".shared", ".local", ".const", ".param"};
257 return areas[area];
260 /* Similarly for a decl. */
262 static const char *
263 section_for_decl (const_tree decl)
265 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
268 /* Check NAME for special function names and redirect them by returning a
269 replacement. This applies to malloc, free and realloc, for which we
270 want to use libgcc wrappers, and call, which triggers a bug in ptxas. */
272 static const char *
273 nvptx_name_replacement (const char *name)
275 if (strcmp (name, "call") == 0)
276 return "__nvptx_call";
277 if (strcmp (name, "malloc") == 0)
278 return "__nvptx_malloc";
279 if (strcmp (name, "free") == 0)
280 return "__nvptx_free";
281 if (strcmp (name, "realloc") == 0)
282 return "__nvptx_realloc";
283 return name;
286 /* If MODE should be treated as two registers of an inner mode, return
287 that inner mode. Otherwise return VOIDmode. */
289 static machine_mode
290 maybe_split_mode (machine_mode mode)
292 if (COMPLEX_MODE_P (mode))
293 return GET_MODE_INNER (mode);
295 if (mode == TImode)
296 return DImode;
298 return VOIDmode;
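/* Examples: TImode splits into a DImode pair, DCmode (complex double)
   splits into its DFmode real and imaginary halves, and SImode needs
   no split, so VOIDmode is returned.  */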
301 /* Output a register, subreg, or register pair (with optional
302 enclosing braces). */
304 static void
305 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
306 int subreg_offset = -1)
308 if (inner_mode == VOIDmode)
310 if (HARD_REGISTER_NUM_P (regno))
311 fprintf (file, "%s", reg_names[regno]);
312 else
313 fprintf (file, "%%r%d", regno);
315 else if (subreg_offset >= 0)
317 output_reg (file, regno, VOIDmode);
318 fprintf (file, "$%d", subreg_offset);
320 else
322 if (subreg_offset == -1)
323 fprintf (file, "{");
324 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
325 fprintf (file, ",");
326 output_reg (file, regno, inner_mode, 0);
327 if (subreg_offset == -1)
328 fprintf (file, "}");
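/* A sketch for a TImode pseudo-register 23, which maybe_split_mode
   splits into DImode halves: with SUBREG_OFFSET -1 this prints
   "{%r23$8,%r23$0}", the register-declaration code passes -2 to get
   the brace-less "%r23$8,%r23$0" form, and a plain SUBREG of the
   high half prints as "%r23$8".  */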
332 /* Emit forking instructions for MASK. */
334 static void
335 nvptx_emit_forking (unsigned mask, bool is_call)
337 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
338 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
339 if (mask)
341 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
343 /* Emit fork at all levels. This helps form SESE regions, as
344 it creates a block with a single successor before entering a
345 partitioned region. That is a good candidate for the end of
346 an SESE region. */
347 if (!is_call)
348 emit_insn (gen_nvptx_fork (op));
349 emit_insn (gen_nvptx_forked (op));
353 /* Emit joining instructions for MASK. */
355 static void
356 nvptx_emit_joining (unsigned mask, bool is_call)
358 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
359 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
360 if (mask)
362 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
364 /* Emit joining for all non-call pars to ensure there's a single
365 predecessor for the block the join insn ends up in. This is
366 needed for skipping entire loops. */
367 if (!is_call)
368 emit_insn (gen_nvptx_joining (op));
369 emit_insn (gen_nvptx_join (op));
374 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
375 returned in memory. Integer and floating types supported by the
376 machine are passed in registers, everything else is passed in
377 memory. Complex types are split. */
379 static bool
380 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
382 if (type)
384 if (AGGREGATE_TYPE_P (type))
385 return true;
386 if (TREE_CODE (type) == VECTOR_TYPE)
387 return true;
390 if (!for_return && COMPLEX_MODE_P (mode))
391 /* Complex types are passed as two underlying args. */
392 mode = GET_MODE_INNER (mode);
394 if (GET_MODE_CLASS (mode) != MODE_INT
395 && GET_MODE_CLASS (mode) != MODE_FLOAT)
396 return true;
398 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
399 return true;
401 return false;
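/* A sketch of the resulting decisions, assuming a 64-bit
   configuration (UNITS_PER_WORD == 8):

     int, float, double            -> registers
     any struct or vector type     -> memory
     _Complex double, as argument  -> registers (two DFmode halves)
     __int128 (TImode)             -> memory (wider than a word)  */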
404 /* A non-memory argument of mode MODE is being passed, determine the mode it
405 should be promoted to. This is also used for determining return
406 type promotion. */
408 static machine_mode
409 promote_arg (machine_mode mode, bool prototyped)
411 if (!prototyped && mode == SFmode)
412 /* K&R float promotion for unprototyped functions. */
413 mode = DFmode;
414 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
415 mode = SImode;
417 return mode;
420 /* A non-memory return type of MODE is being returned. Determine the
421 mode it should be promoted to. */
423 static machine_mode
424 promote_return (machine_mode mode)
426 return promote_arg (mode, true);
429 /* Implement TARGET_FUNCTION_ARG. */
431 static rtx
432 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
433 const_tree, bool named)
435 if (mode == VOIDmode || !named)
436 return NULL_RTX;
438 return gen_reg_rtx (mode);
441 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
443 static rtx
444 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
445 const_tree, bool named)
447 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
449 if (mode == VOIDmode || !named)
450 return NULL_RTX;
452 /* No need to deal with split modes here, the only case that can
453 happen is complex modes and those are dealt with by
454 TARGET_SPLIT_COMPLEX_ARG. */
455 return gen_rtx_UNSPEC (mode,
456 gen_rtvec (1, GEN_INT (cum->count)),
457 UNSPEC_ARG_REG);
460 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
462 static void
463 nvptx_function_arg_advance (cumulative_args_t cum_v,
464 machine_mode ARG_UNUSED (mode),
465 const_tree ARG_UNUSED (type),
466 bool ARG_UNUSED (named))
468 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
470 cum->count++;
473 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
475 For nvptx, we know how to handle functions declared as stdarg: by
476 passing an extra pointer to the unnamed arguments. However, the
477 Fortran frontend can produce a different situation, where a
478 function pointer is declared with no arguments, but the actual
479 function and calls to it take more arguments. In that case, we
480 want to ensure the call matches the definition of the function. */
482 static bool
483 nvptx_strict_argument_naming (cumulative_args_t cum_v)
485 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
487 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
490 /* Implement TARGET_LIBCALL_VALUE. */
492 static rtx
493 nvptx_libcall_value (machine_mode mode, const_rtx)
495 if (!cfun->machine->doing_call)
496 /* Pretend to return in a hard reg for early uses before pseudos can be
497 generated. */
498 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
500 return gen_reg_rtx (mode);
503 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
504 where function FUNC returns or receives a value of data type TYPE. */
506 static rtx
507 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
508 bool outgoing)
510 machine_mode mode = promote_return (TYPE_MODE (type));
512 if (outgoing)
514 cfun->machine->return_mode = mode;
515 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
518 return nvptx_libcall_value (mode, NULL_RTX);
521 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
523 static bool
524 nvptx_function_value_regno_p (const unsigned int regno)
526 return regno == NVPTX_RETURN_REGNUM;
529 /* Types with a mode other than those supported by the machine are passed by
530 reference in memory. */
532 static bool
533 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
534 machine_mode mode, const_tree type,
535 bool ARG_UNUSED (named))
537 return pass_in_memory (mode, type, false);
540 /* Implement TARGET_RETURN_IN_MEMORY. */
542 static bool
543 nvptx_return_in_memory (const_tree type, const_tree)
545 return pass_in_memory (TYPE_MODE (type), type, true);
548 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
550 static machine_mode
551 nvptx_promote_function_mode (const_tree type, machine_mode mode,
552 int *ARG_UNUSED (punsignedp),
553 const_tree funtype, int for_return)
555 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
558 /* Helper for write_arg. Emit a single PTX argument of MODE, either
559 in a prototype, or as copy in a function prologue. ARGNO is the
560 index of this argument in the PTX function. FOR_REG is negative,
561 if we're emitting the PTX prototype. It is zero if we're copying
562 to an argument register and it is greater than zero if we're
563 copying to a specific hard register. */
565 static int
566 write_arg_mode (std::stringstream &s, int for_reg, int argno,
567 machine_mode mode)
569 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
571 if (for_reg < 0)
573 /* Writing PTX prototype. */
574 s << (argno ? ", " : " (");
575 s << ".param" << ptx_type << " %in_ar" << argno;
577 else
579 s << "\t.reg" << ptx_type << " ";
580 if (for_reg)
581 s << reg_names[for_reg];
582 else
583 s << "%ar" << argno;
584 s << ";\n";
585 if (argno >= 0)
587 s << "\tld.param" << ptx_type << " ";
588 if (for_reg)
589 s << reg_names[for_reg];
590 else
591 s << "%ar" << argno;
592 s << ", [%in_ar" << argno << "];\n";
595 return argno + 1;
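/* For SImode argument 0, the prototype form (FOR_REG < 0) appends
   " (.param.u32 %in_ar0", while the prologue form (FOR_REG == 0)
   emits roughly:

     .reg.u32 %ar0;
     ld.param.u32 %ar0, [%in_ar0];  */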
598 /* Process function parameter TYPE to emit one or more PTX
599 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
600 is true, if this is a prototyped function, rather than an old-style
601 C declaration. Returns the next argument number to use.
603 The promotion behaviour here must match the regular GCC function
604 parameter marshalling machinery. */
606 static int
607 write_arg_type (std::stringstream &s, int for_reg, int argno,
608 tree type, bool prototyped)
610 machine_mode mode = TYPE_MODE (type);
612 if (mode == VOIDmode)
613 return argno;
615 if (pass_in_memory (mode, type, false))
616 mode = Pmode;
617 else
619 bool split = TREE_CODE (type) == COMPLEX_TYPE;
621 if (split)
623 /* Complex types are sent as two separate args. */
624 type = TREE_TYPE (type);
625 mode = TYPE_MODE (type);
626 prototyped = true;
629 mode = promote_arg (mode, prototyped);
630 if (split)
631 argno = write_arg_mode (s, for_reg, argno, mode);
634 return write_arg_mode (s, for_reg, argno, mode);
637 /* Emit a PTX return as a prototype or function prologue declaration
638 for MODE. */
640 static void
641 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
643 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
644 const char *pfx = "\t.reg";
645 const char *sfx = ";\n";
647 if (for_proto)
648 pfx = "(.param", sfx = "_out) ";
650 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
653 /* Process a function return TYPE to emit a PTX return as a prototype
654 or function prologue declaration. Returns true if return is via an
655 additional pointer parameter. The promotion behaviour here must
656 match the regular GCC function return marshalling. */
658 static bool
659 write_return_type (std::stringstream &s, bool for_proto, tree type)
661 machine_mode mode = TYPE_MODE (type);
663 if (mode == VOIDmode)
664 return false;
666 bool return_in_mem = pass_in_memory (mode, type, true);
668 if (return_in_mem)
670 if (for_proto)
671 return return_in_mem;
673 /* Named return values can cause us to return a pointer as well
674 as expect an argument for the return location. This is
675 optimization-level specific, so no caller can make use of
676 this data, but more importantly for us, we must ensure it
677 doesn't change the PTX prototype. */
678 mode = (machine_mode) cfun->machine->return_mode;
680 if (mode == VOIDmode)
681 return return_in_mem;
683 /* Clear return_mode to inhibit copy of retval to non-existent
684 retval parameter. */
685 cfun->machine->return_mode = VOIDmode;
687 else
688 mode = promote_return (mode);
690 write_return_mode (s, for_proto, mode);
692 return return_in_mem;
695 /* Look for attributes in ATTRS that would indicate we must write a function
696 as a .entry kernel rather than a .func. Return true if one is found. */
698 static bool
699 write_as_kernel (tree attrs)
701 return (lookup_attribute ("kernel", attrs) != NULL_TREE
702 || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
705 /* Emit a linker marker for a function decl or defn. */
707 static void
708 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
709 const char *name)
711 s << "\n// BEGIN";
712 if (globalize)
713 s << " GLOBAL";
714 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
715 s << name << "\n";
718 /* Emit a linker marker for a variable decl or defn. */
720 static void
721 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
723 fprintf (file, "\n// BEGIN%s VAR %s: ",
724 globalize ? " GLOBAL" : "",
725 is_defn ? "DEF" : "DECL");
726 assemble_name_raw (file, name);
727 fputs ("\n", file);
730 /* Write a .func or .kernel declaration or definition along with
731 a helper comment for use by ld. S is the stream to write to, DECL
732 the decl for the function with name NAME. For definitions, emit
733 a declaration too. */
735 static const char *
736 write_fn_proto (std::stringstream &s, bool is_defn,
737 const char *name, const_tree decl)
739 if (is_defn)
740 /* Emit a declaration. The PTX assembler gets upset without it. */
741 name = write_fn_proto (s, false, name, decl);
742 else
744 /* Avoid repeating the name replacement. */
745 name = nvptx_name_replacement (name);
746 if (name[0] == '*')
747 name++;
750 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
752 /* PTX declaration. */
753 if (DECL_EXTERNAL (decl))
754 s << ".extern ";
755 else if (TREE_PUBLIC (decl))
756 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
757 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
759 tree fntype = TREE_TYPE (decl);
760 tree result_type = TREE_TYPE (fntype);
762 /* Declare the result. */
763 bool return_in_mem = write_return_type (s, true, result_type);
765 s << name;
767 int argno = 0;
769 /* Emit argument list. */
770 if (return_in_mem)
771 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
773 /* We get:
774 NULL in TYPE_ARG_TYPES, for old-style functions
775 NULL in DECL_ARGUMENTS, for builtin functions without another
776 declaration.
777 So we have to pick the best one we have. */
778 tree args = TYPE_ARG_TYPES (fntype);
779 bool prototyped = true;
780 if (!args)
782 args = DECL_ARGUMENTS (decl);
783 prototyped = false;
786 for (; args; args = TREE_CHAIN (args))
788 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
790 argno = write_arg_type (s, -1, argno, type, prototyped);
793 if (stdarg_p (fntype))
794 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
796 if (DECL_STATIC_CHAIN (decl))
797 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
799 if (!argno && strcmp (name, "main") == 0)
801 argno = write_arg_type (s, -1, argno, integer_type_node, true);
802 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
805 if (argno)
806 s << ")";
808 s << (is_defn ? "\n" : ";\n");
810 return name;
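/* Putting the pieces together, a public definition of "int foo (int)"
   is preceded by a declaration along these lines (a sketch; the
   return register is whatever reg_names[NVPTX_RETURN_REGNUM] names,
   %value here):

     // BEGIN GLOBAL FUNCTION DECL: foo
     .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);  */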
813 /* Construct a function declaration from a call insn. This can be
814 necessary for two reasons - either we have an indirect call which
815 requires a .callprototype declaration, or we have a libcall
816 generated by emit_library_call for which no decl exists. */
818 static void
819 write_fn_proto_from_insn (std::stringstream &s, const char *name,
820 rtx result, rtx pat)
822 if (!name)
824 s << "\t.callprototype ";
825 name = "_";
827 else
829 name = nvptx_name_replacement (name);
830 write_fn_marker (s, false, true, name);
831 s << "\t.extern .func ";
834 if (result != NULL_RTX)
835 write_return_mode (s, true, GET_MODE (result));
837 s << name;
839 int arg_end = XVECLEN (pat, 0);
840 for (int i = 1; i < arg_end; i++)
842 /* We don't have to deal with mode splitting & promotion here,
843 as that was already done when generating the call
844 sequence. */
845 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
847 write_arg_mode (s, -1, i - 1, mode);
849 if (arg_end != 1)
850 s << ")";
851 s << ";\n";
854 /* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
855 table and write a ptx prototype. These are emitted at end of
856 compilation. */
858 static void
859 nvptx_record_fndecl (tree decl)
861 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
862 if (*slot == NULL)
864 *slot = decl;
865 const char *name = get_fnname_from_decl (decl);
866 write_fn_proto (func_decls, false, name, decl);
870 /* Record a libcall or unprototyped external function. CALLEE is the
871 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
872 declaration for it. */
874 static void
875 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
877 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
878 if (*slot == NULL)
880 *slot = callee;
882 const char *name = XSTR (callee, 0);
883 write_fn_proto_from_insn (func_decls, name, retval, pat);
887 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
888 is prototyped, record it now. Otherwise record it as needed at end
889 of compilation, when we might have more information about it. */
891 void
892 nvptx_record_needed_fndecl (tree decl)
894 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
896 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
897 if (*slot == NULL)
898 *slot = decl;
900 else
901 nvptx_record_fndecl (decl);
904 /* SYM is a SYMBOL_REF. If it refers to an external function, record
905 it as needed. */
907 static void
908 nvptx_maybe_record_fnsym (rtx sym)
910 tree decl = SYMBOL_REF_DECL (sym);
912 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
913 nvptx_record_needed_fndecl (decl);
916 /* Emit a local array to hold some part of a conventional stack frame
917 and initialize REGNO to point to it. If the size is zero, it'll
918 never be valid to dereference, so we can simply initialize to
919 zero. */
921 static void
922 init_frame (FILE *file, int regno, unsigned align, unsigned size)
924 if (size)
925 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
926 align, reg_names[regno], size);
927 fprintf (file, "\t.reg.u%d %s;\n",
928 POINTER_SIZE, reg_names[regno]);
929 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
930 : "\tmov.u%d %s, 0;\n"),
931 POINTER_SIZE, reg_names[regno], reg_names[regno]);
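/* E.g. a 16-byte frame aligned to 8 bytes on a 64-bit target becomes
   (a sketch, with %frame naming the frame-pointer register):

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   whereas a zero-sized frame gets "mov.u64 %frame, 0;" instead of
   the cvta.  */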
934 /* Emit code to initialize the REGNO predicate register to indicate
935 whether we are not lane zero on the NAME axis. */
937 static void
938 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
940 fprintf (file, "\t{\n");
941 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
942 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
943 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
944 fprintf (file, "\t}\n");
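/* For the "y" axis and predicate register 42 this emits:

     {
       .reg.u32 %y;
       mov.u32 %y, %tid.y;
       setp.ne.u32 %r42, %y, 0;
     }  */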
947 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
948 function, including local var decls and copies from the arguments to
949 local regs. */
951 void
952 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
954 tree fntype = TREE_TYPE (decl);
955 tree result_type = TREE_TYPE (fntype);
956 int argno = 0;
958 /* We construct the initial part of the function into a string
959 stream, in order to share the prototype writing code. */
960 std::stringstream s;
961 write_fn_proto (s, true, name, decl);
962 s << "{\n";
964 bool return_in_mem = write_return_type (s, false, result_type);
965 if (return_in_mem)
966 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
968 /* Declare and initialize incoming arguments. */
969 tree args = TYPE_ARG_TYPES (fntype);
970 bool prototyped = true;
971 if (!args)
973 args = DECL_ARGUMENTS (decl);
974 prototyped = false;
977 for (; args != NULL_TREE; args = TREE_CHAIN (args))
979 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
981 argno = write_arg_type (s, 0, argno, type, prototyped);
984 if (stdarg_p (fntype))
985 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
986 true);
988 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
989 write_arg_type (s, STATIC_CHAIN_REGNUM,
990 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
991 true);
993 fprintf (file, "%s", s.str().c_str());
995 /* Declare a local var for outgoing varargs. */
996 if (cfun->machine->has_varadic)
997 init_frame (file, STACK_POINTER_REGNUM,
998 UNITS_PER_WORD, crtl->outgoing_args_size);
1000 /* Declare a local variable for the frame. */
1001 HOST_WIDE_INT sz = get_frame_size ();
1002 if (sz || cfun->machine->has_chain)
1003 init_frame (file, FRAME_POINTER_REGNUM,
1004 crtl->stack_alignment_needed / BITS_PER_UNIT, sz);
1006 /* Declare the pseudos we have as ptx registers. */
1007 int maxregs = max_reg_num ();
1008 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1010 if (regno_reg_rtx[i] != const0_rtx)
1012 machine_mode mode = PSEUDO_REGNO_MODE (i);
1013 machine_mode split = maybe_split_mode (mode);
1015 if (split != VOIDmode)
1016 mode = split;
1017 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1018 output_reg (file, i, split, -2);
1019 fprintf (file, ";\n");
1023 /* Emit axis predicates. */
1024 if (cfun->machine->axis_predicate[0])
1025 nvptx_init_axis_predicate (file,
1026 REGNO (cfun->machine->axis_predicate[0]), "y");
1027 if (cfun->machine->axis_predicate[1])
1028 nvptx_init_axis_predicate (file,
1029 REGNO (cfun->machine->axis_predicate[1]), "x");
1032 /* Output a return instruction. Also copy the return value to its outgoing
1033 location. */
1035 const char *
1036 nvptx_output_return (void)
1038 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1040 if (mode != VOIDmode)
1041 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1042 nvptx_ptx_type_from_mode (mode, false),
1043 reg_names[NVPTX_RETURN_REGNUM],
1044 reg_names[NVPTX_RETURN_REGNUM]);
1046 return "ret;";
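/* With an SImode return value this prints, before the "ret;":

     st.param.u32 [%value_out], %value;

   assuming, as above, that reg_names[NVPTX_RETURN_REGNUM] is
   "%value".  */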
1049 /* Terminate a function by writing a closing brace to FILE. */
1051 void
1052 nvptx_function_end (FILE *file)
1054 fprintf (file, "}\n");
1057 /* Decide whether we can make a sibling call to a function. For ptx, we
1058 can't. */
1060 static bool
1061 nvptx_function_ok_for_sibcall (tree, tree)
1063 return false;
1066 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1068 static rtx
1069 nvptx_get_drap_rtx (void)
1071 return NULL_RTX;
1074 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1075 argument to the next call. */
1077 static void
1078 nvptx_call_args (rtx arg, tree fntype)
1080 if (!cfun->machine->doing_call)
1082 cfun->machine->doing_call = true;
1083 cfun->machine->is_varadic = false;
1084 cfun->machine->num_args = 0;
1086 if (fntype && stdarg_p (fntype))
1088 cfun->machine->is_varadic = true;
1089 cfun->machine->has_varadic = true;
1090 cfun->machine->num_args++;
1094 if (REG_P (arg) && arg != pc_rtx)
1096 cfun->machine->num_args++;
1097 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1098 cfun->machine->call_args);
1102 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1103 information we recorded. */
1105 static void
1106 nvptx_end_call_args (void)
1108 cfun->machine->doing_call = false;
1109 free_EXPR_LIST_list (&cfun->machine->call_args);
1112 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1113 track of whether calls involving static chains or varargs were seen
1114 in the current function.
1115 For libcalls, maintain a hash table of decls we have seen, and
1116 record a function decl for later when encountering a new one. */
1118 void
1119 nvptx_expand_call (rtx retval, rtx address)
1121 rtx callee = XEXP (address, 0);
1122 rtx varargs = NULL_RTX;
1123 unsigned parallel = 0;
1125 if (!call_insn_operand (callee, Pmode))
1127 callee = force_reg (Pmode, callee);
1128 address = change_address (address, QImode, callee);
1131 if (GET_CODE (callee) == SYMBOL_REF)
1133 tree decl = SYMBOL_REF_DECL (callee);
1134 if (decl != NULL_TREE)
1136 if (DECL_STATIC_CHAIN (decl))
1137 cfun->machine->has_chain = true;
1139 tree attr = get_oacc_fn_attrib (decl);
1140 if (attr)
1142 tree dims = TREE_VALUE (attr);
1144 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1145 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1147 if (TREE_PURPOSE (dims)
1148 && !integer_zerop (TREE_PURPOSE (dims)))
1149 break;
1150 /* Not on this axis. */
1151 parallel ^= GOMP_DIM_MASK (ix);
1152 dims = TREE_CHAIN (dims);
1158 unsigned nargs = cfun->machine->num_args;
1159 if (cfun->machine->is_varadic)
1161 varargs = gen_reg_rtx (Pmode);
1162 emit_move_insn (varargs, stack_pointer_rtx);
1165 rtvec vec = rtvec_alloc (nargs + 1);
1166 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1167 int vec_pos = 0;
1169 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1170 rtx tmp_retval = retval;
1171 if (retval)
1173 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1174 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1175 call = gen_rtx_SET (tmp_retval, call);
1177 XVECEXP (pat, 0, vec_pos++) = call;
1179 /* Construct the call insn, including a USE for each argument pseudo
1180 register. These will be used when printing the insn. */
1181 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1182 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1184 if (varargs)
1185 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1187 gcc_assert (vec_pos == XVECLEN (pat, 0));
1189 nvptx_emit_forking (parallel, true);
1190 emit_call_insn (pat);
1191 nvptx_emit_joining (parallel, true);
1193 if (tmp_retval != retval)
1194 emit_move_insn (retval, tmp_retval);
1197 /* Emit a comparison COMPARE, and return the new test to be used in the
1198 jump. */
1200 rtx
1201 nvptx_expand_compare (rtx compare)
1203 rtx pred = gen_reg_rtx (BImode);
1204 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1205 XEXP (compare, 0), XEXP (compare, 1));
1206 emit_insn (gen_rtx_SET (pred, cmp));
1207 return gen_rtx_NE (BImode, pred, const0_rtx);
1210 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1212 void
1213 nvptx_expand_oacc_fork (unsigned mode)
1215 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1218 void
1219 nvptx_expand_oacc_join (unsigned mode)
1221 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1224 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1225 objects. */
1227 static rtx
1228 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1230 rtx res;
1232 switch (GET_MODE (src))
1234 case DImode:
1235 res = gen_unpackdisi2 (dst0, dst1, src);
1236 break;
1237 case DFmode:
1238 res = gen_unpackdfsi2 (dst0, dst1, src);
1239 break;
1240 default: gcc_unreachable ();
1242 return res;
1245 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1246 object. */
1248 static rtx
1249 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1251 rtx res;
1253 switch (GET_MODE (dst))
1255 case DImode:
1256 res = gen_packsidi2 (dst, src0, src1);
1257 break;
1258 case DFmode:
1259 res = gen_packsidf2 (dst, src0, src1);
1260 break;
1261 default: gcc_unreachable ();
1263 return res;
1266 /* Generate an instruction or sequence to shuffle register SRC into
1267 DST across the lanes of a single warp, according to IDX and KIND. */
1269 static rtx
1270 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1272 rtx res;
1274 switch (GET_MODE (dst))
1276 case SImode:
1277 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1278 break;
1279 case SFmode:
1280 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1281 break;
1282 case DImode:
1283 case DFmode:
1285 rtx tmp0 = gen_reg_rtx (SImode);
1286 rtx tmp1 = gen_reg_rtx (SImode);
1288 start_sequence ();
1289 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1290 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1291 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1292 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1293 res = get_insns ();
1294 end_sequence ();
1296 break;
1297 case BImode:
1299 rtx tmp = gen_reg_rtx (SImode);
1301 start_sequence ();
1302 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1303 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1304 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1305 res = get_insns ();
1306 end_sequence ();
1308 break;
1310 default:
1311 gcc_unreachable ();
1313 return res;
1316 /* Generate an instruction or sequence to broadcast register REG
1317 across the vectors of a single warp. */
1319 static rtx
1320 nvptx_gen_vcast (rtx reg)
1322 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1325 /* Structure used when generating a worker-level spill or fill. */
1327 struct wcast_data_t
1329 rtx base; /* Register holding base addr of buffer. */
1330 rtx ptr; /* Iteration var, if needed. */
1331 unsigned offset; /* Offset into worker buffer. */
1334 /* Direction of the spill/fill and looping setup/teardown indicator. */
1336 enum propagate_mask
1338 PM_read = 1 << 0,
1339 PM_write = 1 << 1,
1340 PM_loop_begin = 1 << 2,
1341 PM_loop_end = 1 << 3,
1343 PM_read_write = PM_read | PM_write
1346 /* Generate instruction(s) to spill or fill register REG to/from the
1347 worker broadcast array. PM indicates what is to be done, REP
1348 how many loop iterations will be executed (0 for not a loop). */
1350 static rtx
1351 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1353 rtx res;
1354 machine_mode mode = GET_MODE (reg);
1356 switch (mode)
1358 case BImode:
1360 rtx tmp = gen_reg_rtx (SImode);
1362 start_sequence ();
1363 if (pm & PM_read)
1364 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1365 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1366 if (pm & PM_write)
1367 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1368 res = get_insns ();
1369 end_sequence ();
1371 break;
1373 default:
1375 rtx addr = data->ptr;
1377 if (!addr)
1379 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1381 if (align > worker_bcast_align)
1382 worker_bcast_align = align;
1383 data->offset = (data->offset + align - 1) & ~(align - 1);
1384 addr = data->base;
1385 if (data->offset)
1386 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1389 addr = gen_rtx_MEM (mode, addr);
1390 if (pm == PM_read)
1391 res = gen_rtx_SET (addr, reg);
1392 else if (pm == PM_write)
1393 res = gen_rtx_SET (reg, addr);
1394 else
1395 gcc_unreachable ();
1397 if (data->ptr)
1399 /* We're using a ptr, increment it. */
1400 start_sequence ();
1402 emit_insn (res);
1403 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1404 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1405 res = get_insns ();
1406 end_sequence ();
1408 else
1409 rep = 1;
1410 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1412 break;
1414 return res;
1417 /* Returns true if X is a valid address for use in a memory reference. */
1419 static bool
1420 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1422 enum rtx_code code = GET_CODE (x);
1424 switch (code)
1426 case REG:
1427 return true;
1429 case PLUS:
1430 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1431 return true;
1432 return false;
1434 case CONST:
1435 case SYMBOL_REF:
1436 case LABEL_REF:
1437 return true;
1439 default:
1440 return false;
1444 /* Machinery to output constant initializers. When beginning an
1445 initializer, we decide on a fragment size (which is visible in ptx
1446 in the type used), and then all initializer data is buffered until
1447 a fragment is filled and ready to be written out. */
1449 static struct
1451 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1452 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1453 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1454 out. */
1455 unsigned size; /* Fragment size to accumulate. */
1456 unsigned offset; /* Offset within current fragment. */
1457 bool started; /* Whether we've output any initializer. */
1458 } init_frag;
1460 /* The current fragment is full, write it out. SYM may provide a
1461 symbolic reference we should output, in which case the fragment
1462 value is the addend. */
1464 static void
1465 output_init_frag (rtx sym)
1467 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1468 unsigned HOST_WIDE_INT val = init_frag.val;
1470 init_frag.started = true;
1471 init_frag.val = 0;
1472 init_frag.offset = 0;
1473 init_frag.remaining--;
1475 if (sym)
1477 fprintf (asm_out_file, "generic(");
1478 output_address (VOIDmode, sym);
1479 fprintf (asm_out_file, val ? ") + " : ")");
1482 if (!sym || val)
1483 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1486 /* Add value VAL of size SIZE to the data we're emitting, and keep
1487 writing out chunks as they fill up. */
1489 static void
1490 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1492 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1494 for (unsigned part = 0; size; size -= part)
1496 val >>= part * BITS_PER_UNIT;
1497 part = init_frag.size - init_frag.offset;
1498 if (part > size)
1499 part = size;
1501 unsigned HOST_WIDE_INT partial
1502 = val << (init_frag.offset * BITS_PER_UNIT);
1503 init_frag.val |= partial & init_frag.mask;
1504 init_frag.offset += part;
1506 if (init_frag.offset == init_frag.size)
1507 output_init_frag (NULL);
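/* A short worked example, assuming init_frag.size == 4: assembling
   the HImode values 0x1234 and then 0xabcd buffers 0x1234 first, ors
   in 0xabcd << 16, and only once the 4-byte fragment is full does
   output_init_frag write the combined value 0xabcd1234 (printed in
   decimal).  */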
1511 /* Target hook for assembling integer object X of size SIZE. */
1513 static bool
1514 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1516 HOST_WIDE_INT val = 0;
1518 switch (GET_CODE (x))
1520 default:
1521 /* Let the generic machinery figure it out, usually for a
1522 CONST_WIDE_INT. */
1523 return false;
1525 case CONST_INT:
1526 nvptx_assemble_value (INTVAL (x), size);
1527 break;
1529 case CONST:
1530 x = XEXP (x, 0);
1531 gcc_assert (GET_CODE (x) == PLUS);
1532 val = INTVAL (XEXP (x, 1));
1533 x = XEXP (x, 0);
1534 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1535 /* FALLTHROUGH */
1537 case SYMBOL_REF:
1538 gcc_assert (size == init_frag.size);
1539 if (init_frag.offset)
1540 sorry ("cannot emit unaligned pointers in ptx assembly");
1542 nvptx_maybe_record_fnsym (x);
1543 init_frag.val = val;
1544 output_init_frag (x);
1545 break;
1548 return true;
1551 /* Output SIZE zero bytes. We ignore the FILE argument since the
1552 functions we're calling to perform the output just use
1553 asm_out_file. */
1555 void
1556 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1558 /* Finish the current fragment, if it's started. */
1559 if (init_frag.offset)
1561 unsigned part = init_frag.size - init_frag.offset;
1562 if (part > size)
1563 part = (unsigned) size;
1564 size -= part;
1565 nvptx_assemble_value (0, part);
1568 /* If this skip doesn't terminate the initializer, write as many
1569 remaining pieces as possible directly. */
1570 if (size < init_frag.remaining * init_frag.size)
1572 while (size >= init_frag.size)
1574 size -= init_frag.size;
1575 output_init_frag (NULL_RTX);
1577 if (size)
1578 nvptx_assemble_value (0, size);
1582 /* Output a string STR with length SIZE. As in nvptx_output_skip we
1583 ignore the FILE arg. */
1585 void
1586 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
1588 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
1589 nvptx_assemble_value (str[i], 1);
1592 /* Emit a PTX variable decl and prepare for emission of its
1593 initializer. NAME is the symbol name and SECTION the PTX data
1594 area. The type is TYPE, object size SIZE and alignment is ALIGN.
1595 The caller has already emitted any indentation and linkage
1596 specifier. It is responsible for any initializer, terminating ;
1597 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
1598 this is the opposite way round from how PTX wants them! */
1600 static void
1601 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
1602 const_tree type, HOST_WIDE_INT size, unsigned align)
1604 while (TREE_CODE (type) == ARRAY_TYPE)
1605 type = TREE_TYPE (type);
1607 if (TREE_CODE (type) == VECTOR_TYPE
1608 || TREE_CODE (type) == COMPLEX_TYPE)
1609 /* Neither vector nor complex types can contain the other. */
1610 type = TREE_TYPE (type);
1612 unsigned elt_size = int_size_in_bytes (type);
1614 /* Largest mode we're prepared to accept. For BLKmode types we
1615 don't know if it'll contain pointer constants, so have to choose
1616 pointer size, otherwise we can choose DImode. */
1617 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
1619 elt_size |= GET_MODE_SIZE (elt_mode);
1620 elt_size &= -elt_size; /* Extract LSB set. */
1622 init_frag.size = elt_size;
1623 /* Avoid undefined shift behaviour by using '2'. */
1624 init_frag.mask = ((unsigned HOST_WIDE_INT)2
1625 << (elt_size * BITS_PER_UNIT - 1)) - 1;
1626 init_frag.val = 0;
1627 init_frag.offset = 0;
1628 init_frag.started = false;
1629 /* Size might not be a multiple of elt size, if there's an
1630 initialized trailing struct array with smaller type than
1631 elt_size. */
1632 init_frag.remaining = (size + elt_size - 1) / elt_size;
1634 fprintf (file, "%s .align %d .u%d ",
1635 section, align / BITS_PER_UNIT,
1636 elt_size * BITS_PER_UNIT);
1637 assemble_name (file, name);
1639 if (size)
1640 /* We make everything an array, to simplify any initialization
1641 emission. */
1642 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
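/* Sketch: a global "int arr[4]" comes out as

     .global .align 4 .u32 arr[4]

   with init_frag prepared for four 4-byte fragments (elt_size is the
   least-significant set bit of 4 | 8, i.e. 4).  */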
1645 /* Called when the initializer for a decl has been completely output through
1646 combinations of the three functions above. */
1648 static void
1649 nvptx_assemble_decl_end (void)
1651 if (init_frag.offset)
1652 /* This can happen with a packed struct with trailing array member. */
1653 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
1654 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
1657 /* Output an uninitialized common or file-scope variable. */
1659 void
1660 nvptx_output_aligned_decl (FILE *file, const char *name,
1661 const_tree decl, HOST_WIDE_INT size, unsigned align)
1663 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1665 /* If this is public, it is common. The nearest thing we have to
1666 common is weak. */
1667 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
1669 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1670 TREE_TYPE (decl), size, align);
1671 nvptx_assemble_decl_end ();
1674 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
1675 writing a constant variable EXP with NAME and SIZE and its
1676 initializer to FILE. */
1678 static void
1679 nvptx_asm_declare_constant_name (FILE *file, const char *name,
1680 const_tree exp, HOST_WIDE_INT obj_size)
1682 write_var_marker (file, true, false, name);
1684 fprintf (file, "\t");
1686 tree type = TREE_TYPE (exp);
1687 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
1688 TYPE_ALIGN (type));
1691 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
1692 a variable DECL with NAME to FILE. */
1694 void
1695 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
1697 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1699 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
1700 : DECL_WEAK (decl) ? ".weak " : ".visible "));
1702 tree type = TREE_TYPE (decl);
1703 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
1704 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1705 type, obj_size, DECL_ALIGN (decl));
1708 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
1710 static void
1711 nvptx_globalize_label (FILE *, const char *)
1715 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
1716 declaration only for variable DECL with NAME to FILE. */
1718 static void
1719 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
1721 write_var_marker (file, false, TREE_PUBLIC (decl), name);
1723 fprintf (file, "\t.extern ");
1724 tree size = DECL_SIZE_UNIT (decl);
1725 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1726 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
1727 DECL_ALIGN (decl));
1728 nvptx_assemble_decl_end ();
1731 /* Output a pattern for a move instruction. */
1733 const char *
1734 nvptx_output_mov_insn (rtx dst, rtx src)
1736 machine_mode dst_mode = GET_MODE (dst);
1737 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
1738 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
1739 machine_mode src_inner = (GET_CODE (src) == SUBREG
1740 ? GET_MODE (XEXP (src, 0)) : dst_mode);
1742 rtx sym = src;
1743 if (GET_CODE (sym) == CONST)
1744 sym = XEXP (XEXP (sym, 0), 0);
1745 if (SYMBOL_REF_P (sym))
1747 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
1748 return "%.\tcvta%D1%t0\t%0, %1;";
1749 nvptx_maybe_record_fnsym (sym);
1752 if (src_inner == dst_inner)
1753 return "%.\tmov%t0\t%0, %1;";
1755 if (CONSTANT_P (src))
1756 return (GET_MODE_CLASS (dst_inner) == MODE_INT
1757 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
1758 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
1760 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
1761 return "%.\tmov.b%T0\t%0, %1;";
1763 return "%.\tcvt%t0%t1\t%0, %1;";
1766 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
1767 involves writing .param declarations and in/out copies into them. For
1768 indirect calls, also write the .callprototype. */
1770 const char *
1771 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
1773 char buf[16];
1774 static int labelno;
1775 bool needs_tgt = register_operand (callee, Pmode);
1776 rtx pat = PATTERN (insn);
1777 int arg_end = XVECLEN (pat, 0);
1778 tree decl = NULL_TREE;
1780 fprintf (asm_out_file, "\t{\n");
1781 if (result != NULL)
1782 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
1783 nvptx_ptx_type_from_mode (GET_MODE (result), false),
1784 reg_names[NVPTX_RETURN_REGNUM]);
1786 /* Ensure we have a ptx declaration in the output if necessary. */
1787 if (GET_CODE (callee) == SYMBOL_REF)
1789 decl = SYMBOL_REF_DECL (callee);
1790 if (!decl
1791 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
1792 nvptx_record_libfunc (callee, result, pat);
1793 else if (DECL_EXTERNAL (decl))
1794 nvptx_record_fndecl (decl);
1797 if (needs_tgt)
1799 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
1800 labelno++;
1801 ASM_OUTPUT_LABEL (asm_out_file, buf);
1802 std::stringstream s;
1803 write_fn_proto_from_insn (s, NULL, result, pat);
1804 fputs (s.str().c_str(), asm_out_file);
1807 for (int argno = 1; argno < arg_end; argno++)
1809 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
1810 machine_mode mode = GET_MODE (t);
1812 /* Mode splitting has already been done. */
1813 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d%s;\n",
1814 nvptx_ptx_type_from_mode (mode, false), argno,
1815 mode == QImode || mode == HImode ? "[1]" : "");
1816 fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d;\n",
1817 nvptx_ptx_type_from_mode (mode, false), argno,
1818 REGNO (t));
1821 fprintf (asm_out_file, "\t\tcall ");
1822 if (result != NULL_RTX)
1823 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
1825 if (decl)
1827 const char *name = get_fnname_from_decl (decl);
1828 name = nvptx_name_replacement (name);
1829 assemble_name (asm_out_file, name);
1831 else
1832 output_address (VOIDmode, callee);
1834 const char *open = "(";
1835 for (int argno = 1; argno < arg_end; argno++)
1837 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
1838 open = "";
1840 if (decl && DECL_STATIC_CHAIN (decl))
1842 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
1843 open = "";
1845 if (!open[0])
1846 fprintf (asm_out_file, ")");
1848 if (needs_tgt)
1850 fprintf (asm_out_file, ", ");
1851 assemble_name (asm_out_file, buf);
1853 fprintf (asm_out_file, ";\n");
1855 if (find_reg_note (insn, REG_NORETURN, NULL))
1856 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
1857 the flow control barrier they imply. It can seg fault if it
1858 encounters what looks like an unexitable loop. Emit a trailing
1859 trap, which it does grok. */
1860 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
1862 if (result)
1864 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
1866 if (!rval[0])
1867 /* We must escape the '%' that starts RETURN_REGNUM. */
1868 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
1869 reg_names[NVPTX_RETURN_REGNUM]);
1870 return rval;
1873 return "}";
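/* A sketch of the sequence for "x = f (y)", with y in SImode register
   %r25 and the result loaded into %r26:

     {
       .param.u32 %value_in;
       .param.u32 %out_arg1;
       st.param.u32 [%out_arg1], %r25;
       call (%value_in), f, (%out_arg1);
       ld.param.u32 %r26, [%value_in];
     }  */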
1876 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
1878 static bool
1879 nvptx_print_operand_punct_valid_p (unsigned char c)
1881 return c == '.' || c == '#';
1884 static void nvptx_print_operand (FILE *, rtx, int);
1886 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1888 static void
1889 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
1891 rtx off;
1892 if (GET_CODE (x) == CONST)
1893 x = XEXP (x, 0);
1894 switch (GET_CODE (x))
1896 case PLUS:
1897 off = XEXP (x, 1);
1898 output_address (VOIDmode, XEXP (x, 0));
1899 fprintf (file, "+");
1900 output_address (VOIDmode, off);
1901 break;
1903 case SYMBOL_REF:
1904 case LABEL_REF:
1905 output_addr_const (file, x);
1906 break;
1908 default:
1909 gcc_assert (GET_CODE (x) != MEM);
1910 nvptx_print_operand (file, x, 0);
1911 break;
1915 /* Write assembly language output for the address ADDR to FILE. */
1917 static void
1918 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
1920 nvptx_print_address_operand (file, addr, mode);
1923 /* Print an operand, X, to FILE, with an optional modifier in CODE.
1925 Meaning of CODE:
1926 . -- print the predicate for the instruction or an empty string for an
1927 unconditional one.
1928 # -- print a rounding mode for the instruction
1930 A -- print a data area for a MEM
1931 c -- print an opcode suffix for a comparison operator, including a type code
1932 D -- print a data area for a MEM operand
1933 S -- print a shuffle kind specified by CONST_INT
1934 t -- print a type opcode suffix, promoting QImode to 32 bits
1935 T -- print a type size in bits
1936 u -- print a type opcode suffix without promotions. */
1938 static void
1939 nvptx_print_operand (FILE *file, rtx x, int code)
1941 if (code == '.')
1943 x = current_insn_predicate;
1944 if (x)
1946 unsigned int regno = REGNO (XEXP (x, 0));
1947 fputs ("[", file);
1948 if (GET_CODE (x) == EQ)
1949 fputs ("!", file);
1950 fputs (reg_names [regno], file);
1951 fputs ("]", file);
1953 return;
1955 else if (code == '#')
1957 fputs (".rn", file);
1958 return;
1961 enum rtx_code x_code = GET_CODE (x);
1962 machine_mode mode = GET_MODE (x);
1964 switch (code)
1966 case 'A':
1967 x = XEXP (x, 0);
1968 /* FALLTHROUGH. */
1970 case 'D':
1971 if (GET_CODE (x) == CONST)
1972 x = XEXP (x, 0);
1973 if (GET_CODE (x) == PLUS)
1974 x = XEXP (x, 0);
1976 if (GET_CODE (x) == SYMBOL_REF)
1977 fputs (section_for_sym (x), file);
1978 break;
1980 case 't':
1981 case 'u':
1982 if (x_code == SUBREG)
1984 mode = GET_MODE (SUBREG_REG (x));
1985 if (mode == TImode)
1986 mode = DImode;
1987 else if (COMPLEX_MODE_P (mode))
1988 mode = GET_MODE_INNER (mode);
1990 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
1991 break;
1993 case 'S':
1995 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
1996 /* Same order as nvptx_shuffle_kind. */
1997 static const char *const kinds[] =
1998 {".up", ".down", ".bfly", ".idx"};
1999 fputs (kinds[kind], file);
2001 break;
2003 case 'T':
2004 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2005 break;
2007 case 'j':
2008 fprintf (file, "@");
2009 goto common;
2011 case 'J':
2012 fprintf (file, "@!");
2013 goto common;
2015 case 'c':
2016 mode = GET_MODE (XEXP (x, 0));
2017 switch (x_code)
2019 case EQ:
2020 fputs (".eq", file);
2021 break;
2022 case NE:
2023 if (FLOAT_MODE_P (mode))
2024 fputs (".neu", file);
2025 else
2026 fputs (".ne", file);
2027 break;
2028 case LE:
2029 fputs (".le", file);
2030 break;
2031 case GE:
2032 fputs (".ge", file);
2033 break;
2034 case LT:
2035 fputs (".lt", file);
2036 break;
2037 case GT:
2038 fputs (".gt", file);
2039 break;
2040 case LEU:
2041 fputs (".ls", file);
2042 break;
2043 case GEU:
2044 fputs (".hs", file);
2045 break;
2046 case LTU:
2047 fputs (".lo", file);
2048 break;
2049 case GTU:
2050 fputs (".hi", file);
2051 break;
2052 case LTGT:
2053 fputs (".ne", file);
2054 break;
2055 case UNEQ:
2056 fputs (".equ", file);
2057 break;
2058 case UNLE:
2059 fputs (".leu", file);
2060 break;
2061 case UNGE:
2062 fputs (".geu", file);
2063 break;
2064 case UNLT:
2065 fputs (".ltu", file);
2066 break;
2067 case UNGT:
2068 fputs (".gtu", file);
2069 break;
2070 case UNORDERED:
2071 fputs (".nan", file);
2072 break;
2073 case ORDERED:
2074 fputs (".num", file);
2075 break;
2076 default:
2077 gcc_unreachable ();
2079 if (FLOAT_MODE_P (mode)
2080 || x_code == EQ || x_code == NE
2081 || x_code == GEU || x_code == GTU
2082 || x_code == LEU || x_code == LTU)
2083 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2084 else
2085 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2086 break;
2087 default:
2088 common:
2089 switch (x_code)
2091 case SUBREG:
2093 rtx inner_x = SUBREG_REG (x);
2094 machine_mode inner_mode = GET_MODE (inner_x);
2095 machine_mode split = maybe_split_mode (inner_mode);
2097 if (split != VOIDmode
2098 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2099 output_reg (file, REGNO (inner_x), split);
2100 else
2101 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2103 break;
2105 case REG:
2106 output_reg (file, REGNO (x), maybe_split_mode (mode));
2107 break;
2109 case MEM:
2110 fputc ('[', file);
2111 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2112 fputc (']', file);
2113 break;
2115 case CONST_INT:
2116 output_addr_const (file, x);
2117 break;
2119 case CONST:
2120 case SYMBOL_REF:
2121 case LABEL_REF:
2122 /* We could use output_addr_const, but that can print things like
2123 "x-8", which breaks ptxas. Need to ensure it is output as
2124 "x+-8". */
2125 nvptx_print_address_operand (file, x, VOIDmode);
2126 break;
2128 case CONST_DOUBLE:
2129 long vals[2];
2130 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2131 vals[0] &= 0xffffffff;
2132 vals[1] &= 0xffffffff;
2133 if (mode == SFmode)
2134 fprintf (file, "0f%08lx", vals[0]);
2135 else
2136 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2137 break;
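/* For illustration (not compiler output): ptxas wants float
   immediates as raw hex images, so the SFmode constant 1.0 prints
   as "0f3f800000" and the DFmode constant 1.0 as
   "0d3ff0000000000000".  */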
2139 default:
2140 output_addr_const (file, x);
2145 /* Record replacement regs used to deal with subreg operands. */
2146 struct reg_replace
2148 rtx replacement[MAX_RECOG_OPERANDS];
2149 machine_mode mode;
2150 int n_allocated;
2151 int n_in_use;
2154 /* Allocate or reuse a replacement in R and return the rtx. */
2156 static rtx
2157 get_replacement (struct reg_replace *r)
2159 if (r->n_allocated == r->n_in_use)
2160 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2161 return r->replacement[r->n_in_use++];
2164 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2165 the presence of subregs would break the rules for most instructions.
2166 Replace them with a suitable new register of the right size, plus
2167 conversion copyin/copyout instructions. */
2169 static void
2170 nvptx_reorg_subreg (void)
2172 struct reg_replace qiregs, hiregs, siregs, diregs;
2173 rtx_insn *insn, *next;
2175 qiregs.n_allocated = 0;
2176 hiregs.n_allocated = 0;
2177 siregs.n_allocated = 0;
2178 diregs.n_allocated = 0;
2179 qiregs.mode = QImode;
2180 hiregs.mode = HImode;
2181 siregs.mode = SImode;
2182 diregs.mode = DImode;
2184 for (insn = get_insns (); insn; insn = next)
2186 next = NEXT_INSN (insn);
2187 if (!NONDEBUG_INSN_P (insn)
2188 || asm_noperands (PATTERN (insn)) >= 0
2189 || GET_CODE (PATTERN (insn)) == USE
2190 || GET_CODE (PATTERN (insn)) == CLOBBER)
2191 continue;
2193 qiregs.n_in_use = 0;
2194 hiregs.n_in_use = 0;
2195 siregs.n_in_use = 0;
2196 diregs.n_in_use = 0;
2197 extract_insn (insn);
2198 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2200 for (int i = 0; i < recog_data.n_operands; i++)
2202 rtx op = recog_data.operand[i];
2203 if (GET_CODE (op) != SUBREG)
2204 continue;
2206 rtx inner = SUBREG_REG (op);
2208 machine_mode outer_mode = GET_MODE (op);
2209 machine_mode inner_mode = GET_MODE (inner);
2210 gcc_assert (s_ok);
2211 if (s_ok
2212 && (GET_MODE_PRECISION (inner_mode)
2213 >= GET_MODE_PRECISION (outer_mode)))
2214 continue;
2215 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2216 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2217 : outer_mode == HImode ? &hiregs
2218 : outer_mode == SImode ? &siregs
2219 : &diregs);
2220 rtx new_reg = get_replacement (r);
2222 if (recog_data.operand_type[i] != OP_OUT)
2224 enum rtx_code code;
2225 if (GET_MODE_PRECISION (inner_mode)
2226 < GET_MODE_PRECISION (outer_mode))
2227 code = ZERO_EXTEND;
2228 else
2229 code = TRUNCATE;
2231 rtx pat = gen_rtx_SET (new_reg,
2232 gen_rtx_fmt_e (code, outer_mode, inner));
2233 emit_insn_before (pat, insn);
2236 if (recog_data.operand_type[i] != OP_IN)
2238 enum rtx_code code;
2239 if (GET_MODE_PRECISION (inner_mode)
2240 < GET_MODE_PRECISION (outer_mode))
2241 code = TRUNCATE;
2242 else
2243 code = ZERO_EXTEND;
2245 rtx pat = gen_rtx_SET (inner,
2246 gen_rtx_fmt_e (code, inner_mode, new_reg));
2247 emit_insn_after (pat, insn);
2249 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
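/* As an illustrative sketch (RTL shapes hand-written here, not
   compiler dumps): a paradoxical input operand such as

	(subreg:SI (reg:QI 23) 0)

   is replaced above by a fresh SImode register with a zero-extending
   copy-in emitted before the insn,

	(set (reg:SI 40) (zero_extend:SI (reg:QI 23)))

   while an output operand instead gets a truncating copy-out after
   the insn, restoring the narrow original register.  */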
2254 /* Loop structure of the function. The entire function is described as
2255 a NULL loop. */
2257 struct parallel
2259 /* Parent parallel. */
2260 parallel *parent;
2262 /* Next sibling parallel. */
2263 parallel *next;
2265 /* First child parallel. */
2266 parallel *inner;
2268 /* Partitioning mask of the parallel. */
2269 unsigned mask;
2271 /* Partitioning used within inner parallels. */
2272 unsigned inner_mask;
2274 /* Location of parallel forked and join. The forked is the first
2275 block in the parallel and the join is the first block after
2276 the partition. */
2277 basic_block forked_block;
2278 basic_block join_block;
2280 rtx_insn *forked_insn;
2281 rtx_insn *join_insn;
2283 rtx_insn *fork_insn;
2284 rtx_insn *joining_insn;
2286 /* Basic blocks in this parallel, but not in child parallels. The
2287 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2288 blocks are not. */
2289 auto_vec<basic_block> blocks;
2291 public:
2292 parallel (parallel *parent, unsigned mode);
2293 ~parallel ();
2296 /* Constructor links the new parallel into its parent's chain of
2297 children. */
2299 parallel::parallel (parallel *parent_, unsigned mask_)
2300 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2302 forked_block = join_block = 0;
2303 forked_insn = join_insn = 0;
2304 fork_insn = joining_insn = 0;
2306 if (parent)
2308 next = parent->inner;
2309 parent->inner = this;
2313 parallel::~parallel ()
2315 delete inner;
2316 delete next;
2319 /* Map of basic blocks to insns. */
2320 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2322 /* A tuple of an insn of interest and the BB in which it resides. */
2323 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2324 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2326 /* Split basic blocks such that each forked and join unspec is at
2327 the start of its basic block. Thus afterwards each block will
2328 have a single partitioning mode. We also do the same for return
2329 insns, as they are executed by every thread. Return the
2330 partitioning mode of the function as a whole. Populate MAP with
2331 head and tail blocks. We also clear the BB visited flag, which is
2332 used when finding partitions. */
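/* For instance (illustrative): a block containing

	insn_1; forked; insn_2; return;

   is split twice, so that the forked unspec and the return insn each
   start a fresh block and every resulting block has a single
   partitioning mode.  */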
2334 static void
2335 nvptx_split_blocks (bb_insn_map_t *map)
2337 insn_bb_vec_t worklist;
2338 basic_block block;
2339 rtx_insn *insn;
2341 /* Locate all the reorg instructions of interest. */
2342 FOR_ALL_BB_FN (block, cfun)
2344 bool seen_insn = false;
2346 /* Clear visited flag, for use by parallel locator. */
2347 block->flags &= ~BB_VISITED;
2349 FOR_BB_INSNS (block, insn)
2351 if (!INSN_P (insn))
2352 continue;
2353 switch (recog_memoized (insn))
2355 default:
2356 seen_insn = true;
2357 continue;
2358 case CODE_FOR_nvptx_forked:
2359 case CODE_FOR_nvptx_join:
2360 break;
2362 case CODE_FOR_return:
2363 /* We also need to split just before return insns, as
2364 that insn needs executing by all threads, but the
2365 block it is in probably does not. */
2366 break;
2369 if (seen_insn)
2370 /* We've found an instruction that must be at the start of
2371 a block, but isn't. Add it to the worklist. */
2372 worklist.safe_push (insn_bb_t (insn, block));
2373 else
2374 /* It was already the first instruction. Just add it to
2375 the map. */
2376 map->get_or_insert (block) = insn;
2377 seen_insn = true;
2381 /* Split blocks on the worklist. */
2382 unsigned ix;
2383 insn_bb_t *elt;
2384 basic_block remap = 0;
2385 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2387 if (remap != elt->second)
2389 block = elt->second;
2390 remap = block;
2393 /* Split block before insn. The insn is in the new block. */
2394 edge e = split_block (block, PREV_INSN (elt->first));
2396 block = e->dest;
2397 map->get_or_insert (block) = elt->first;
2401 /* BLOCK is a basic block containing a head or tail instruction.
2402 Locate the associated prehead or pretail instruction, which must be
2403 in the single predecessor block. */
2405 static rtx_insn *
2406 nvptx_discover_pre (basic_block block, int expected)
2408 gcc_assert (block->preds->length () == 1);
2409 basic_block pre_block = (*block->preds)[0]->src;
2410 rtx_insn *pre_insn;
2412 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2413 pre_insn = PREV_INSN (pre_insn))
2414 gcc_assert (pre_insn != BB_HEAD (pre_block));
2416 gcc_assert (recog_memoized (pre_insn) == expected);
2417 return pre_insn;
2420 /* Dump this parallel and all its inner parallels. */
2422 static void
2423 nvptx_dump_pars (parallel *par, unsigned depth)
2425 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2426 depth, par->mask,
2427 par->forked_block ? par->forked_block->index : -1,
2428 par->join_block ? par->join_block->index : -1);
2430 fprintf (dump_file, " blocks:");
2432 basic_block block;
2433 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2434 fprintf (dump_file, " %d", block->index);
2435 fprintf (dump_file, "\n");
2436 if (par->inner)
2437 nvptx_dump_pars (par->inner, depth + 1);
2439 if (par->next)
2440 nvptx_dump_pars (par->next, depth);
2443 /* If BLOCK contains a fork/join marker, process it to create or
2444 terminate a loop structure. Add this block to the current loop,
2445 and then walk successor blocks. */
2447 static parallel *
2448 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2450 if (block->flags & BB_VISITED)
2451 return par;
2452 block->flags |= BB_VISITED;
2454 if (rtx_insn **endp = map->get (block))
2456 rtx_insn *end = *endp;
2458 /* This is a block head or tail, or return instruction. */
2459 switch (recog_memoized (end))
2461 case CODE_FOR_return:
2462 /* Return instructions are in their own block, and we
2463 don't need to do anything more. */
2464 return par;
2466 case CODE_FOR_nvptx_forked:
2467 /* Loop head, create a new inner loop and add it into
2468 our parent's child list. */
2470 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2472 gcc_assert (mask);
2473 par = new parallel (par, mask);
2474 par->forked_block = block;
2475 par->forked_insn = end;
2476 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2477 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2478 par->fork_insn
2479 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2481 break;
2483 case CODE_FOR_nvptx_join:
2484 /* A loop tail. Finish the current loop and return to
2485 parent. */
2487 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2489 gcc_assert (par->mask == mask);
2490 par->join_block = block;
2491 par->join_insn = end;
2492 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2493 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2494 par->joining_insn
2495 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2496 par = par->parent;
2498 break;
2500 default:
2501 gcc_unreachable ();
2505 if (par)
2506 /* Add this block onto the current loop's list of blocks. */
2507 par->blocks.safe_push (block);
2508 else
2509 /* This must be the entry block. Create a NULL parallel. */
2510 par = new parallel (0, 0);
2512 /* Walk successor blocks. */
2513 edge e;
2514 edge_iterator ei;
2516 FOR_EACH_EDGE (e, ei, block->succs)
2517 nvptx_find_par (map, par, e->dest);
2519 return par;
2522 /* DFS walk the CFG looking for fork & join markers. Construct
2523 loop structures as we go. MAP is a mapping of basic blocks
2524 to head & tail markers, discovered when splitting blocks. This
2525 speeds up the discovery. We rely on the BB visited flag having
2526 been cleared when splitting blocks. */
2528 static parallel *
2529 nvptx_discover_pars (bb_insn_map_t *map)
2531 basic_block block;
2533 /* Mark exit blocks as visited. */
2534 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2535 block->flags |= BB_VISITED;
2537 /* And entry block as not. */
2538 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2539 block->flags &= ~BB_VISITED;
2541 parallel *par = nvptx_find_par (map, 0, block);
2543 if (dump_file)
2545 fprintf (dump_file, "\nLoops\n");
2546 nvptx_dump_pars (par, 0);
2547 fprintf (dump_file, "\n");
2550 return par;
2553 /* Analyse a group of BBs within a partitioned region and create N
2554 Single-Entry-Single-Exit regions. Some of those regions will be
2555 trivial ones consisting of a single BB. The blocks of a
2556 partitioned region might form a set of disjoint graphs -- because
2557 the region encloses a differently partitioned sub-region.
2559 We use the linear time algorithm described in 'Finding Regions Fast:
2560 Single Entry Single Exit and Control Regions in Linear Time'
2561 Johnson, Pearson & Pingali. That algorithm deals with complete
2562 CFGs, where a back edge is inserted from END to START, and thus the
2563 problem becomes one of finding equivalent loops.
2565 In this case we have a partial CFG. We complete it by redirecting
2566 any incoming edge to the graph to be from an arbitrary external BB,
2567 and similarly redirecting any outgoing edge to be to that BB.
2568 Thus we end up with a closed graph.
2570 The algorithm works by building a spanning tree of an undirected
2571 graph and keeping track of back edges from nodes further from the
2572 root in the tree to nodes nearer to the root in the tree. In the
2573 description below, the root is up and the tree grows downwards.
2575 We avoid having to deal with degenerate back-edges to the same
2576 block, by splitting each BB into 3 -- one for input edges, one for
2577 the node itself and one for the output edges. Such back edges are
2578 referred to as 'Brackets'. Cycle equivalent nodes will have the
2579 same set of brackets.
2581 Determining bracket equivalency is done by maintaining a list of
2582 brackets in such a manner that the list length and final bracket
2583 uniquely identify the set.
2585 We use coloring to mark all BBs with cycle equivalency with the
2586 same color. This is the output of the 'Finding Regions Fast'
2587 algorithm. Notice it doesn't actually find the set of nodes within
2588 a particular region, just unordered sets of nodes that are the
2589 entries and exits of SESE regions.
2591 After determining cycle equivalency, we need to find the minimal
2592 set of SESE regions. Do this with a DFS coloring walk of the
2593 complete graph. We're either 'looking' or 'coloring'. When
2594 looking, and we're in the subgraph, we start coloring the color of
2595 the current node, and remember that node as the start of the
2596 current color's SESE region. Every time we go to a new node, we
2597 decrement the count of nodes with that color. If it reaches zero,
2598 we remember that node as the end of the current color's SESE region
2599 and return to 'looking'. Otherwise we color the node the current
2600 color.
2602 This way we end up with coloring the inside of non-trivial SESE
2603 regions with the color of that region. */
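/* A worked micro-example (illustrative only): complete the diamond
   CFG A->B, A->C, B->D, C->D with the fake back edge D->A. Every
   cycle in the completed graph passes through both A and D, so the
   two nodes carry identical bracket lists and receive the same
   color; (A,D) thus delimits one non-trivial SESE region, while B
   and C only form trivial single-block regions.  */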
2605 /* A pair of BBs. We use this to represent SESE regions. */
2606 typedef std::pair<basic_block, basic_block> bb_pair_t;
2607 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2609 /* A node in the undirected CFG. The discriminator SECOND indicates just
2610 above or just below the BB indicated by FIRST. */
2611 typedef std::pair<basic_block, int> pseudo_node_t;
2613 /* A bracket indicates an edge towards the root of the spanning tree of the
2614 undirected graph. Each bracket has a color, determined
2615 from the current set of brackets. */
2616 struct bracket
2618 pseudo_node_t back; /* Back target. */
2620 /* Current color and size of set. */
2621 unsigned color;
2622 unsigned size;
2624 bracket (pseudo_node_t back_)
2625 : back (back_), color (~0u), size (~0u)
2629 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2631 if (length != size)
2633 size = length;
2634 color = color_counts.length ();
2635 color_counts.quick_push (0);
2637 color_counts[color]++;
2638 return color;
2642 typedef auto_vec<bracket> bracket_vec_t;
2644 /* Basic block info for finding SESE regions. */
2646 struct bb_sese
2648 int node; /* Node number in spanning tree. */
2649 int parent; /* Parent node number. */
2651 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2652 edges arrive at pseudo-node Ai and the outgoing edges leave at
2653 pseudo-node Ao. We have to remember which way we arrived at a
2654 particular node when generating the spanning tree. dir > 0 means
2655 we arrived at Ai, dir < 0 means we arrived at Ao. */
2656 int dir;
2658 /* Lowest numbered pseudo-node reached via a backedge from this
2659 node, or any descendant. */
2660 pseudo_node_t high;
2662 int color; /* Cycle-equivalence color. */
2664 /* Stack of brackets for this node. */
2665 bracket_vec_t brackets;
2667 bb_sese (unsigned node_, unsigned p, int dir_)
2668 :node (node_), parent (p), dir (dir_)
2671 ~bb_sese ();
2673 /* Push a bracket ending at BACK. */
2674 void push (const pseudo_node_t &back)
2676 if (dump_file)
2677 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2678 back.first ? back.first->index : 0, back.second);
2679 brackets.safe_push (bracket (back));
2682 void append (bb_sese *child);
2683 void remove (const pseudo_node_t &);
2685 /* Set node's color. */
2686 void set_color (auto_vec<unsigned> &color_counts)
2688 color = brackets.last ().get_color (color_counts, brackets.length ());
2692 bb_sese::~bb_sese ()
2696 /* Destructively append CHILD's brackets. */
2698 void
2699 bb_sese::append (bb_sese *child)
2701 if (int len = child->brackets.length ())
2703 int ix;
2705 if (dump_file)
2707 for (ix = 0; ix < len; ix++)
2709 const pseudo_node_t &pseudo = child->brackets[ix].back;
2710 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2711 child->node, pseudo.first ? pseudo.first->index : 0,
2712 pseudo.second);
2715 if (!brackets.length ())
2716 std::swap (brackets, child->brackets);
2717 else
2719 brackets.reserve (len);
2720 for (ix = 0; ix < len; ix++)
2721 brackets.quick_push (child->brackets[ix]);
2726 /* Remove brackets that terminate at PSEUDO. */
2728 void
2729 bb_sese::remove (const pseudo_node_t &pseudo)
2731 unsigned removed = 0;
2732 int len = brackets.length ();
2734 for (int ix = 0; ix < len; ix++)
2736 if (brackets[ix].back == pseudo)
2738 if (dump_file)
2739 fprintf (dump_file, "Removing backedge %d:%+d\n",
2740 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2741 removed++;
2743 else if (removed)
2744 brackets[ix-removed] = brackets[ix];
2746 while (removed--)
2747 brackets.pop ();
2750 /* Accessors for BB's aux pointer. */
2751 #define BB_SET_SESE(B, S) ((B)->aux = (S))
2752 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2754 /* DFS walk creating SESE data structures. Only cover nodes with
2755 BB_VISITED set. Append discovered blocks to LIST. We number in
2756 increments of 3 so that the above and below pseudo nodes can be
2757 implicitly numbered too. */
2759 static int
2760 nvptx_sese_number (int n, int p, int dir, basic_block b,
2761 auto_vec<basic_block> *list)
2763 if (BB_GET_SESE (b))
2764 return n;
2766 if (dump_file)
2767 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2768 b->index, n, p, dir);
2770 BB_SET_SESE (b, new bb_sese (n, p, dir));
2771 p = n;
2773 n += 3;
2774 list->quick_push (b);
2776 /* First walk the nodes on the 'other side' of this node, then walk
2777 the nodes on the same side. */
2778 for (unsigned ix = 2; ix; ix--)
2780 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2781 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2782 : offsetof (edge_def, src));
2783 edge e;
2784 edge_iterator ei;
2786 FOR_EACH_EDGE (e, ei, edges)
2788 basic_block target = *(basic_block *)((char *)e + offset);
2790 if (target->flags & BB_VISITED)
2791 n = nvptx_sese_number (n, p, dir, target, list);
2793 dir = -dir;
2795 return n;
2798 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2799 EDGES are the outgoing edges and OFFSET is the offset to the src
2800 or dst block on the edges. */
2802 static void
2803 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2804 vec<edge, va_gc> *edges, size_t offset)
2806 edge e;
2807 edge_iterator ei;
2808 int hi_back = depth;
2809 pseudo_node_t node_back (0, depth);
2810 int hi_child = depth;
2811 pseudo_node_t node_child (0, depth);
2812 basic_block child = NULL;
2813 unsigned num_children = 0;
2814 int usd = -dir * sese->dir;
2816 if (dump_file)
2817 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2818 me->index, sese->node, dir);
2820 if (dir < 0)
2822 /* This is the above pseudo-child. It has the BB itself as an
2823 additional child node. */
2824 node_child = sese->high;
2825 hi_child = node_child.second;
2826 if (node_child.first)
2827 hi_child += BB_GET_SESE (node_child.first)->node;
2828 num_children++;
2831 /* Examine each edge.
2832 - if it is a child (a) append its bracket list and (b) record
2833 whether it is the child with the highest reaching bracket.
2834 - if it is an edge to an ancestor, record whether it's the highest
2835 reaching backlink. */
2836 FOR_EACH_EDGE (e, ei, edges)
2838 basic_block target = *(basic_block *)((char *)e + offset);
2840 if (bb_sese *t_sese = BB_GET_SESE (target))
2842 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2844 /* Child node. Append its bracket list. */
2845 num_children++;
2846 sese->append (t_sese);
2848 /* Compare its hi value. */
2849 int t_hi = t_sese->high.second;
2851 if (basic_block child_hi_block = t_sese->high.first)
2852 t_hi += BB_GET_SESE (child_hi_block)->node;
2854 if (hi_child > t_hi)
2856 hi_child = t_hi;
2857 node_child = t_sese->high;
2858 child = target;
2861 else if (t_sese->node < sese->node + dir
2862 && !(dir < 0 && sese->parent == t_sese->node))
2864 /* Non-parental ancestor node -- a backlink. */
2865 int d = usd * t_sese->dir;
2866 int back = t_sese->node + d;
2868 if (hi_back > back)
2870 hi_back = back;
2871 node_back = pseudo_node_t (target, d);
2875 else
2876 { /* Fallen off graph, backlink to entry node. */
2877 hi_back = 0;
2878 node_back = pseudo_node_t (0, 0);
2882 /* Remove any brackets that terminate at this pseudo node. */
2883 sese->remove (pseudo_node_t (me, dir));
2885 /* Now push any backlinks from this pseudo node. */
2886 FOR_EACH_EDGE (e, ei, edges)
2888 basic_block target = *(basic_block *)((char *)e + offset);
2889 if (bb_sese *t_sese = BB_GET_SESE (target))
2891 if (t_sese->node < sese->node + dir
2892 && !(dir < 0 && sese->parent == t_sese->node))
2893 /* Non-parental ancestor node - backedge from me. */
2894 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2896 else
2898 /* Back edge to entry node. */
2899 sese->push (pseudo_node_t (0, 0));
2903 /* If this node leads directly or indirectly to a no-return region of
2904 the graph, then fake a backedge to the entry node. */
2905 if (!sese->brackets.length () || !edges || !edges->length ())
2907 hi_back = 0;
2908 node_back = pseudo_node_t (0, 0);
2909 sese->push (node_back);
2912 /* Record the highest reaching backedge from us or a descendant. */
2913 sese->high = hi_back < hi_child ? node_back : node_child;
2915 if (num_children > 1)
2917 /* There is more than one child -- this is a Y-shaped piece of the
2918 spanning tree. We have to insert a fake backedge from this
2919 node to the highest ancestor reached by not-the-highest
2920 reaching child. Note that there may be multiple children
2921 with backedges to the same highest node. That's ok and we
2922 insert the edge to that highest node. */
2923 hi_child = depth;
2924 if (dir < 0 && child)
2926 node_child = sese->high;
2927 hi_child = node_child.second;
2928 if (node_child.first)
2929 hi_child += BB_GET_SESE (node_child.first)->node;
2932 FOR_EACH_EDGE (e, ei, edges)
2934 basic_block target = *(basic_block *)((char *)e + offset);
2936 if (target == child)
2937 /* Ignore the highest child. */
2938 continue;
2940 bb_sese *t_sese = BB_GET_SESE (target);
2941 if (!t_sese)
2942 continue;
2943 if (t_sese->parent != sese->node)
2944 /* Not a child. */
2945 continue;
2947 /* Compare its hi value. */
2948 int t_hi = t_sese->high.second;
2950 if (basic_block child_hi_block = t_sese->high.first)
2951 t_hi += BB_GET_SESE (child_hi_block)->node;
2953 if (hi_child > t_hi)
2955 hi_child = t_hi;
2956 node_child = t_sese->high;
2960 sese->push (node_child);
2965 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
2966 proceed to successors. Set SESE entry and exit nodes of
2967 REGIONS. */
2969 static void
2970 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
2971 basic_block block, int coloring)
2973 bb_sese *sese = BB_GET_SESE (block);
2975 if (block->flags & BB_VISITED)
2977 /* If we've already encountered this block, either we must not
2978 be coloring, or it must have been colored the current color. */
2979 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
2980 return;
2983 block->flags |= BB_VISITED;
2985 if (sese)
2987 if (coloring < 0)
2989 /* Start coloring a region. */
2990 regions[sese->color].first = block;
2991 coloring = sese->color;
2994 if (!--color_counts[sese->color] && sese->color == coloring)
2996 /* Found final block of SESE region. */
2997 regions[sese->color].second = block;
2998 coloring = -1;
3000 else
3001 /* Color the node, so we can assert on revisiting the node
3002 that the graph is indeed SESE. */
3003 sese->color = coloring;
3005 else
3006 /* Fallen off the subgraph, we cannot be coloring. */
3007 gcc_assert (coloring < 0);
3009 /* Walk each successor block. */
3010 if (block->succs && block->succs->length ())
3012 edge e;
3013 edge_iterator ei;
3015 FOR_EACH_EDGE (e, ei, block->succs)
3016 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3018 else
3019 gcc_assert (coloring < 0);
3022 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3023 end up with NULL entries in it. */
3025 static void
3026 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3028 basic_block block;
3029 int ix;
3031 /* First clear each BB of the whole function. */
3032 FOR_EACH_BB_FN (block, cfun)
3034 block->flags &= ~BB_VISITED;
3035 BB_SET_SESE (block, 0);
3037 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3038 block->flags &= ~BB_VISITED;
3039 BB_SET_SESE (block, 0);
3040 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3041 block->flags &= ~BB_VISITED;
3042 BB_SET_SESE (block, 0);
3044 /* Mark blocks in the function that are in this graph. */
3045 for (ix = 0; blocks.iterate (ix, &block); ix++)
3046 block->flags |= BB_VISITED;
3048 /* Counts of nodes assigned to each color. There cannot be more
3049 colors than blocks (and hopefully there will be fewer). */
3050 auto_vec<unsigned> color_counts;
3051 color_counts.reserve (blocks.length ());
3053 /* Worklist of nodes in the spanning tree. Again, there cannot be
3054 more nodes in the tree than blocks (there will be fewer if the
3055 CFG of blocks is disjoint). */
3056 auto_vec<basic_block> spanlist;
3057 spanlist.reserve (blocks.length ());
3059 /* Make sure every block has its cycle class determined. */
3060 for (ix = 0; blocks.iterate (ix, &block); ix++)
3062 if (BB_GET_SESE (block))
3063 /* We already met this block in an earlier graph solve. */
3064 continue;
3066 if (dump_file)
3067 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3069 /* Number the nodes reachable from BLOCK in initial DFS order. */
3070 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3072 /* Now walk in reverse DFS order to find cycle equivalents. */
3073 while (spanlist.length ())
3075 block = spanlist.pop ();
3076 bb_sese *sese = BB_GET_SESE (block);
3078 /* Do the pseudo node below. */
3079 nvptx_sese_pseudo (block, sese, depth, +1,
3080 sese->dir > 0 ? block->succs : block->preds,
3081 (sese->dir > 0 ? offsetof (edge_def, dest)
3082 : offsetof (edge_def, src)));
3083 sese->set_color (color_counts);
3084 /* Do the pseudo node above. */
3085 nvptx_sese_pseudo (block, sese, depth, -1,
3086 sese->dir < 0 ? block->succs : block->preds,
3087 (sese->dir < 0 ? offsetof (edge_def, dest)
3088 : offsetof (edge_def, src)));
3090 if (dump_file)
3091 fprintf (dump_file, "\n");
3094 if (dump_file)
3096 unsigned count;
3097 const char *comma = "";
3099 fprintf (dump_file, "Found %d cycle equivalents\n",
3100 color_counts.length ());
3101 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3103 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3105 comma = "";
3106 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3107 if (BB_GET_SESE (block)->color == ix)
3109 block->flags |= BB_VISITED;
3110 fprintf (dump_file, "%s%d", comma, block->index);
3111 comma = ",";
3113 fprintf (dump_file, "}");
3114 comma = ", ";
3116 fprintf (dump_file, "\n");
3119 /* Now we've colored every block in the subgraph. We now need to
3120 determine the minimal set of SESE regions that cover that
3121 subgraph. Do this with a DFS walk of the complete function.
3122 During the walk we're either 'looking' or 'coloring'. When we
3123 reach the last node of a particular color, we stop coloring and
3124 return to looking. */
3126 /* There cannot be more SESE regions than colors. */
3127 regions.reserve (color_counts.length ());
3128 for (ix = color_counts.length (); ix--;)
3129 regions.quick_push (bb_pair_t (0, 0));
3131 for (ix = 0; blocks.iterate (ix, &block); ix++)
3132 block->flags &= ~BB_VISITED;
3134 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3136 if (dump_file)
3138 const char *comma = "";
3139 int len = regions.length ();
3141 fprintf (dump_file, "SESE regions:");
3142 for (ix = 0; ix != len; ix++)
3144 basic_block from = regions[ix].first;
3145 basic_block to = regions[ix].second;
3147 if (from)
3149 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3150 if (to != from)
3151 fprintf (dump_file, "->%d", to->index);
3153 int color = BB_GET_SESE (from)->color;
3155 /* Print the blocks within the region (excluding ends). */
3156 FOR_EACH_BB_FN (block, cfun)
3158 bb_sese *sese = BB_GET_SESE (block);
3160 if (sese && sese->color == color
3161 && block != from && block != to)
3162 fprintf (dump_file, ".%d", block->index);
3164 fprintf (dump_file, "}");
3166 comma = ",";
3168 fprintf (dump_file, "\n\n");
3171 for (ix = 0; blocks.iterate (ix, &block); ix++)
3172 delete BB_GET_SESE (block);
3175 #undef BB_SET_SESE
3176 #undef BB_GET_SESE
3178 /* Propagate live state at the start of a partitioned region. BLOCK
3179 provides the live register information, and might not contain
3180 INSN. Propagation is inserted just after INSN. RW indicates whether
3181 we are reading and/or writing state. This
3182 separation is needed for worker-level propagation, where we
3183 essentially do a spill & fill. FN is the underlying worker
3184 function to generate the propagation instructions for a single
3185 register. DATA is user data.
3187 We propagate the live register set and the entire frame. We could
3188 do better by (a) propagating just the live set that is used within
3189 the partitioned regions and (b) only propagating stack entries that
3190 are used. The latter might be quite hard to determine. */
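/* Schematically (a sketch of the sequence constructed below, not
   literal emitted RTL), for a frame of FS doublewords:

	ptr = frame_pointer;  idx = FS;
   loop:
	idx -= 1;
	tmp = *ptr;			// only if RW includes PM_read
	<propagate tmp via FN>		// e.g. shuffle or shared-mem copy
	*ptr = tmp;			// only if RW includes PM_write
	pred = (idx != 0);  ptr += 8;
	if (pred) goto loop;  */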
3192 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3194 static void
3195 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3196 propagator_fn fn, void *data)
3198 bitmap live = DF_LIVE_IN (block);
3199 bitmap_iterator iterator;
3200 unsigned ix;
3202 /* Copy the frame array. */
3203 HOST_WIDE_INT fs = get_frame_size ();
3204 if (fs)
3206 rtx tmp = gen_reg_rtx (DImode);
3207 rtx idx = NULL_RTX;
3208 rtx ptr = gen_reg_rtx (Pmode);
3209 rtx pred = NULL_RTX;
3210 rtx_code_label *label = NULL;
3212 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3213 fs /= GET_MODE_SIZE (DImode);
3214 /* Detect single iteration loop. */
3215 if (fs == 1)
3216 fs = 0;
3218 start_sequence ();
3219 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3220 if (fs)
3222 idx = gen_reg_rtx (SImode);
3223 pred = gen_reg_rtx (BImode);
3224 label = gen_label_rtx ();
3226 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3227 /* Allow worker function to initialize anything needed. */
3228 rtx init = fn (tmp, PM_loop_begin, fs, data);
3229 if (init)
3230 emit_insn (init);
3231 emit_label (label);
3232 LABEL_NUSES (label)++;
3233 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3235 if (rw & PM_read)
3236 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3237 emit_insn (fn (tmp, rw, fs, data));
3238 if (rw & PM_write)
3239 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3240 if (fs)
3242 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3243 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3244 emit_insn (gen_br_true_uni (pred, label));
3245 rtx fini = fn (tmp, PM_loop_end, fs, data);
3246 if (fini)
3247 emit_insn (fini);
3248 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3250 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3251 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3252 rtx cpy = get_insns ();
3253 end_sequence ();
3254 insn = emit_insn_after (cpy, insn);
3257 /* Copy live registers. */
3258 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3260 rtx reg = regno_reg_rtx[ix];
3262 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3264 rtx bcast = fn (reg, rw, 0, data);
3266 insn = emit_insn_after (bcast, insn);
3271 /* Worker for nvptx_vpropagate. */
3273 static rtx
3274 vprop_gen (rtx reg, propagate_mask pm,
3275 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3277 if (!(pm & PM_read_write))
3278 return 0;
3280 return nvptx_gen_vcast (reg);
3283 /* Propagate state that is live at start of BLOCK across the vectors
3284 of a single warp. Propagation is inserted just after INSN. */
3286 static void
3287 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3289 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3292 /* Worker for nvptx_wpropagate. */
3294 static rtx
3295 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3297 wcast_data_t *data = (wcast_data_t *)data_;
3299 if (pm & PM_loop_begin)
3301 /* Starting a loop, initialize pointer. */
3302 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3304 if (align > worker_bcast_align)
3305 worker_bcast_align = align;
3306 data->offset = (data->offset + align - 1) & ~(align - 1);
3308 data->ptr = gen_reg_rtx (Pmode);
3310 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3312 else if (pm & PM_loop_end)
3314 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3315 data->ptr = NULL_RTX;
3316 return clobber;
3318 else
3319 return nvptx_gen_wcast (reg, pm, rep, data);
3322 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3323 indicates if this is just before partitioned mode (do spill), or
3324 just after it starts (do fill). Sequence is inserted just after
3325 INSN. */
3327 static void
3328 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3330 wcast_data_t data;
3332 data.base = gen_reg_rtx (Pmode);
3333 data.offset = 0;
3334 data.ptr = NULL_RTX;
3336 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3337 if (data.offset)
3339 /* Stuff was emitted, initialize the base pointer now. */
3340 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3341 emit_insn_after (init, insn);
3343 if (worker_bcast_size < data.offset)
3344 worker_bcast_size = data.offset;
3348 /* Emit a worker-level synchronization barrier. We use different
3349 markers for before and after synchronizations. */
3351 static rtx
3352 nvptx_wsync (bool after)
3354 return gen_nvptx_barsync (GEN_INT (after));
3357 /* Single neutering according to MASK. FROM is the incoming block and
3358 TO is the outgoing block. These may be the same block. Insert at
3359 start of FROM:
3361 if (tid.<axis>) goto end.
3363 and insert before ending branch of TO (if there is such an insn):
3365 end:
3366 <possibly-broadcast-cond>
3367 <branch>
3369 We currently only use different FROM and TO when skipping an entire
3370 loop. We could do more if we detected superblocks. */
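/* As a hedged sketch of the resulting PTX (assuming the usual
   tid.y == worker, tid.x == vector mapping; not verbatim output):

	@ %wpred bra $end;	// %wpred tests tid.y != 0
	...			// single-mode code, worker 0 only
   $end:

   with the analogous tid.x test for vector-level neutering.  */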
3372 static void
3373 nvptx_single (unsigned mask, basic_block from, basic_block to)
3375 rtx_insn *head = BB_HEAD (from);
3376 rtx_insn *tail = BB_END (to);
3377 unsigned skip_mask = mask;
3379 /* Find first insn of the FROM block. */
3380 while (head != BB_END (from) && !INSN_P (head))
3381 head = NEXT_INSN (head);
3383 /* Find last insn of the TO block. */
3384 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3385 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3386 tail = PREV_INSN (tail);
3388 /* Detect if tail is a branch. */
3389 rtx tail_branch = NULL_RTX;
3390 rtx cond_branch = NULL_RTX;
3391 if (tail && INSN_P (tail))
3393 tail_branch = PATTERN (tail);
3394 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3395 tail_branch = NULL_RTX;
3396 else
3398 cond_branch = SET_SRC (tail_branch);
3399 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3400 cond_branch = NULL_RTX;
3404 if (tail == head)
3406 /* If this is empty, do nothing. */
3407 if (!head || !INSN_P (head))
3408 return;
3410 /* If this is a dummy insn, do nothing. */
3411 switch (recog_memoized (head))
3413 default:
3414 break;
3415 case CODE_FOR_nvptx_fork:
3416 case CODE_FOR_nvptx_forked:
3417 case CODE_FOR_nvptx_joining:
3418 case CODE_FOR_nvptx_join:
3419 return;
3422 if (cond_branch)
3424 /* If we're only doing vector single, there's no need to
3425 emit skip code because we'll not insert anything. */
3426 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3427 skip_mask = 0;
3429 else if (tail_branch)
3430 /* Block with only unconditional branch. Nothing to do. */
3431 return;
3434 /* Insert the vector test inside the worker test. */
3435 unsigned mode;
3436 rtx_insn *before = tail;
3437 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3438 if (GOMP_DIM_MASK (mode) & skip_mask)
3440 rtx_code_label *label = gen_label_rtx ();
3441 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3443 if (!pred)
3445 pred = gen_reg_rtx (BImode);
3446 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3449 rtx br;
3450 if (mode == GOMP_DIM_VECTOR)
3451 br = gen_br_true (pred, label);
3452 else
3453 br = gen_br_true_uni (pred, label);
3454 emit_insn_before (br, head);
3456 LABEL_NUSES (label)++;
3457 if (tail_branch)
3458 before = emit_label_before (label, before);
3459 else
3460 emit_label_after (label, tail);
3463 /* Now deal with propagating the branch condition. */
3464 if (cond_branch)
3466 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3468 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3470 /* Vector mode only, do a shuffle. */
3471 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3473 else
3475 /* Includes worker mode, do spill & fill. By construction
3476 we should never have worker mode only. */
3477 wcast_data_t data;
3479 data.base = worker_bcast_sym;
3480 data.ptr = 0;
3482 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3483 worker_bcast_size = GET_MODE_SIZE (SImode);
3485 data.offset = 0;
3486 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3487 before);
3488 /* Barrier so other workers can see the write. */
3489 emit_insn_before (nvptx_wsync (false), tail);
3490 data.offset = 0;
3491 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3492 /* This barrier is needed to avoid worker zero clobbering
3493 the broadcast buffer before all the other workers have
3494 had a chance to read this instance of it. */
3495 emit_insn_before (nvptx_wsync (true), tail);
3498 extract_insn (tail);
3499 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3500 UNSPEC_BR_UNIFIED);
3501 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3505 /* PAR is a parallel that is being skipped in its entirety according to
3506 MASK. Treat this as skipping a superblock starting at forked
3507 and ending at joining. */
3509 static void
3510 nvptx_skip_par (unsigned mask, parallel *par)
3512 basic_block tail = par->join_block;
3513 gcc_assert (tail->preds->length () == 1);
3515 basic_block pre_tail = (*tail->preds)[0]->src;
3516 gcc_assert (pre_tail->succs->length () == 1);
3518 nvptx_single (mask, par->forked_block, pre_tail);
3521 /* If PAR has a single inner parallel and PAR itself only contains
3522 empty entry and exit blocks, swallow the inner PAR. */
3524 static void
3525 nvptx_optimize_inner (parallel *par)
3527 parallel *inner = par->inner;
3529 /* We mustn't be the outer dummy par. */
3530 if (!par->mask)
3531 return;
3533 /* We must have a single inner par. */
3534 if (!inner || inner->next)
3535 return;
3537 /* We must only contain 2 blocks ourselves -- the head and tail of
3538 the inner par. */
3539 if (par->blocks.length () != 2)
3540 return;
3542 /* The partitionings must be disjoint. As we only have vector and
3543 worker partitioning, this is sufficient to guarantee the pars
3544 have adjacent partitioning. */
3545 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3546 /* This indicates malformed code generation. */
3547 return;
3549 /* The outer forked insn should be immediately followed by the inner
3550 fork insn. */
3551 rtx_insn *forked = par->forked_insn;
3552 rtx_insn *fork = BB_END (par->forked_block);
3554 if (NEXT_INSN (forked) != fork)
3555 return;
3556 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3558 /* The outer joining insn must immediately follow the inner join
3559 insn. */
3560 rtx_insn *joining = par->joining_insn;
3561 rtx_insn *join = inner->join_insn;
3562 if (NEXT_INSN (join) != joining)
3563 return;
3565 /* Preconditions met. Swallow the inner par. */
3566 if (dump_file)
3567 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3568 inner->mask, inner->forked_block->index,
3569 inner->join_block->index,
3570 par->mask, par->forked_block->index, par->join_block->index);
3572 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3574 par->blocks.reserve (inner->blocks.length ());
3575 while (inner->blocks.length ())
3576 par->blocks.quick_push (inner->blocks.pop ());
3578 par->inner = inner->inner;
3579 inner->inner = NULL;
3581 delete inner;
3584 /* Process the parallel PAR and all its contained
3585 parallels. We do everything but the neutering. Return mask of
3586 partitioned modes used within this parallel. */
3588 static unsigned
3589 nvptx_process_pars (parallel *par)
3591 if (nvptx_optimize)
3592 nvptx_optimize_inner (par);
3594 unsigned inner_mask = par->mask;
3596 /* Do the inner parallels first. */
3597 if (par->inner)
3599 par->inner_mask = nvptx_process_pars (par->inner);
3600 inner_mask |= par->inner_mask;
3603 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3604 /* No propagation needed for a call. */;
3605 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3607 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3608 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3609 /* Insert begin and end synchronizations. */
3610 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3611 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3613 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3614 nvptx_vpropagate (par->forked_block, par->forked_insn);
3616 /* Now do siblings. */
3617 if (par->next)
3618 inner_mask |= nvptx_process_pars (par->next);
3619 return inner_mask;
3622 /* Neuter the parallel described by PAR. We recurse in depth-first
3623 order. MODES are the partitioning of the execution and OUTER is
3624 the partitioning of the parallels we are contained in. */
3626 static void
3627 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3629 unsigned me = (par->mask
3630 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3631 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3632 unsigned skip_mask = 0, neuter_mask = 0;
3634 if (par->inner)
3635 nvptx_neuter_pars (par->inner, modes, outer | me);
3637 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3639 if ((outer | me) & GOMP_DIM_MASK (mode))
3640 {} /* Mode is partitioned: no neutering. */
3641 else if (!(modes & GOMP_DIM_MASK (mode)))
3642 {} /* Mode is not used: nothing to do. */
3643 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3644 || !par->forked_insn)
3645 /* Partitioned in inner parallels, or we're not partitioned
3646 at all: neuter individual blocks. */
3647 neuter_mask |= GOMP_DIM_MASK (mode);
3648 else if (!par->parent || !par->parent->forked_insn
3649 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3650 /* Parent isn't a parallel, or already contains this partitioning:
3651 skip the parallel at this level. */
3652 skip_mask |= GOMP_DIM_MASK (mode);
3653 else
3654 {} /* Parent will skip this parallel itself. */
3657 if (neuter_mask)
3659 int ix, len;
3661 if (nvptx_optimize)
3663 /* Neuter whole SESE regions. */
3664 bb_pair_vec_t regions;
3666 nvptx_find_sese (par->blocks, regions);
3667 len = regions.length ();
3668 for (ix = 0; ix != len; ix++)
3670 basic_block from = regions[ix].first;
3671 basic_block to = regions[ix].second;
3673 if (from)
3674 nvptx_single (neuter_mask, from, to);
3675 else
3676 gcc_assert (!to);
3679 else
3681 /* Neuter each BB individually. */
3682 len = par->blocks.length ();
3683 for (ix = 0; ix != len; ix++)
3685 basic_block block = par->blocks[ix];
3687 nvptx_single (neuter_mask, block, block);
3692 if (skip_mask)
3693 nvptx_skip_par (skip_mask, par);
3695 if (par->next)
3696 nvptx_neuter_pars (par->next, modes, outer);
3699 /* PTX-specific reorganization
3700 - Split blocks at fork and join instructions
3701 - Compute live registers
3702 - Mark now-unused registers, so function begin doesn't declare
3703 unused registers.
3704 - Insert state propagation when entering partitioned mode
3705 - Insert neutering instructions when in single mode
3706 - Replace subregs with suitable sequences.
3709 static void
3710 nvptx_reorg (void)
3712 /* We are freeing block_for_insn in the toplev to keep compatibility
3713 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3714 compute_bb_for_insn ();
3716 thread_prologue_and_epilogue_insns ();
3718 /* Split blocks and record interesting unspecs. */
3719 bb_insn_map_t bb_insn_map;
3721 nvptx_split_blocks (&bb_insn_map);
3723 /* Compute live regs. */
3724 df_clear_flags (DF_LR_RUN_DCE);
3725 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3726 df_live_add_problem ();
3727 df_live_set_all_dirty ();
3728 df_analyze ();
3729 regstat_init_n_sets_and_refs ();
3731 if (dump_file)
3732 df_dump (dump_file);
3734 /* Mark unused regs as unused. */
3735 int max_regs = max_reg_num ();
3736 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3737 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3738 regno_reg_rtx[i] = const0_rtx;
3740 /* Determine launch dimensions of the function. If it is not an
3741 offloaded function (i.e. this is a regular compiler), the
3742 function has no neutering. */
3743 tree attr = get_oacc_fn_attrib (current_function_decl);
3744 if (attr)
3746 /* If we determined this mask before RTL expansion, we could
3747 elide emission of some levels of forks and joins. */
3748 unsigned mask = 0;
3749 tree dims = TREE_VALUE (attr);
3750 unsigned ix;
3752 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3754 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3755 tree allowed = TREE_PURPOSE (dims);
3757 if (size != 1 && !(allowed && integer_zerop (allowed)))
3758 mask |= GOMP_DIM_MASK (ix);
3760 /* If there is worker neutering, there must be vector
3761 neutering. Otherwise the hardware will fail. */
3762 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3763 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3765 /* Discover & process partitioned regions. */
3766 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3767 nvptx_process_pars (pars);
3768 nvptx_neuter_pars (pars, mask, 0);
3769 delete pars;
3772 /* Replace subregs. */
3773 nvptx_reorg_subreg ();
3775 regstat_free_n_sets_and_refs ();
3777 df_finish_pass (true);
3780 /* Handle a "kernel" attribute; arguments as in
3781 struct attribute_spec.handler. */
3783 static tree
3784 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
3785 int ARG_UNUSED (flags), bool *no_add_attrs)
3787 tree decl = *node;
3789 if (TREE_CODE (decl) != FUNCTION_DECL)
3791 error ("%qE attribute only applies to functions", name);
3792 *no_add_attrs = true;
3794 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
3796 error ("%qE attribute requires a void return type", name);
3797 *no_add_attrs = true;
3800 return NULL_TREE;
3803 /* Table of valid machine attributes. */
3804 static const struct attribute_spec nvptx_attribute_table[] =
3806 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
3807 affects_type_identity } */
3808 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
3809 { NULL, 0, 0, false, false, false, NULL, false }
3812 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
3814 static HOST_WIDE_INT
3815 nvptx_vector_alignment (const_tree type)
3817 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
3819 return MIN (align, BIGGEST_ALIGNMENT);
3822 /* Indicate that INSN cannot be duplicated. */
3824 static bool
3825 nvptx_cannot_copy_insn_p (rtx_insn *insn)
3827 switch (recog_memoized (insn))
3829 case CODE_FOR_nvptx_shufflesi:
3830 case CODE_FOR_nvptx_shufflesf:
3831 case CODE_FOR_nvptx_barsync:
3832 case CODE_FOR_nvptx_fork:
3833 case CODE_FOR_nvptx_forked:
3834 case CODE_FOR_nvptx_joining:
3835 case CODE_FOR_nvptx_join:
3836 return true;
3837 default:
3838 return false;
3842 /* Section anchors do not work. Initialization for flag_section_anchor
3843 probes the existence of the anchoring target hooks and prevents
3844 anchoring if they don't exist. However, we may be being used with
3845 a host-side compiler that does support anchoring, and hence see
3846 the anchor flag set (as it's not recalculated). So provide an
3847 implementation denying anchoring. */
3849 static bool
3850 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
3852 return false;
3855 /* Record a symbol for mkoffload to enter into the mapping table. */
3857 static void
3858 nvptx_record_offload_symbol (tree decl)
3860 switch (TREE_CODE (decl))
3862 case VAR_DECL:
3863 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
3864 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3865 break;
3867 case FUNCTION_DECL:
3869 tree attr = get_oacc_fn_attrib (decl);
3870 tree dims = TREE_VALUE (attr);
3871 unsigned ix;
3873 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
3874 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3876 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3878 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3880 gcc_assert (!TREE_PURPOSE (dims));
3881 fprintf (asm_out_file, ", %#x", size);
3884 fprintf (asm_out_file, "\n");
3886 break;
3888 default:
3889 gcc_unreachable ();
3893 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
3894 at the start of a file. */
3896 static void
3897 nvptx_file_start (void)
3899 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
3900 fputs ("\t.version\t3.1\n", asm_out_file);
3901 fputs ("\t.target\tsm_30\n", asm_out_file);
3902 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
3903 fputs ("// END PREAMBLE\n", asm_out_file);
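/* For a 64-bit Pmode the emitted preamble thus reads:

	// BEGIN PREAMBLE
		.version	3.1
		.target	sm_30
		.address_size 64
	// END PREAMBLE  */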
3906 /* Emit a declaration for a worker-level buffer in .shared memory. */
3908 static void
3909 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
3911 const char *name = XSTR (sym, 0);
3913 write_var_marker (file, true, false, name);
3914 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
3915 align, name, size);
3918 /* Write out the function declarations we've collected and declare storage
3919 for the broadcast buffer. */
3921 static void
3922 nvptx_file_end (void)
3924 hash_table<tree_hasher>::iterator iter;
3925 tree decl;
3926 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
3927 nvptx_record_fndecl (decl);
3928 fputs (func_decls.str().c_str(), asm_out_file);
3930 if (worker_bcast_size)
3931 write_worker_buffer (asm_out_file, worker_bcast_sym,
3932 worker_bcast_align, worker_bcast_size);
3934 if (worker_red_size)
3935 write_worker_buffer (asm_out_file, worker_red_sym,
3936 worker_red_align, worker_red_size);
3939 /* Expander for the shuffle builtins. */
3941 static rtx
3942 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
3944 if (ignore)
3945 return target;
3947 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
3948 NULL_RTX, mode, EXPAND_NORMAL);
3949 if (!REG_P (src))
3950 src = copy_to_mode_reg (mode, src);
3952 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
3953 NULL_RTX, SImode, EXPAND_NORMAL);
3954 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
3955 NULL_RTX, SImode, EXPAND_NORMAL);
3957 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
3958 idx = copy_to_mode_reg (SImode, idx);
3960 rtx pat = nvptx_gen_shuffle (target, src, idx,
3961 (nvptx_shuffle_kind) INTVAL (op));
3962 if (pat)
3963 emit_insn (pat);
3965 return target;
3968 /* Worker reduction address expander. */
3970 static rtx
3971 nvptx_expand_worker_addr (tree exp, rtx target,
3972 machine_mode ARG_UNUSED (mode), int ignore)
3974 if (ignore)
3975 return target;
3977 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
3978 if (align > worker_red_align)
3979 worker_red_align = align;
3981 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
3982 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
3983 if (size + offset > worker_red_size)
3984 worker_red_size = size + offset;
3986 rtx addr = worker_red_sym;
3987 if (offset)
3989 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
3990 addr = gen_rtx_CONST (Pmode, addr);
3993 emit_move_insn (target, addr);
3995 return target;
3998 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
3999 not require taking the address of any object, other than the memory
4000 cell being operated on. */
4002 static rtx
4003 nvptx_expand_cmp_swap (tree exp, rtx target,
4004 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4006 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4008 if (!target)
4009 target = gen_reg_rtx (mode);
4011 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4012 NULL_RTX, Pmode, EXPAND_NORMAL);
4013 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4014 NULL_RTX, mode, EXPAND_NORMAL);
4015 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4016 NULL_RTX, mode, EXPAND_NORMAL);
4017 rtx pat;
4019 mem = gen_rtx_MEM (mode, mem);
4020 if (!REG_P (cmp))
4021 cmp = copy_to_mode_reg (mode, cmp);
4022 if (!REG_P (src))
4023 src = copy_to_mode_reg (mode, src);
4025 if (mode == SImode)
4026 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4027 else
4028 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4030 emit_insn (pat);
4032 return target;
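/* Illustrative use (a sketch; the compiler emits these calls itself
   during reduction lowering):

	unsigned old = __builtin_nvptx_cmp_swap (&cell, expected, desired);

   performs an atomic compare-and-swap directly on the SImode memory
   cell, without forcing any other object into memory.  */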
4036 /* Codes for all the NVPTX builtins. */
4037 enum nvptx_builtins
4039 NVPTX_BUILTIN_SHUFFLE,
4040 NVPTX_BUILTIN_SHUFFLELL,
4041 NVPTX_BUILTIN_WORKER_ADDR,
4042 NVPTX_BUILTIN_CMP_SWAP,
4043 NVPTX_BUILTIN_CMP_SWAPLL,
4044 NVPTX_BUILTIN_MAX
4047 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4049 /* Return the NVPTX builtin for CODE. */
4051 static tree
4052 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4054 if (code >= NVPTX_BUILTIN_MAX)
4055 return error_mark_node;
4057 return nvptx_builtin_decls[code];
4060 /* Set up all builtin functions for this target. */
4062 static void
4063 nvptx_init_builtins (void)
4065 #define DEF(ID, NAME, T) \
4066 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4067 = add_builtin_function ("__builtin_nvptx_" NAME, \
4068 build_function_type_list T, \
4069 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4070 #define ST sizetype
4071 #define UINT unsigned_type_node
4072 #define LLUINT long_long_unsigned_type_node
4073 #define PTRVOID ptr_type_node
4075 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4076 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4077 DEF (WORKER_ADDR, "worker_addr",
4078 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4079 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4080 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4082 #undef DEF
4083 #undef ST
4084 #undef UINT
4085 #undef LLUINT
4086 #undef PTRVOID
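/* As a hedged example of how these builtins end up being used by the
   reduction expanders below (they are internal, not a user-facing
   API): a warp-wide sum can be formed by repeatedly halving the
   shuffle distance,

	for (unsigned ix = 16; ix != 0; ix /= 2)
	  x += __builtin_nvptx_shuffle (x, ix, SHUFFLE_DOWN);

   leaving the total in lane 0.  */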
4089 /* Expand an expression EXP that calls a built-in function,
4090 with result going to TARGET if that's convenient
4091 (and in mode MODE if that's convenient).
4092 SUBTARGET may be used as the target for computing one of EXP's operands.
4093 IGNORE is nonzero if the value is to be ignored. */
4095 static rtx
4096 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4097 machine_mode mode, int ignore)
4099 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4100 switch (DECL_FUNCTION_CODE (fndecl))
4102 case NVPTX_BUILTIN_SHUFFLE:
4103 case NVPTX_BUILTIN_SHUFFLELL:
4104 return nvptx_expand_shuffle (exp, target, mode, ignore);
4106 case NVPTX_BUILTIN_WORKER_ADDR:
4107 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4109 case NVPTX_BUILTIN_CMP_SWAP:
4110 case NVPTX_BUILTIN_CMP_SWAPLL:
4111 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4113 default: gcc_unreachable ();
4117 /* Define dimension sizes for known hardware. */
4118 #define PTX_VECTOR_LENGTH 32
4119 #define PTX_WORKER_LENGTH 32
4121 /* Validate compute dimensions of an OpenACC offload or routine, fill
4122 in non-unity defaults. FN_LEVEL indicates the level at which a
4123 routine might spawn a loop. It is negative for non-routines. */
4125 static bool
4126 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4128 bool changed = false;
4130 /* The vector size must be 32, unless this is a SEQ routine. */
4131 if (fn_level <= GOMP_DIM_VECTOR
4132 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4134 if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
4135 warning_at (DECL_SOURCE_LOCATION (decl), 0,
4136 dims[GOMP_DIM_VECTOR]
4137 ? "using vector_length (%d), ignoring %d"
4138 : "using vector_length (%d), ignoring runtime setting",
4139 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4140 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4141 changed = true;
4144 /* Check that the number of workers is not too large. */
4145 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4147 warning_at (DECL_SOURCE_LOCATION (decl), 0,
4148 "using num_workers (%d), ignoring %d",
4149 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4150 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4151 changed = true;
4154 return changed;
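/* An illustrative example of the validation above: a construct
   written as

     #pragma acc parallel num_workers (64) vector_length (16)

   is diagnosed and rewritten so that vector_length becomes 32 (the
   only supported value here) and num_workers is capped at 32.  */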
4157 /* Return maximum dimension size, or zero for unbounded. */
4159 static int
4160 nvptx_dim_limit (int axis)
4162 switch (axis)
4164 case GOMP_DIM_WORKER:
4165 return PTX_WORKER_LENGTH;
4167 case GOMP_DIM_VECTOR:
4168 return PTX_VECTOR_LENGTH;
4170 default:
4171 break;
4173 return 0;
4176 /* Determine whether fork & join markers are needed for the given partitioning axis. */
4178 static bool
4179 nvptx_goacc_fork_join (gcall *call, const int dims[],
4180 bool ARG_UNUSED (is_fork))
4182 tree arg = gimple_call_arg (call, 2);
4183 unsigned axis = TREE_INT_CST_LOW (arg);
4185 /* We only care about worker and vector partitioning. */
4186 if (axis < GOMP_DIM_WORKER)
4187 return false;
4189 /* If the size is 1, there's no partitioning. */
4190 if (dims[axis] == 1)
4191 return false;
4193 return true;
4196 /* Generate a PTX builtin function call that returns the address in
4197 the worker reduction buffer at OFFSET. TYPE is the type of the
4198 data at that location. */
4200 static tree
4201 nvptx_get_worker_red_addr (tree type, tree offset)
4203 machine_mode mode = TYPE_MODE (type);
4204 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4205 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4206 tree align = build_int_cst (unsigned_type_node,
4207 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4208 tree call = build_call_expr (fndecl, 3, offset, size, align);
4210 return fold_convert (build_pointer_type (type), call);
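/* E.g. for a 'double' at buffer offset 8, the tree built above is
   equivalent to (a sketch):

     (double *) __builtin_nvptx_worker_addr (8, 8, 8);

   passing the offset, the mode size and the mode alignment in
   bytes.  */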
4213 /* Emit a SHFL.DOWN of VAR into DEST_VAR, using shift amount SHIFT. This
4214 function will cast the variable if necessary. */
4216 static void
4217 nvptx_generate_vector_shuffle (location_t loc,
4218 tree dest_var, tree var, unsigned shift,
4219 gimple_seq *seq)
4221 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4222 tree_code code = NOP_EXPR;
4223 tree arg_type = unsigned_type_node;
4224 tree var_type = TREE_TYPE (var);
4225 tree dest_type = var_type;
4227 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4228 var_type = TREE_TYPE (var_type);
4230 if (TREE_CODE (var_type) == REAL_TYPE)
4231 code = VIEW_CONVERT_EXPR;
4233 if (TYPE_SIZE (var_type)
4234 == TYPE_SIZE (long_long_unsigned_type_node))
4236 fn = NVPTX_BUILTIN_SHUFFLELL;
4237 arg_type = long_long_unsigned_type_node;
4240 tree call = nvptx_builtin_decl (fn, true);
4241 tree bits = build_int_cst (unsigned_type_node, shift);
4242 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4243 tree expr;
4245 if (var_type != dest_type)
4247 /* Do real and imaginary parts separately. */
4248 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4249 real = fold_build1 (code, arg_type, real);
4250 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4251 real = fold_build1 (code, var_type, real);
4253 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4254 imag = fold_build1 (code, arg_type, imag);
4255 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4256 imag = fold_build1 (code, var_type, imag);
4258 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4260 else
4262 expr = fold_build1 (code, arg_type, var);
4263 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4264 expr = fold_build1 (code, dest_type, expr);
4267 gimplify_assign (dest_var, expr, seq);
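/* E.g. for a 'float' VAR and SHIFT == 16, the sequence built above is
   roughly (a sketch of the resulting gimple):

     u_1 = VIEW_CONVERT_EXPR<unsigned> (var);
     u_2 = __builtin_nvptx_shuffle (u_1, 16, SHUFFLE_DOWN);
     dest_var = VIEW_CONVERT_EXPR<float> (u_2);

   Complex values instead make two such calls, one for each of the
   real and imaginary parts.  */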
4270 /* Lazily generate the global lock var decl and return its address. */
4272 static tree
4273 nvptx_global_lock_addr ()
4275 tree v = global_lock_var;
4277 if (!v)
4279 tree name = get_identifier ("__reduction_lock");
4280 tree type = build_qualified_type (unsigned_type_node,
4281 TYPE_QUAL_VOLATILE);
4282 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4283 global_lock_var = v;
4284 DECL_ARTIFICIAL (v) = 1;
4285 DECL_EXTERNAL (v) = 1;
4286 TREE_STATIC (v) = 1;
4287 TREE_PUBLIC (v) = 1;
4288 TREE_USED (v) = 1;
4289 mark_addressable (v);
4290 mark_decl_referenced (v);
4293 return build_fold_addr_expr (v);
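/* The decl built above is expected to appear in the emitted PTX as
   something like (a sketch; the exact syntax depends on how the
   backend writes out globals):

     .visible .global .align 4 .u32 __reduction_lock;

   One such variable serves all lock-based reductions in the
   program.  */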
4296 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4297 GSI. We use a lockless scheme for nearly all cases, which looks
4298 like:
4299 actual = initval(OP);
4300 do {
4301 guess = actual;
4302 write = guess OP myval;
4303 actual = cmp&swap (ptr, guess, write);
4304 } while (actual bit-different-to guess);
4305 return write;
4307 This relies on a cmp&swap instruction, which is available for 32-
4308 and 64-bit types. Larger types must use a locking scheme. */
4310 static tree
4311 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
4312 tree ptr, tree var, tree_code op)
4314 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
4315 tree_code code = NOP_EXPR;
4316 tree arg_type = unsigned_type_node;
4317 tree var_type = TREE_TYPE (var);
4319 if (TREE_CODE (var_type) == COMPLEX_TYPE
4320 || TREE_CODE (var_type) == REAL_TYPE)
4321 code = VIEW_CONVERT_EXPR;
4323 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
4325 arg_type = long_long_unsigned_type_node;
4326 fn = NVPTX_BUILTIN_CMP_SWAPLL;
4329 tree swap_fn = nvptx_builtin_decl (fn, true);
4331 gimple_seq init_seq = NULL;
4332 tree init_var = make_ssa_name (arg_type);
4333 tree init_expr = omp_reduction_init_op (loc, op, var_type);
4334 init_expr = fold_build1 (code, arg_type, init_expr);
4335 gimplify_assign (init_var, init_expr, &init_seq);
4336 gimple *init_end = gimple_seq_last (init_seq);
4338 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
4340 /* Split the block just after the init stmts. */
4341 basic_block pre_bb = gsi_bb (*gsi);
4342 edge pre_edge = split_block (pre_bb, init_end);
4343 basic_block loop_bb = pre_edge->dest;
4344 pre_bb = pre_edge->src;
4345 /* Reset the iterator. */
4346 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4348 tree expect_var = make_ssa_name (arg_type);
4349 tree actual_var = make_ssa_name (arg_type);
4350 tree write_var = make_ssa_name (arg_type);
4352 /* Build and insert the reduction calculation. */
4353 gimple_seq red_seq = NULL;
4354 tree write_expr = fold_build1 (code, var_type, expect_var);
4355 write_expr = fold_build2 (op, var_type, write_expr, var);
4356 write_expr = fold_build1 (code, arg_type, write_expr);
4357 gimplify_assign (write_var, write_expr, &red_seq);
4359 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4361 /* Build & insert the cmp&swap sequence. */
4362 gimple_seq latch_seq = NULL;
4363 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
4364 ptr, expect_var, write_var);
4365 gimplify_assign (actual_var, swap_expr, &latch_seq);
4367 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
4368 NULL_TREE, NULL_TREE);
4369 gimple_seq_add_stmt (&latch_seq, cond);
4371 gimple *latch_end = gimple_seq_last (latch_seq);
4372 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
4374 /* Split the block just after the latch stmts. */
4375 edge post_edge = split_block (loop_bb, latch_end);
4376 basic_block post_bb = post_edge->dest;
4377 loop_bb = post_edge->src;
4378 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4380 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4381 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
4382 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
4383 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
4385 gphi *phi = create_phi_node (expect_var, loop_bb);
4386 add_phi_arg (phi, init_var, pre_edge, loc);
4387 add_phi_arg (phi, actual_var, loop_edge, loc);
4389 loop *loop = alloc_loop ();
4390 loop->header = loop_bb;
4391 loop->latch = loop_bb;
4392 add_loop (loop, loop_bb->loop_father);
4394 return fold_build1 (code, var_type, write_var);
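/* Schematically, the CFG constructed above is:

     pre_bb:  init_var = initval (OP);
     loop_bb: expect_var = PHI <init_var (pre_bb), actual_var (loop_bb)>;
              write_var = expect_var OP var;
              actual_var = cmp&swap (ptr, expect_var, write_var);
              if (actual_var == expect_var) goto post_bb;
              else goto loop_bb;
     post_bb: ...

   with loop_bb registered as both header and latch of a new loop.  */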
4397 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
4398 GSI. This is necessary for types larger than 64 bits, where there
4399 is no cmp&swap instruction to implement a lockless scheme. We use
4400 a lock variable in global memory.
4402 while (cmp&swap (&lock_var, 0, 1))
4403 continue;
4404 T accum = *ptr;
4405 accum = accum OP var;
4406 *ptr = accum;
4407 cmp&swap (&lock_var, 1, 0);
4408 return accum;
4410 A lock in global memory is necessary to force execution engine
4411 descheduling and avoid resource starvation that can occur if the
4412 lock is in .shared memory. */
4414 static tree
4415 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
4416 tree ptr, tree var, tree_code op)
4418 tree var_type = TREE_TYPE (var);
4419 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
4420 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
4421 tree uns_locked = build_int_cst (unsigned_type_node, 1);
4423 /* Split the block just before the gsi. Insert a gimple nop to make
4424 this easier. */
4425 gimple *nop = gimple_build_nop ();
4426 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
4427 basic_block entry_bb = gsi_bb (*gsi);
4428 edge entry_edge = split_block (entry_bb, nop);
4429 basic_block lock_bb = entry_edge->dest;
4430 /* Reset the iterator. */
4431 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4433 /* Build and insert the locking sequence. */
4434 gimple_seq lock_seq = NULL;
4435 tree lock_var = make_ssa_name (unsigned_type_node);
4436 tree lock_expr = nvptx_global_lock_addr ();
4437 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
4438 uns_unlocked, uns_locked);
4439 gimplify_assign (lock_var, lock_expr, &lock_seq);
4440 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
4441 NULL_TREE, NULL_TREE);
4442 gimple_seq_add_stmt (&lock_seq, cond);
4443 gimple *lock_end = gimple_seq_last (lock_seq);
4444 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
4446 /* Split the block just after the lock sequence. */
4447 edge locked_edge = split_block (lock_bb, lock_end);
4448 basic_block update_bb = locked_edge->dest;
4449 lock_bb = locked_edge->src;
4450 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4452 /* Create the lock loop ... */
4453 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4454 make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
4455 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
4456 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
4458 /* ... and the loop structure. */
4459 loop *lock_loop = alloc_loop ();
4460 lock_loop->header = lock_bb;
4461 lock_loop->latch = lock_bb;
4462 lock_loop->nb_iterations_estimate = 1;
4463 lock_loop->any_estimate = true;
4464 add_loop (lock_loop, entry_bb->loop_father);
4466 /* Build and insert the reduction calculation. */
4467 gimple_seq red_seq = NULL;
4468 tree acc_in = make_ssa_name (var_type);
4469 tree ref_in = build_simple_mem_ref (ptr);
4470 TREE_THIS_VOLATILE (ref_in) = 1;
4471 gimplify_assign (acc_in, ref_in, &red_seq);
4473 tree acc_out = make_ssa_name (var_type);
4474 tree update_expr = fold_build2 (op, var_type, acc_in, var);
4475 gimplify_assign (acc_out, update_expr, &red_seq);
4477 tree ref_out = build_simple_mem_ref (ptr);
4478 TREE_THIS_VOLATILE (ref_out) = 1;
4479 gimplify_assign (ref_out, acc_out, &red_seq);
4481 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4483 /* Build & insert the unlock sequence. */
4484 gimple_seq unlock_seq = NULL;
4485 tree unlock_expr = nvptx_global_lock_addr ();
4486 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
4487 uns_locked, uns_unlocked);
4488 gimplify_and_add (unlock_expr, &unlock_seq);
4489 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
4491 return acc_out;
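/* Schematically, the code built above is:

     lock_bb:   lock_var = cmp&swap (&__reduction_lock, 0, 1);
                if (lock_var != 0) goto lock_bb;
     update_bb: acc_in = *ptr;                     (volatile read)
                acc_out = acc_in OP var;
                *ptr = acc_out;                    (volatile write)
                cmp&swap (&__reduction_lock, 1, 0);

   matching the pseudocode in the function comment.  */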
4494 /* Emit a sequence to update a reduction accumulator at *PTR with the
4495 value held in VAR using operator OP. Return the updated value.
4497 TODO: optimize for atomic ops and independent complex ops. */
4499 static tree
4500 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
4501 tree ptr, tree var, tree_code op)
4503 tree type = TREE_TYPE (var);
4504 tree size = TYPE_SIZE (type);
4506 if (size == TYPE_SIZE (unsigned_type_node)
4507 || size == TYPE_SIZE (long_long_unsigned_type_node))
4508 return nvptx_lockless_update (loc, gsi, ptr, var, op);
4509 else
4510 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
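/* For instance, 'int', 'float' and 'unsigned long long' reductions
   take the lockless path above, while a 'complex double' (128 bits,
   too wide for cmp&swap) falls back to the global lock.  */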
4513 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
4515 static void
4516 nvptx_goacc_reduction_setup (gcall *call)
4518 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4519 tree lhs = gimple_call_lhs (call);
4520 tree var = gimple_call_arg (call, 2);
4521 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4522 gimple_seq seq = NULL;
4524 push_gimplify_context (true);
4526 if (level != GOMP_DIM_GANG)
4528 /* Copy the receiver object. */
4529 tree ref_to_res = gimple_call_arg (call, 1);
4531 if (!integer_zerop (ref_to_res))
4532 var = build_simple_mem_ref (ref_to_res);
4535 if (level == GOMP_DIM_WORKER)
4537 /* Store incoming value to worker reduction buffer. */
4538 tree offset = gimple_call_arg (call, 5);
4539 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4540 tree ptr = make_ssa_name (TREE_TYPE (call));
4542 gimplify_assign (ptr, call, &seq);
4543 tree ref = build_simple_mem_ref (ptr);
4544 TREE_THIS_VOLATILE (ref) = 1;
4545 gimplify_assign (ref, var, &seq);
4548 if (lhs)
4549 gimplify_assign (lhs, var, &seq);
4551 pop_gimplify_context (NULL);
4552 gsi_replace_with_seq (&gsi, seq, true);
4555 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
4557 static void
4558 nvptx_goacc_reduction_init (gcall *call)
4560 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4561 tree lhs = gimple_call_lhs (call);
4562 tree var = gimple_call_arg (call, 2);
4563 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4564 enum tree_code rcode
4565 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4566 tree init = omp_reduction_init_op (gimple_location (call), rcode,
4567 TREE_TYPE (var));
4568 gimple_seq seq = NULL;
4570 push_gimplify_context (true);
4572 if (level == GOMP_DIM_VECTOR)
4574 /* Initialize non-zero vector lanes to INIT_VAL (OP); lane zero keeps the incoming VAR. */
4575 tree tid = make_ssa_name (integer_type_node);
4576 tree dim_vector = gimple_call_arg (call, 3);
4577 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
4578 dim_vector);
4579 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
4580 NULL_TREE, NULL_TREE);
4582 gimple_call_set_lhs (tid_call, tid);
4583 gimple_seq_add_stmt (&seq, tid_call);
4584 gimple_seq_add_stmt (&seq, cond_stmt);
4586 /* Split the block just after the call. */
4587 edge init_edge = split_block (gsi_bb (gsi), call);
4588 basic_block init_bb = init_edge->dest;
4589 basic_block call_bb = init_edge->src;
4591 /* Fixup flags from call_bb to init_bb. */
4592 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
4594 /* Set the initialization stmts. */
4595 gimple_seq init_seq = NULL;
4596 tree init_var = make_ssa_name (TREE_TYPE (var));
4597 gimplify_assign (init_var, init, &init_seq);
4598 gsi = gsi_start_bb (init_bb);
4599 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
4601 /* Split block just after the init stmt. */
4602 gsi_prev (&gsi);
4603 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
4604 basic_block dst_bb = inited_edge->dest;
4606 /* Create false edge from call_bb to dst_bb. */
4607 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
4609 /* Create phi node in dst block. */
4610 gphi *phi = create_phi_node (lhs, dst_bb);
4611 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
4612 add_phi_arg (phi, var, nop_edge, gimple_location (call));
4614 /* Reset dominator of dst bb. */
4615 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
4617 /* Reset the gsi. */
4618 gsi = gsi_for_stmt (call);
4620 else
4622 if (level == GOMP_DIM_GANG)
4624 /* If there's no receiver object, propagate the incoming VAR. */
4625 tree ref_to_res = gimple_call_arg (call, 1);
4626 if (integer_zerop (ref_to_res))
4627 init = var;
4630 gimplify_assign (lhs, init, &seq);
4633 pop_gimplify_context (NULL);
4634 gsi_replace_with_seq (&gsi, seq, true);
4637 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
4639 static void
4640 nvptx_goacc_reduction_fini (gcall *call)
4642 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4643 tree lhs = gimple_call_lhs (call);
4644 tree ref_to_res = gimple_call_arg (call, 1);
4645 tree var = gimple_call_arg (call, 2);
4646 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4647 enum tree_code op
4648 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4649 gimple_seq seq = NULL;
4650 tree r = NULL_TREE;
4652 push_gimplify_context (true);
4654 if (level == GOMP_DIM_VECTOR)
4656 /* Emit binary shuffle tree. TODO: emit this as an actual loop,
4657 but that requires a method of emitting a unified jump at the
4658 gimple level. */
4659 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
4661 tree other_var = make_ssa_name (TREE_TYPE (var));
4662 nvptx_generate_vector_shuffle (gimple_location (call),
4663 other_var, var, shfl, &seq);
4665 r = make_ssa_name (TREE_TYPE (var));
4666 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
4667 var, other_var), &seq);
4668 var = r;
4671 else
4673 tree accum = NULL_TREE;
4675 if (level == GOMP_DIM_WORKER)
4677 /* Get reduction buffer address. */
4678 tree offset = gimple_call_arg (call, 5);
4679 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4680 tree ptr = make_ssa_name (TREE_TYPE (call));
4682 gimplify_assign (ptr, call, &seq);
4683 accum = ptr;
4685 else if (integer_zerop (ref_to_res))
4686 r = var;
4687 else
4688 accum = ref_to_res;
4690 if (accum)
4692 /* UPDATE the accumulator. */
4693 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4694 seq = NULL;
4695 r = nvptx_reduction_update (gimple_location (call), &gsi,
4696 accum, var, op);
4700 if (lhs)
4701 gimplify_assign (lhs, r, &seq);
4702 pop_gimplify_context (NULL);
4704 gsi_replace_with_seq (&gsi, seq, true);
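/* For the vector case the loop above unrolls into five
   shuffle-and-combine steps (PTX_VECTOR_LENGTH == 32), e.g. for
   OP == PLUS_EXPR roughly:

     v += shfl.down (v, 16);
     v += shfl.down (v, 8);
     v += shfl.down (v, 4);
     v += shfl.down (v, 2);
     v += shfl.down (v, 1);

   after which lane 0 holds the complete reduction value.  */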
4707 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
4709 static void
4710 nvptx_goacc_reduction_teardown (gcall *call)
4712 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4713 tree lhs = gimple_call_lhs (call);
4714 tree var = gimple_call_arg (call, 2);
4715 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4716 gimple_seq seq = NULL;
4718 push_gimplify_context (true);
4719 if (level == GOMP_DIM_WORKER)
4721 /* Read the worker reduction buffer. */
4722 tree offset = gimple_call_arg (call, 5);
4723 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4724 tree ptr = make_ssa_name (TREE_TYPE (call));
4726 gimplify_assign (ptr, call, &seq);
4727 var = build_simple_mem_ref (ptr);
4728 TREE_THIS_VOLATILE (var) = 1;
4731 if (level != GOMP_DIM_GANG)
4733 /* Write to the receiver object. */
4734 tree ref_to_res = gimple_call_arg (call, 1);
4736 if (!integer_zerop (ref_to_res))
4737 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
4740 if (lhs)
4741 gimplify_assign (lhs, var, &seq);
4743 pop_gimplify_context (NULL);
4745 gsi_replace_with_seq (&gsi, seq, true);
4748 /* NVPTX reduction expander. */
4750 static void
4751 nvptx_goacc_reduction (gcall *call)
4753 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
4755 switch (code)
4757 case IFN_GOACC_REDUCTION_SETUP:
4758 nvptx_goacc_reduction_setup (call);
4759 break;
4761 case IFN_GOACC_REDUCTION_INIT:
4762 nvptx_goacc_reduction_init (call);
4763 break;
4765 case IFN_GOACC_REDUCTION_FINI:
4766 nvptx_goacc_reduction_fini (call);
4767 break;
4769 case IFN_GOACC_REDUCTION_TEARDOWN:
4770 nvptx_goacc_reduction_teardown (call);
4771 break;
4773 default:
4774 gcc_unreachable ();
4778 #undef TARGET_OPTION_OVERRIDE
4779 #define TARGET_OPTION_OVERRIDE nvptx_option_override
4781 #undef TARGET_ATTRIBUTE_TABLE
4782 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
4784 #undef TARGET_LEGITIMATE_ADDRESS_P
4785 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
4787 #undef TARGET_PROMOTE_FUNCTION_MODE
4788 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
4790 #undef TARGET_FUNCTION_ARG
4791 #define TARGET_FUNCTION_ARG nvptx_function_arg
4792 #undef TARGET_FUNCTION_INCOMING_ARG
4793 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
4794 #undef TARGET_FUNCTION_ARG_ADVANCE
4795 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
4796 #undef TARGET_PASS_BY_REFERENCE
4797 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
4798 #undef TARGET_FUNCTION_VALUE_REGNO_P
4799 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
4800 #undef TARGET_FUNCTION_VALUE
4801 #define TARGET_FUNCTION_VALUE nvptx_function_value
4802 #undef TARGET_LIBCALL_VALUE
4803 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
4804 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
4805 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
4806 #undef TARGET_GET_DRAP_RTX
4807 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
4808 #undef TARGET_SPLIT_COMPLEX_ARG
4809 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
4810 #undef TARGET_RETURN_IN_MEMORY
4811 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
4812 #undef TARGET_OMIT_STRUCT_RETURN_REG
4813 #define TARGET_OMIT_STRUCT_RETURN_REG true
4814 #undef TARGET_STRICT_ARGUMENT_NAMING
4815 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
4816 #undef TARGET_CALL_ARGS
4817 #define TARGET_CALL_ARGS nvptx_call_args
4818 #undef TARGET_END_CALL_ARGS
4819 #define TARGET_END_CALL_ARGS nvptx_end_call_args
4821 #undef TARGET_ASM_FILE_START
4822 #define TARGET_ASM_FILE_START nvptx_file_start
4823 #undef TARGET_ASM_FILE_END
4824 #define TARGET_ASM_FILE_END nvptx_file_end
4825 #undef TARGET_ASM_GLOBALIZE_LABEL
4826 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
4827 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
4828 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
4829 #undef TARGET_PRINT_OPERAND
4830 #define TARGET_PRINT_OPERAND nvptx_print_operand
4831 #undef TARGET_PRINT_OPERAND_ADDRESS
4832 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
4833 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
4834 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
4835 #undef TARGET_ASM_INTEGER
4836 #define TARGET_ASM_INTEGER nvptx_assemble_integer
4837 #undef TARGET_ASM_DECL_END
4838 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
4839 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
4840 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
4841 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
4842 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
4843 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
4844 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
4846 #undef TARGET_MACHINE_DEPENDENT_REORG
4847 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
4848 #undef TARGET_NO_REGISTER_ALLOCATION
4849 #define TARGET_NO_REGISTER_ALLOCATION true
4851 #undef TARGET_ENCODE_SECTION_INFO
4852 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
4853 #undef TARGET_RECORD_OFFLOAD_SYMBOL
4854 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
4856 #undef TARGET_VECTOR_ALIGNMENT
4857 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
4859 #undef TARGET_CANNOT_COPY_INSN_P
4860 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
4862 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
4863 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
4865 #undef TARGET_INIT_BUILTINS
4866 #define TARGET_INIT_BUILTINS nvptx_init_builtins
4867 #undef TARGET_EXPAND_BUILTIN
4868 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
4869 #undef TARGET_BUILTIN_DECL
4870 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
4872 #undef TARGET_GOACC_VALIDATE_DIMS
4873 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
4875 #undef TARGET_GOACC_DIM_LIMIT
4876 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
4878 #undef TARGET_GOACC_FORK_JOIN
4879 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
4881 #undef TARGET_GOACC_REDUCTION
4882 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
4884 struct gcc_target targetm = TARGET_INITIALIZER;
4886 #include "gt-nvptx.h"