From bd9b3d3d1a8d33e460ae137da0cb0d5a919e8f8f Mon Sep 17 00:00:00 2001
From: Cesar Philippidis <cesar@codesourcery.com>
Date: Mon, 13 Aug 2018 05:04:24 -0700
Subject: [PATCH] [nvptx] Use CUDA driver API to select default runtime launch
 geometry

The CUDA driver API starting version 6.5 offers a set of runtime functions to
calculate several occupancy-related measures, as a replacement for the occupancy
calculator spreadsheet.

This patch adds a heuristic for default runtime launch geometry, based on the
new runtime function cuOccupancyMaxPotentialBlockSize.

Build on x86_64 with nvptx accelerator and ran libgomp testsuite.

2018-08-13  Cesar Philippidis  <cesar@codesourcery.com>
	    Tom de Vries  <tdevries@suse.de>

	PR target/85590
	* plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
	(cuOccupancyMaxPotentialBlockSize): Declare.
	* plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New
	CUDA_ONE_CALL_MAYBE_NULL.
	* plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define
	CUoccupancyB2DSize and declare
	cuOccupancyMaxPotentialBlockSize.
	(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
	default num_gangs and num_workers when the driver supports it.

Co-Authored-By: Tom de Vries <tdevries@suse.de>

From-SVN: r263505
---
 libgomp/ChangeLog             | 14 ++++++++
 libgomp/plugin/cuda-lib.def   |  1 +
 libgomp/plugin/cuda/cuda.h    |  3 ++
 libgomp/plugin/plugin-nvptx.c | 83 ++++++++++++++++++++++++++++++++++++-------
 4 files changed, 89 insertions(+), 12 deletions(-)

diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 4aff6cd9a33..f54a695fb38 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,17 @@
+2018-08-13  Cesar Philippidis  <cesar@codesourcery.com>
+	    Tom de Vries  <tdevries@suse.de>
+
+	PR target/85590
+	* plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
+	(cuOccupancyMaxPotentialBlockSize): Declare.
+	* plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New
+	CUDA_ONE_CALL_MAYBE_NULL.
+	* plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define
+	CUoccupancyB2DSize and declare
+	cuOccupancyMaxPotentialBlockSize.
+	(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
+	default num_gangs and num_workers when the driver supports it.
+
 2018-08-08  Tom de Vries  <tdevries@suse.de>
 
 	* plugin/cuda-lib.def (cuLinkAddData_v2, cuLinkCreate_v2): Declare using
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 29028b504a0..b2a4c2154eb 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -41,6 +41,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
 CUDA_ONE_CALL (cuModuleLoad)
 CUDA_ONE_CALL (cuModuleLoadData)
 CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
 CUDA_ONE_CALL (cuStreamCreate)
 CUDA_ONE_CALL (cuStreamDestroy)
 CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825bda2..b4c1b29c5d8 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -170,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+					  CUoccupancyB2DSize, size_t, int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 6799a264976..bae1b05ccaa 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -61,9 +61,12 @@ CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
 			const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
 #else
+typedef size_t (*CUoccupancyB2DSize)(int);
 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
 			   const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+					  CUoccupancyB2DSize, size_t, int);
 #endif
 
 #define DO_PRAGMA(x) _Pragma (#x)
@@ -1200,21 +1203,77 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
       {
 	bool default_dim_p[GOMP_DIM_MAX];
 	for (i = 0; i != GOMP_DIM_MAX; i++)
+	  default_dim_p[i] = !dims[i];
+
+	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
 	  {
-	    default_dim_p[i] = !dims[i];
-	    if (default_dim_p[i])
-	      dims[i] = nvthd->ptx_dev->default_dims[i];
+	    for (i = 0; i != GOMP_DIM_MAX; i++)
+	      if (default_dim_p[i])
+		dims[i] = nvthd->ptx_dev->default_dims[i];
+
+	    if (default_dim_p[GOMP_DIM_VECTOR])
+	      dims[GOMP_DIM_VECTOR]
+		= MIN (dims[GOMP_DIM_VECTOR],
+		       (targ_fn->max_threads_per_block / warp_size
+			* warp_size));
+
+	    if (default_dim_p[GOMP_DIM_WORKER])
+	      dims[GOMP_DIM_WORKER]
+		= MIN (dims[GOMP_DIM_WORKER],
+		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
 	  }
+	else
+	  {
+	    /* Handle the case that the compiler allows the runtime to choose
+	       the vector-length conservatively, by ignoring
+	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
+	       it.  */
+	    int vectors = 0;
+	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
+	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
+	       exceed targ_fn->max_threads_per_block. */
+	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
+	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
+	    int grids, blocks;
+
+	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+			      &blocks, function, NULL, 0,
+			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+			       "grid = %d, block = %d\n", grids, blocks);
+
+	    /* Keep the num_gangs proportional to the block size.  In
+	       the case were a block size is limited by shared-memory
+	       or the register file capacity, the runtime will not
+	       excessively over assign gangs to the multiprocessor
+	       units if their state is going to be swapped out even
+	       more than necessary. The constant factor 2 is there to
+	       prevent threads from idling when there is insufficient
+	       work for them.  */
+	    if (gangs == 0)
+	      gangs = 2 * grids * (blocks / warp_size);
+
+	    if (vectors == 0)
+	      vectors = warp_size;
+
+	    if (workers == 0)
+	      {
+		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
+				      ? vectors
+				      : dims[GOMP_DIM_VECTOR]);
+		workers = blocks / actual_vectors;
+	      }
 
-	if (default_dim_p[GOMP_DIM_VECTOR])
-	  dims[GOMP_DIM_VECTOR]
-	    = MIN (dims[GOMP_DIM_VECTOR],
-		   (targ_fn->max_threads_per_block / warp_size * warp_size));
-
-	if (default_dim_p[GOMP_DIM_WORKER])
-	  dims[GOMP_DIM_WORKER]
-	    = MIN (dims[GOMP_DIM_WORKER],
-		   targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+	    for (i = 0; i != GOMP_DIM_MAX; i++)
+	      if (default_dim_p[i])
+		switch (i)
+		  {
+		  case GOMP_DIM_GANG: dims[i] = gangs; break;
+		  case GOMP_DIM_WORKER: dims[i] = workers; break;
+		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+		  default: GOMP_PLUGIN_fatal ("invalid dim");
+		  }
+	  }
       }
     }
 
-- 
2.11.4.GIT