From bd9b3d3d1a8d33e460ae137da0cb0d5a919e8f8f Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Mon, 13 Aug 2018 05:04:24 -0700 Subject: [PATCH] [nvptx] Use CUDA driver API to select default runtime launch geometry The CUDA driver API starting version 6.5 offers a set of runtime functions to calculate several occupancy-related measures, as a replacement for the occupancy calculator spreadsheet. This patch adds a heuristic for default runtime launch geometry, based on the new runtime function cuOccupancyMaxPotentialBlockSize. Built on x86_64 with nvptx accelerator and ran libgomp testsuite. 2018-08-13 Cesar Philippidis Tom de Vries PR target/85590 * plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef. (cuOccupancyMaxPotentialBlockSize): Declare. * plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New CUDA_ONE_CALL_MAYBE_NULL. * plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define CUoccupancyB2DSize and declare cuOccupancyMaxPotentialBlockSize. (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the default num_gangs and num_workers when the driver supports it. Co-Authored-By: Tom de Vries From-SVN: r263505 --- libgomp/ChangeLog | 14 ++++++++ libgomp/plugin/cuda-lib.def | 1 + libgomp/plugin/cuda/cuda.h | 3 ++ libgomp/plugin/plugin-nvptx.c | 83 ++++++++++++++++++++++++++++++++++++------- 4 files changed, 89 insertions(+), 12 deletions(-) diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 4aff6cd9a33..f54a695fb38 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,17 @@ +2018-08-13 Cesar Philippidis + Tom de Vries + + PR target/85590 + * plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef. + (cuOccupancyMaxPotentialBlockSize): Declare. + * plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New + CUDA_ONE_CALL_MAYBE_NULL. + * plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define + CUoccupancyB2DSize and declare + cuOccupancyMaxPotentialBlockSize. 
+ (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the + default num_gangs and num_workers when the driver supports it. + 2018-08-08 Tom de Vries * plugin/cuda-lib.def (cuLinkAddData_v2, cuLinkCreate_v2): Declare using diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def index 29028b504a0..b2a4c2154eb 100644 --- a/libgomp/plugin/cuda-lib.def +++ b/libgomp/plugin/cuda-lib.def @@ -41,6 +41,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal) CUDA_ONE_CALL (cuModuleLoad) CUDA_ONE_CALL (cuModuleLoadData) CUDA_ONE_CALL (cuModuleUnload) +CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) CUDA_ONE_CALL (cuStreamCreate) CUDA_ONE_CALL (cuStreamDestroy) CUDA_ONE_CALL (cuStreamQuery) diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h index 4799825bda2..b4c1b29c5d8 100644 --- a/libgomp/plugin/cuda/cuda.h +++ b/libgomp/plugin/cuda/cuda.h @@ -44,6 +44,7 @@ typedef void *CUevent; typedef void *CUfunction; typedef void *CUlinkState; typedef void *CUmodule; +typedef size_t (*CUoccupancyB2DSize)(int); typedef void *CUstream; typedef enum { @@ -170,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *); CUresult cuModuleLoad (CUmodule *, const char *); CUresult cuModuleLoadData (CUmodule *, const void *); CUresult cuModuleUnload (CUmodule); +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, + CUoccupancyB2DSize, size_t, int); CUresult cuStreamCreate (CUstream *, unsigned); #define cuStreamDestroy cuStreamDestroy_v2 CUresult cuStreamDestroy (CUstream); diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 6799a264976..bae1b05ccaa 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -61,9 +61,12 @@ CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t, const char *, unsigned, CUjit_option *, void **); CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *); #else +typedef size_t (*CUoccupancyB2DSize)(int); 
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t, const char *, unsigned, CUjit_option *, void **); CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *); +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, + CUoccupancyB2DSize, size_t, int); #endif #define DO_PRAGMA(x) _Pragma (#x) @@ -1200,21 +1203,77 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, { bool default_dim_p[GOMP_DIM_MAX]; for (i = 0; i != GOMP_DIM_MAX; i++) + default_dim_p[i] = !dims[i]; + + if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize)) { - default_dim_p[i] = !dims[i]; - if (default_dim_p[i]) - dims[i] = nvthd->ptx_dev->default_dims[i]; + for (i = 0; i != GOMP_DIM_MAX; i++) + if (default_dim_p[i]) + dims[i] = nvthd->ptx_dev->default_dims[i]; + + if (default_dim_p[GOMP_DIM_VECTOR]) + dims[GOMP_DIM_VECTOR] + = MIN (dims[GOMP_DIM_VECTOR], + (targ_fn->max_threads_per_block / warp_size + * warp_size)); + + if (default_dim_p[GOMP_DIM_WORKER]) + dims[GOMP_DIM_WORKER] + = MIN (dims[GOMP_DIM_WORKER], + targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]); } + else + { + /* Handle the case that the compiler allows the runtime to choose + the vector-length conservatively, by ignoring + gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle + it. */ + int vectors = 0; + /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that + gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not + exceed targ_fn->max_threads_per_block. */ + int workers = gomp_openacc_dims[GOMP_DIM_WORKER]; + int gangs = gomp_openacc_dims[GOMP_DIM_GANG]; + int grids, blocks; + + CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids, + &blocks, function, NULL, 0, + dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]); + GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: " + "grid = %d, block = %d\n", grids, blocks); + + /* Keep the num_gangs proportional to the block size. 
In + the case where a block size is limited by shared-memory + or the register file capacity, the runtime will not + excessively over assign gangs to the multiprocessor + units if their state is going to be swapped out even + more than necessary. The constant factor 2 is there to + prevent threads from idling when there is insufficient + work for them. */ + if (gangs == 0) + gangs = 2 * grids * (blocks / warp_size); + + if (vectors == 0) + vectors = warp_size; + + if (workers == 0) + { + int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR] + ? vectors + : dims[GOMP_DIM_VECTOR]); + workers = blocks / actual_vectors; + } - if (default_dim_p[GOMP_DIM_VECTOR]) - dims[GOMP_DIM_VECTOR] - = MIN (dims[GOMP_DIM_VECTOR], - (targ_fn->max_threads_per_block / warp_size * warp_size)); - - if (default_dim_p[GOMP_DIM_WORKER]) - dims[GOMP_DIM_WORKER] - = MIN (dims[GOMP_DIM_WORKER], - targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]); + for (i = 0; i != GOMP_DIM_MAX; i++) + if (default_dim_p[i]) + switch (i) + { + case GOMP_DIM_GANG: dims[i] = gangs; break; + case GOMP_DIM_WORKER: dims[i] = workers; break; + case GOMP_DIM_VECTOR: dims[i] = vectors; break; + default: GOMP_PLUGIN_fatal ("invalid dim"); + } + } } } -- 2.11.4.GIT