//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
16 #include <boost/compute/utility/source.hpp>
17 #include <boost/compute/program.hpp>
18 #include <boost/compute/command_queue.hpp>
19 #include <boost/compute/detail/vendor.hpp>
20 #include <boost/compute/detail/parameter_cache.hpp>
21 #include <boost/compute/detail/work_size.hpp>
22 #include <boost/compute/detail/meta_kernel.hpp>
23 #include <boost/compute/type_traits/type_name.hpp>
24 #include <boost/compute/utility/program_cache.hpp>
31 /// body reduction inside a warp
32 template<typename T
,bool isNvidiaDevice
>
35 static std::string
body()
39 k
<< "for(int i = 1; i < TPB; i <<= 1){\n" <<
40 " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
41 " uint mask = (i << 1) - 1;\n" <<
42 " if((lid & mask) == 0){\n" <<
43 " scratch[lid] += scratch[lid+i];\n" <<
51 /// body reduction inside a warp
52 /// for nvidia device we can use the "unsafe"
53 /// memory optimisation
55 struct ReduceBody
<T
,true>
57 static std::string
body()
61 // we use TPB to compile only useful instruction
62 // local reduction when size is greater than warp size
63 k
<< "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
64 "if(TPB >= 1024){\n" <<
65 "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
66 "if(TPB >= 512){\n" <<
67 "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
68 "if(TPB >= 256){\n" <<
69 "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
70 "if(TPB >= 128){\n" <<
71 "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" <<
75 // volatile this way we don't need any barrier
76 "volatile __local " << type_name
<T
>() << " *lmem = scratch;\n" <<
77 "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" <<
78 "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" <<
79 "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" <<
80 "if(TPB >= 8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" <<
81 "if(TPB >= 4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" <<
82 "if(TPB >= 2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" <<
88 template<class InputIterator
, class Function
>
89 inline void initial_reduce(InputIterator first
,
92 const Function
&function
,
93 kernel
&reduce_kernel
,
101 typedef typename
std::iterator_traits
<InputIterator
>::value_type Arg
;
102 typedef typename
boost::tr1_result_of
<Function(Arg
, Arg
)>::type T
;
104 size_t count
= std::distance(first
, last
);
105 detail::meta_kernel
k("initial_reduce");
106 k
.add_set_arg
<const uint_
>("count", uint_(count
));
107 size_t output_arg
= k
.add_arg
<T
*>(memory_object::global_memory
, "output");
110 k
.decl
<const uint_
>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
111 k
.decl
<const uint_
>("lid") << " = get_local_id(0);\n" <<
113 "__local " << type_name
<T
>() << " scratch[TPB];\n" <<
116 k
.decl
<T
>("sum") << " = 0;\n" <<
117 "for(uint i = 0; i < VPT; i++){\n" <<
118 " if(offset + lid + i*TPB < count){\n" <<
119 " sum = sum + " << first
[k
.var
<uint_
>("offset+lid+i*TPB")] << ";\n" <<
123 "scratch[lid] = sum;\n" <<
126 ReduceBody
<T
,false>::body() <<
128 // write sum to output
130 " output[get_group_id(0)] = scratch[0];\n" <<
133 const context
&context
= queue
.get_context();
134 std::stringstream options
;
135 options
<< "-DVPT=" << vpt
<< " -DTPB=" << tpb
;
136 kernel generic_reduce_kernel
= k
.compile(context
, options
.str());
137 generic_reduce_kernel
.set_arg(output_arg
, result
);
139 size_t work_size
= calculate_work_size(count
, vpt
, tpb
);
141 queue
.enqueue_1d_range_kernel(generic_reduce_kernel
, 0, work_size
, tpb
);
145 inline void initial_reduce(const buffer_iterator
<T
> &first
,
146 const buffer_iterator
<T
> &last
,
147 const buffer
&result
,
148 const plus
<T
> &function
,
149 kernel
&reduce_kernel
,
152 command_queue
&queue
)
156 size_t count
= std::distance(first
, last
);
158 reduce_kernel
.set_arg(0, first
.get_buffer());
159 reduce_kernel
.set_arg(1, uint_(first
.get_index()));
160 reduce_kernel
.set_arg(2, uint_(count
));
161 reduce_kernel
.set_arg(3, result
);
162 reduce_kernel
.set_arg(4, uint_(0));
164 size_t work_size
= calculate_work_size(count
, vpt
, tpb
);
166 queue
.enqueue_1d_range_kernel(reduce_kernel
, 0, work_size
, tpb
);
169 template<class InputIterator
, class T
, class Function
>
170 inline void reduce_on_gpu(InputIterator first
,
172 buffer_iterator
<T
> result
,
174 command_queue
&queue
)
176 const device
&device
= queue
.get_device();
177 const context
&context
= queue
.get_context();
179 detail::meta_kernel
k("reduce");
180 k
.add_arg
<const T
*>(memory_object::global_memory
, "input");
181 k
.add_arg
<const uint_
>("offset");
182 k
.add_arg
<const uint_
>("count");
183 k
.add_arg
<T
*>(memory_object::global_memory
, "output");
184 k
.add_arg
<const uint_
>("output_offset");
187 k
.decl
<const uint_
>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
188 "__global const " << type_name
<T
>() << " *block = input + offset + block_offset;\n" <<
189 k
.decl
<const uint_
>("lid") << " = get_local_id(0);\n" <<
191 "__local " << type_name
<T
>() << " scratch[TPB];\n" <<
193 k
.decl
<T
>("sum") << " = 0;\n" <<
194 "for(uint i = 0; i < VPT; i++){\n" <<
195 " if(block_offset + lid + i*TPB < count){\n" <<
196 " sum = sum + block[lid+i*TPB]; \n" <<
200 "scratch[lid] = sum;\n";
202 // discrimination on vendor name
203 if(is_nvidia_device(device
))
204 k
<< ReduceBody
<T
,true>::body();
206 k
<< ReduceBody
<T
,false>::body();
209 // write sum to output
211 " output[output_offset + get_group_id(0)] = scratch[0];\n" <<
214 std::string cache_key
= std::string("__boost_reduce_on_gpu_") + type_name
<T
>();
217 boost::shared_ptr
<parameter_cache
> parameters
=
218 detail::parameter_cache::get_global_cache(device
);
220 uint_ vpt
= parameters
->get(cache_key
, "vpt", 8);
221 uint_ tpb
= parameters
->get(cache_key
, "tpb", 128);
223 // reduce program compiler flags
224 std::stringstream options
;
225 options
<< "-DT=" << type_name
<T
>()
230 boost::shared_ptr
<program_cache
> cache
=
231 program_cache::get_global_cache(context
);
233 program reduce_program
= cache
->get_or_build(
234 cache_key
, options
.str(), k
.source(), context
237 // create reduce kernel
238 kernel
reduce_kernel(reduce_program
, "reduce");
240 size_t count
= std::distance(first
, last
);
242 // first pass, reduce from input to ping
243 buffer
ping(context
, std::ceil(float(count
) / vpt
/ tpb
) * sizeof(T
));
244 initial_reduce(first
, last
, ping
, function
, reduce_kernel
, vpt
, tpb
, queue
);
246 // update count after initial reduce
247 count
= static_cast<size_t>(std::ceil(float(count
) / vpt
/ tpb
));
249 // middle pass(es), reduce between ping and pong
250 const buffer
*input_buffer
= &ping
;
251 buffer
pong(context
, static_cast<size_t>(count
/ vpt
/ tpb
* sizeof(T
)));
252 const buffer
*output_buffer
= &pong
;
253 if(count
> vpt
* tpb
){
254 while(count
> vpt
* tpb
){
255 reduce_kernel
.set_arg(0, *input_buffer
);
256 reduce_kernel
.set_arg(1, uint_(0));
257 reduce_kernel
.set_arg(2, uint_(count
));
258 reduce_kernel
.set_arg(3, *output_buffer
);
259 reduce_kernel
.set_arg(4, uint_(0));
261 size_t work_size
= static_cast<size_t>(std::ceil(float(count
) / vpt
));
262 if(work_size
% tpb
!= 0){
263 work_size
+= tpb
- work_size
% tpb
;
265 queue
.enqueue_1d_range_kernel(reduce_kernel
, 0, work_size
, tpb
);
267 std::swap(input_buffer
, output_buffer
);
268 count
= static_cast<size_t>(std::ceil(float(count
) / vpt
/ tpb
));
272 // final pass, reduce from ping/pong to result
273 reduce_kernel
.set_arg(0, *input_buffer
);
274 reduce_kernel
.set_arg(1, uint_(0));
275 reduce_kernel
.set_arg(2, uint_(count
));
276 reduce_kernel
.set_arg(3, result
.get_buffer());
277 reduce_kernel
.set_arg(4, uint_(result
.get_index()));
279 queue
.enqueue_1d_range_kernel(reduce_kernel
, 0, tpb
, tpb
);
282 } // end detail namespace
283 } // end compute namespace
284 } // end boost namespace
286 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP