deps/boost_1_63_0/boost/compute/algorithm/detail/reduce_on_gpu.hpp
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP

#include <cmath>
#include <string>
#include <sstream>
#include <iterator>

#include <boost/utility/result_of.hpp>

#include <boost/compute/utility/source.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/vendor.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/work_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>

namespace boost {
namespace compute {
namespace detail {

/// \internal
/// generic tree reduction within a work-group
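///
/// Generates the OpenCL source that folds scratch[0..TPB) down to
/// scratch[0] in log2(TPB) barrier-synchronized passes: at stride i,
/// every thread whose local id is a multiple of 2*i adds in its
/// neighbour at distance i.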
template<typename T, bool isNvidiaDevice>
struct ReduceBody
{
    static std::string body()
    {
        std::stringstream k;
        // local reduction
        k << "for(int i = 1; i < TPB; i <<= 1){\n" <<
             "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
             "    uint mask = (i << 1) - 1;\n" <<
             "    if((lid & mask) == 0){\n" <<
             "        scratch[lid] += scratch[lid+i];\n" <<
             "    }\n" <<
             "}\n";
        return k.str();
    }
};

/// \internal
/// local reduction within a warp
/// for NVIDIA devices we can use the "unsafe"
/// memory optimisation
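///
/// Rationale: the 32 threads of an NVIDIA warp execute in lockstep on
/// the hardware this code targets, so the final steps of the reduction
/// need no barrier; the volatile qualifier on the scratch pointer keeps
/// the compiler from caching local-memory values in registers between
/// steps.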
template<typename T>
struct ReduceBody<T, true>
{
    static std::string body()
    {
        std::stringstream k;
        // local reduction
        // we use TPB to compile only the useful instructions
        // local reduction when size is greater than warp size
        k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "if(TPB >= 1024){\n" <<
            "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
        "if(TPB >= 512){\n" <<
            "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
        "if(TPB >= 256){\n" <<
            "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
        "if(TPB >= 128){\n" <<
            "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<

        // warp reduction
        "if(lid < 32){\n" <<
            // volatile: this way we don't need any barrier
            "volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<
            "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32]; }\n" <<
            "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16]; }\n" <<
            "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8]; }\n" <<
            "if(TPB >=  8) { lmem[lid] = sum = sum + lmem[lid+ 4]; }\n" <<
            "if(TPB >=  4) { lmem[lid] = sum = sum + lmem[lid+ 2]; }\n" <<
            "if(TPB >=  2) { lmem[lid] = sum = sum + lmem[lid+ 1]; }\n" <<
        "}\n";
        return k.str();
    }
};

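/// \internal
/// First pass for arbitrary input iterators: JIT-compiles a dedicated
/// meta_kernel in which each work-group of TPB threads privately sums
/// VPT elements per thread, tree-reduces the partial sums in local
/// memory, and writes one value per group to output. The accumulation
/// is hard-coded as operator+, which is why the function and
/// reduce_kernel parameters are unused on this path.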
template<class InputIterator, class Function>
inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);
    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n" <<

        // local reduction
        ReduceBody<T, false>::body() <<

        // write sum to output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}

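/// \internal
/// First pass specialization for plus<T> over a contiguous device
/// buffer: nothing new to compile, the already-built "reduce" kernel is
/// bound directly to the input buffer and launched.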
template<class T>
inline void initial_reduce(const buffer_iterator<T> &first,
                           const buffer_iterator<T> &last,
                           const buffer &result,
                           const plus<T> &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;

    size_t count = std::distance(first, last);

    reduce_kernel.set_arg(0, first.get_buffer());
    reduce_kernel.set_arg(1, uint_(first.get_index()));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result);
    reduce_kernel.set_arg(4, uint_(0));

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
}

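/// \internal
/// Multi-pass GPU reduction: builds and caches a "reduce" kernel with
/// tunable parameters (vpt = values per thread, tpb = threads per
/// block), then shrinks the input by a factor of vpt*tpb per pass,
/// ping-ponging between two temporary buffers, until a single
/// work-group can write the final value to result.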
template<class InputIterator, class T, class Function>
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB];\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n";

    // discrimination on vendor name
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // write sum to output
        "if(lid == 0){\n" <<
        "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load program
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);

    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from input to ping
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));

    // middle pass(es), reduce between ping and pong
    const buffer *input_buffer = &ping;
    buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));
    const buffer *output_buffer = &pong;
    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));

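            // round the global size up to a whole number of
            // work-groups of tpb threads each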
            size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

            std::swap(input_buffer, output_buffer);
            count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
        }
    }

    // final pass, reduce from ping/pong to result
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}
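
// Usage sketch (illustrative): reduce_on_gpu() is normally reached through
// the public boost::compute::reduce() entry point, but it can be driven
// directly. Here ctx and queue are assumed to be a valid context and
// command_queue for a GPU device:
//
//   boost::compute::vector<int> data(1024, ctx);
//   boost::compute::vector<int> sum(1, ctx);
//   boost::compute::detail::reduce_on_gpu(data.begin(), data.end(),
//                                         sum.begin(),
//                                         boost::compute::plus<int>(),
//                                         queue);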

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP