//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
16 #include <boost/compute/utility/source.hpp>
17 #include <boost/compute/program.hpp>
18 #include <boost/compute/command_queue.hpp>
19 #include <boost/compute/detail/vendor.hpp>
20 #include <boost/compute/detail/parameter_cache.hpp>
21 #include <boost/compute/detail/work_size.hpp>
22 #include <boost/compute/detail/meta_kernel.hpp>
23 #include <boost/compute/type_traits/type_name.hpp>
24 #include <boost/compute/utility/program_cache.hpp>
31 /// body reduction inside a warp
32 template<typename T
,bool isNvidiaDevice
>
35 static std::string
body()
39 k
<< "for(int i = 1; i < TPB; i <<= 1){\n" <<
40 " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
41 " uint mask = (i << 1) - 1;\n" <<
42 " if((lid & mask) == 0){\n" <<
43 " scratch[lid] += scratch[lid+i];\n" <<
51 /// body reduction inside a warp
52 /// for nvidia device we can use the "unsafe"
53 /// memory optimisation
55 struct ReduceBody
<T
,true>
57 static std::string
body()
61 // we use TPB to compile only useful instruction
62 // local reduction when size is greater than warp size
63 k
<< "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
64 "if(TPB >= 1024){\n" <<
65 "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
66 "if(TPB >= 512){\n" <<
67 "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
68 "if(TPB >= 256){\n" <<
69 "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
70 "if(TPB >= 128){\n" <<
71 "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" <<
75 // volatile this way we don't need any barrier
76 "volatile __local " << type_name
<T
>() << " *lmem = scratch;\n" <<
77 "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" <<
78 "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" <<
79 "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" <<
80 "if(TPB >= 8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" <<
81 "if(TPB >= 4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" <<
82 "if(TPB >= 2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" <<
88 template<class InputIterator
, class Function
>
89 inline void initial_reduce(InputIterator first
,
92 const Function
&function
,
93 kernel
&reduce_kernel
,
101 typedef typename
std::iterator_traits
<InputIterator
>::value_type Arg
;
102 typedef typename
boost::tr1_result_of
<Function(Arg
, Arg
)>::type T
;
104 size_t count
= std::distance(first
, last
);
105 detail::meta_kernel
k("initial_reduce");
106 k
.add_set_arg
<const uint_
>("count", uint_(count
));
107 size_t output_arg
= k
.add_arg
<T
*>(memory_object::global_memory
, "output");
110 k
.decl
<const uint_
>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
111 k
.decl
<const uint_
>("lid") << " = get_local_id(0);\n" <<
113 "__local " << type_name
<T
>() << " scratch[TPB];\n" <<
116 k
.decl
<T
>("sum") << " = 0;\n" <<
117 "for(uint i = 0; i < VPT; i++){\n" <<
118 " if(offset + lid + i*TPB < count){\n" <<
119 " sum = sum + " << first
[k
.var
<uint_
>("offset+lid+i*TPB")] << ";\n" <<
123 "scratch[lid] = sum;\n" <<
126 ReduceBody
<T
,false>::body() <<
128 // write sum to output
130 " output[get_group_id(0)] = scratch[0];\n" <<
133 const context
&context
= queue
.get_context();
134 std::stringstream options
;
135 options
<< "-DVPT=" << vpt
<< " -DTPB=" << tpb
;
136 kernel generic_reduce_kernel
= k
.compile(context
, options
.str());
137 generic_reduce_kernel
.set_arg(output_arg
, result
);
139 size_t work_size
= calculate_work_size(count
, vpt
, tpb
);
141 queue
.enqueue_1d_range_kernel(generic_reduce_kernel
, 0, work_size
, tpb
);
145 inline void initial_reduce(const buffer_iterator
<T
> &first
,
146 const buffer_iterator
<T
> &last
,
147 const buffer
&result
,
148 const plus
<T
> &function
,
149 kernel
&reduce_kernel
,
152 command_queue
&queue
)
156 size_t count
= std::distance(first
, last
);
158 reduce_kernel
.set_arg(0, first
.get_buffer());
159 reduce_kernel
.set_arg(1, uint_(first
.get_index()));
160 reduce_kernel
.set_arg(2, uint_(count
));
161 reduce_kernel
.set_arg(3, result
);
162 reduce_kernel
.set_arg(4, uint_(0));
164 size_t work_size
= calculate_work_size(count
, vpt
, tpb
);
166 queue
.enqueue_1d_range_kernel(reduce_kernel
, 0, work_size
, tpb
);
169 template<class InputIterator
, class T
, class Function
>
170 inline void reduce_on_gpu(InputIterator first
,
172 buffer_iterator
<T
> result
,
174 command_queue
&queue
)
176 const device
&device
= queue
.get_device();
177 const context
&context
= queue
.get_context();
179 detail::meta_kernel
k("reduce");
180 k
.add_arg
<const T
*>(memory_object::global_memory
, "input");
181 k
.add_arg
<const uint_
>("offset");
182 k
.add_arg
<const uint_
>("count");
183 k
.add_arg
<T
*>(memory_object::global_memory
, "output");
184 k
.add_arg
<const uint_
>("output_offset");
187 k
.decl
<const uint_
>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
188 "__global const " << type_name
<T
>() << " *block = input + offset + block_offset;\n" <<
189 k
.decl
<const uint_
>("lid") << " = get_local_id(0);\n" <<
191 "__local " << type_name
<T
>() << " scratch[TPB];\n" <<
193 k
.decl
<T
>("sum") << " = 0;\n" <<
194 "for(uint i = 0; i < VPT; i++){\n" <<
195 " if(block_offset + lid + i*TPB < count){\n" <<
196 " sum = sum + block[lid+i*TPB]; \n" <<
200 "scratch[lid] = sum;\n";
202 // discrimination on vendor name
203 if(is_nvidia_device(device
))
204 k
<< ReduceBody
<T
,true>::body();
206 k
<< ReduceBody
<T
,false>::body();
209 // write sum to output
211 " output[output_offset + get_group_id(0)] = scratch[0];\n" <<
214 std::string cache_key
= std::string("__boost_reduce_on_gpu_") + type_name
<T
>();
217 boost::shared_ptr
<parameter_cache
> parameters
=
218 detail::parameter_cache::get_global_cache(device
);
220 uint_ vpt
= parameters
->get(cache_key
, "vpt", 8);
221 uint_ tpb
= parameters
->get(cache_key
, "tpb", 128);
223 // reduce program compiler flags
224 std::stringstream options
;
225 options
<< "-DT=" << type_name
<T
>()
230 boost::shared_ptr
<program_cache
> cache
=
231 program_cache::get_global_cache(context
);
233 program reduce_program
= cache
->get_or_build(
234 cache_key
, options
.str(), k
.source(), context
237 // create reduce kernel
238 kernel
reduce_kernel(reduce_program
, "reduce");
240 size_t count
= std::distance(first
, last
);
242 // first pass, reduce from input to ping
243 buffer
ping(context
, std::ceil(float(count
) / vpt
/ tpb
) * sizeof(T
));
244 initial_reduce(first
, last
, ping
, function
, reduce_kernel
, vpt
, tpb
, queue
);
246 // update count after initial reduce
247 count
= static_cast<size_t>(std::ceil(float(count
) / vpt
/ tpb
));
249 // middle pass(es), reduce between ping and pong
250 const buffer
*input_buffer
= &ping
;
251 buffer
pong(context
, static_cast<size_t>(count
/ vpt
/ tpb
* sizeof(T
)));
252 const buffer
*output_buffer
= &pong
;
253 if(count
> vpt
* tpb
){
254 while(count
> vpt
* tpb
){
255 reduce_kernel
.set_arg(0, *input_buffer
);
256 reduce_kernel
.set_arg(1, uint_(0));
257 reduce_kernel
.set_arg(2, uint_(count
));
258 reduce_kernel
.set_arg(3, *output_buffer
);
259 reduce_kernel
.set_arg(4, uint_(0));
261 size_t work_size
= static_cast<size_t>(std::ceil(float(count
) / vpt
));
262 if(work_size
% tpb
!= 0){
263 work_size
+= tpb
- work_size
% tpb
;
265 queue
.enqueue_1d_range_kernel(reduce_kernel
, 0, work_size
, tpb
);
267 std::swap(input_buffer
, output_buffer
);
268 count
= static_cast<size_t>(std::ceil(float(count
) / vpt
/ tpb
));
272 // final pass, reduce from ping/pong to result
273 reduce_kernel
.set_arg(0, *input_buffer
);
274 reduce_kernel
.set_arg(1, uint_(0));
275 reduce_kernel
.set_arg(2, uint_(count
));
276 reduce_kernel
.set_arg(3, result
.get_buffer());
277 reduce_kernel
.set_arg(4, uint_(result
.get_index()));
279 queue
.enqueue_1d_range_kernel(reduce_kernel
, 0, tpb
, tpb
);
282 } // end detail namespace
283 } // end compute namespace
284 } // end boost namespace
286 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP