2 //===-- parallel_backend_tbb.h --------------------------------------------===//
4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 // See https://llvm.org/LICENSE.txt for license information.
6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 //===----------------------------------------------------------------------===//
10 #ifndef _PSTL_PARALLEL_BACKEND_TBB_H
11 #define _PSTL_PARALLEL_BACKEND_TBB_H
14 #include <type_traits>
16 #include "parallel_backend_utils.h"
18 // Bring in minimal required subset of Intel TBB
19 #include <tbb/blocked_range.h>
20 #include <tbb/parallel_for.h>
21 #include <tbb/parallel_reduce.h>
22 #include <tbb/parallel_scan.h>
23 #include <tbb/parallel_invoke.h>
24 #include <tbb/task_arena.h>
25 #include <tbb/tbb_allocator.h>
28 #if TBB_INTERFACE_VERSION < 10000
29 # error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported.
34 namespace __tbb_backend
37 //! Raw memory buffer with automatic freeing and no exceptions.
38 /** Some of our algorithms need to start with raw memory buffer,
39 not an initialize array, because initialization/destruction
40 would make the span be at least O(N). */
41 // tbb::allocator can improve performance in some cases.
42 template <typename _Tp
>
45 tbb::tbb_allocator
<_Tp
> _M_allocator
;
47 const std::size_t _M_buf_size
;
48 __buffer(const __buffer
&) = delete;
50 operator=(const __buffer
&) = delete;
53 //! Try to obtain buffer of given size to store objects of _Tp type
54 __buffer(std::size_t n
) : _M_allocator(), _M_ptr(_M_allocator
.allocate(n
)), _M_buf_size(n
) {}
55 //! True if buffer was successfully obtained, zero otherwise.
56 operator bool() const { return _M_ptr
!= NULL
; }
57 //! Return pointer to buffer, or NULL if buffer could not be obtained.
64 ~__buffer() { _M_allocator
.deallocate(_M_ptr
, _M_buf_size
); }
67 // Wrapper for tbb::task
71 #if TBB_INTERFACE_VERSION <= 12000
72 tbb::task::self().group()->cancel_group_execution();
74 tbb::task::current_context()->cancel_group_execution();
78 //------------------------------------------------------------------------
80 //------------------------------------------------------------------------
82 template <class _Index
, class _RealBody
>
83 class __parallel_for_body
86 __parallel_for_body(const _RealBody
& __body
) : _M_body(__body
) {}
87 __parallel_for_body(const __parallel_for_body
& __body
) : _M_body(__body
._M_body
) {}
89 operator()(const tbb::blocked_range
<_Index
>& __range
) const
91 _M_body(__range
.begin(), __range
.end());
98 //! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
99 // wrapper over tbb::parallel_for
100 template <class _ExecutionPolicy
, class _Index
, class _Fp
>
102 __parallel_for(__pstl::__internal::__tbb_backend_tag
, _ExecutionPolicy
&&, _Index __first
, _Index __last
, _Fp __f
)
104 tbb::this_task_arena::isolate([=]() {
105 tbb::parallel_for(tbb::blocked_range
<_Index
>(__first
, __last
), __parallel_for_body
<_Index
, _Fp
>(__f
));
109 //! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
110 // wrapper over tbb::parallel_reduce
111 template <class _ExecutionPolicy
, class _Value
, class _Index
, typename _RealBody
, typename _Reduction
>
113 __parallel_reduce(__pstl::__internal::__tbb_backend_tag
, _ExecutionPolicy
&&, _Index __first
, _Index __last
,
114 const _Value
& __identity
, const _RealBody
& __real_body
, const _Reduction
& __reduction
)
116 return tbb::this_task_arena::isolate([__first
, __last
, &__identity
, &__real_body
, &__reduction
]() -> _Value
{
117 return tbb::parallel_reduce(
118 tbb::blocked_range
<_Index
>(__first
, __last
), __identity
,
119 [__real_body
](const tbb::blocked_range
<_Index
>& __r
, const _Value
& __value
) -> _Value
{
120 return __real_body(__r
.begin(), __r
.end(), __value
);
126 //------------------------------------------------------------------------
127 // parallel_transform_reduce
130 // r(i,j,init) returns reduction of init with reduction over [i,j)
131 // u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
132 // c(x,y) combines values x and y that were the result of r or u
133 //------------------------------------------------------------------------
135 template <class _Index
, class _Up
, class _Tp
, class _Cp
, class _Rp
>
136 struct __par_trans_red_body
138 alignas(_Tp
) char _M_sum_storage
[sizeof(_Tp
)]; // Holds generalized non-commutative sum when has_sum==true
139 _Rp _M_brick_reduce
; // Most likely to have non-empty layout
142 bool _M_has_sum
; // Put last to minimize size of class
146 _PSTL_ASSERT_MSG(_M_has_sum
, "sum expected");
147 return *(_Tp
*)_M_sum_storage
;
149 __par_trans_red_body(_Up __u
, _Tp __init
, _Cp __c
, _Rp __r
)
150 : _M_brick_reduce(__r
), _M_u(__u
), _M_combine(__c
), _M_has_sum(true)
152 new (_M_sum_storage
) _Tp(__init
);
155 __par_trans_red_body(__par_trans_red_body
& __left
, tbb::split
)
156 : _M_brick_reduce(__left
._M_brick_reduce
), _M_u(__left
._M_u
), _M_combine(__left
._M_combine
), _M_has_sum(false)
160 ~__par_trans_red_body()
162 // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
168 join(__par_trans_red_body
& __rhs
)
170 sum() = _M_combine(sum(), __rhs
.sum());
174 operator()(const tbb::blocked_range
<_Index
>& __range
)
176 _Index __i
= __range
.begin();
177 _Index __j
= __range
.end();
180 _PSTL_ASSERT_MSG(__range
.size() > 1, "there should be at least 2 elements");
181 new (&_M_sum_storage
)
182 _Tp(_M_combine(_M_u(__i
), _M_u(__i
+ 1))); // The condition i+1 < j is provided by the grain size of 3
184 std::advance(__i
, 2);
188 sum() = _M_brick_reduce(__i
, __j
, sum());
192 template <class _ExecutionPolicy
, class _Index
, class _Up
, class _Tp
, class _Cp
, class _Rp
>
194 __parallel_transform_reduce(__pstl::__internal::__tbb_backend_tag
, _ExecutionPolicy
&&, _Index __first
, _Index __last
,
195 _Up __u
, _Tp __init
, _Cp __combine
, _Rp __brick_reduce
)
197 __tbb_backend::__par_trans_red_body
<_Index
, _Up
, _Tp
, _Cp
, _Rp
> __body(__u
, __init
, __combine
, __brick_reduce
);
198 // The grain size of 3 is used in order to provide mininum 2 elements for each body
199 tbb::this_task_arena::isolate(
200 [__first
, __last
, &__body
]() { tbb::parallel_reduce(tbb::blocked_range
<_Index
>(__first
, __last
, 3), __body
); });
204 //------------------------------------------------------------------------
206 //------------------------------------------------------------------------
208 template <class _Index
, class _Up
, class _Tp
, class _Cp
, class _Rp
, class _Sp
>
209 class __trans_scan_body
211 alignas(_Tp
) char _M_sum_storage
[sizeof(_Tp
)]; // Holds generalized non-commutative sum when has_sum==true
212 _Rp _M_brick_reduce
; // Most likely to have non-empty layout
216 bool _M_has_sum
; // Put last to minimize size of class
218 __trans_scan_body(_Up __u
, _Tp __init
, _Cp __combine
, _Rp __reduce
, _Sp __scan
)
219 : _M_brick_reduce(__reduce
), _M_u(__u
), _M_combine(__combine
), _M_scan(__scan
), _M_has_sum(true)
221 new (_M_sum_storage
) _Tp(__init
);
224 __trans_scan_body(__trans_scan_body
& __b
, tbb::split
)
225 : _M_brick_reduce(__b
._M_brick_reduce
), _M_u(__b
._M_u
), _M_combine(__b
._M_combine
), _M_scan(__b
._M_scan
),
232 // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
240 _PSTL_ASSERT_MSG(_M_has_sum
, "sum expected");
241 return *const_cast<_Tp
*>(reinterpret_cast<_Tp
const*>(_M_sum_storage
));
245 operator()(const tbb::blocked_range
<_Index
>& __range
, tbb::pre_scan_tag
)
247 _Index __i
= __range
.begin();
248 _Index __j
= __range
.end();
251 new (&_M_sum_storage
) _Tp(_M_u(__i
));
257 sum() = _M_brick_reduce(__i
, __j
, sum());
261 operator()(const tbb::blocked_range
<_Index
>& __range
, tbb::final_scan_tag
)
263 sum() = _M_scan(__range
.begin(), __range
.end(), sum());
267 reverse_join(__trans_scan_body
& __a
)
271 sum() = _M_combine(__a
.sum(), sum());
275 new (&_M_sum_storage
) _Tp(__a
.sum());
281 assign(__trans_scan_body
& __b
)
287 template <typename _Index
>
292 while (2 * __k
< __m
)
297 //------------------------------------------------------------------------
298 // __parallel_strict_scan
299 //------------------------------------------------------------------------
301 template <typename _Index
, typename _Tp
, typename _Rp
, typename _Cp
>
303 __upsweep(_Index __i
, _Index __m
, _Index __tilesize
, _Tp
* __r
, _Index __lastsize
, _Rp __reduce
, _Cp __combine
)
306 __r
[0] = __reduce(__i
* __tilesize
, __lastsize
);
309 _Index __k
= __split(__m
);
310 tbb::parallel_invoke(
311 [=] { __tbb_backend::__upsweep(__i
, __k
, __tilesize
, __r
, __tilesize
, __reduce
, __combine
); },
313 __tbb_backend::__upsweep(__i
+ __k
, __m
- __k
, __tilesize
, __r
+ __k
, __lastsize
, __reduce
, __combine
);
316 __r
[__m
- 1] = __combine(__r
[__k
- 1], __r
[__m
- 1]);
320 template <typename _Index
, typename _Tp
, typename _Cp
, typename _Sp
>
322 __downsweep(_Index __i
, _Index __m
, _Index __tilesize
, _Tp
* __r
, _Index __lastsize
, _Tp __initial
, _Cp __combine
,
326 __scan(__i
* __tilesize
, __lastsize
, __initial
);
329 const _Index __k
= __split(__m
);
330 tbb::parallel_invoke(
331 [=] { __tbb_backend::__downsweep(__i
, __k
, __tilesize
, __r
, __tilesize
, __initial
, __combine
, __scan
); },
332 // Assumes that __combine never throws.
333 //TODO: Consider adding a requirement for user functors to be constant.
335 __tbb_backend::__downsweep(__i
+ __k
, __m
- __k
, __tilesize
, __r
+ __k
, __lastsize
,
336 __combine(__initial
, __r
[__k
- 1]), __combine
, __scan
);
341 // Adapted from Intel(R) Cilk(TM) version from cilkpub.
342 // Let i:len denote a counted interval of length n starting at i. s denotes a generalized-sum value.
343 // Expected actions of the functors are:
344 // reduce(i,len) -> s -- return reduction value of i:len.
345 // combine(s1,s2) -> s -- return merged sum
346 // apex(s) -- do any processing necessary between reduce and scan.
347 // scan(i,len,initial) -- perform scan over i:len starting with initial.
348 // The initial range 0:n is partitioned into consecutive subranges.
349 // reduce and scan are each called exactly once per subrange.
350 // Thus callers can rely upon side effects in reduce.
351 // combine must not throw an exception.
352 // apex is called exactly once, after all calls to reduce and before all calls to scan.
353 // For example, it's useful for allocating a __buffer used by scan but whose size is the sum of all reduction values.
354 // T must have a trivial constructor and destructor.
355 template <class _ExecutionPolicy
, typename _Index
, typename _Tp
, typename _Rp
, typename _Cp
, typename _Sp
, typename _Ap
>
357 __parallel_strict_scan(__pstl::__internal::__tbb_backend_tag
, _ExecutionPolicy
&&, _Index __n
, _Tp __initial
,
358 _Rp __reduce
, _Cp __combine
, _Sp __scan
, _Ap __apex
)
360 tbb::this_task_arena::isolate([=, &__combine
]() {
363 _Index __p
= tbb::this_task_arena::max_concurrency();
364 const _Index __slack
= 4;
365 _Index __tilesize
= (__n
- 1) / (__slack
* __p
) + 1;
366 _Index __m
= (__n
- 1) / __tilesize
;
367 __buffer
<_Tp
> __buf(__m
+ 1);
368 _Tp
* __r
= __buf
.get();
369 __tbb_backend::__upsweep(_Index(0), _Index(__m
+ 1), __tilesize
, __r
, __n
- __m
* __tilesize
, __reduce
,
372 // When __apex is a no-op and __combine has no side effects, a good optimizer
373 // should be able to eliminate all code between here and __apex.
374 // Alternatively, provide a default value for __apex that can be
375 // recognized by metaprogramming that conditionlly executes the following.
376 size_t __k
= __m
+ 1;
377 _Tp __t
= __r
[__k
- 1];
378 while ((__k
&= __k
- 1))
379 __t
= __combine(__r
[__k
- 1], __t
);
380 __apex(__combine(__initial
, __t
));
381 __tbb_backend::__downsweep(_Index(0), _Index(__m
+ 1), __tilesize
, __r
, __n
- __m
* __tilesize
, __initial
,
385 // Fewer than 2 elements in sequence, or out of memory. Handle has single block.
386 _Tp __sum
= __initial
;
388 __sum
= __combine(__sum
, __reduce(_Index(0), __n
));
391 __scan(_Index(0), __n
, __initial
);
395 template <class _ExecutionPolicy
, class _Index
, class _Up
, class _Tp
, class _Cp
, class _Rp
, class _Sp
>
397 __parallel_transform_scan(__pstl::__internal::__tbb_backend_tag
, _ExecutionPolicy
&&, _Index __n
, _Up __u
, _Tp __init
,
398 _Cp __combine
, _Rp __brick_reduce
, _Sp __scan
)
400 __trans_scan_body
<_Index
, _Up
, _Tp
, _Cp
, _Rp
, _Sp
> __body(__u
, __init
, __combine
, __brick_reduce
, __scan
);
401 auto __range
= tbb::blocked_range
<_Index
>(0, __n
);
402 tbb::this_task_arena::isolate([__range
, &__body
]() { tbb::parallel_scan(__range
, __body
); });
406 //------------------------------------------------------------------------
407 // parallel_stable_sort
408 //------------------------------------------------------------------------
410 //------------------------------------------------------------------------
411 // stable_sort utilities
413 // These are used by parallel implementations but do not depend on them.
414 //------------------------------------------------------------------------
415 #define _PSTL_MERGE_CUT_OFF 2000
417 template <typename _Func
>
419 template <typename _Func
>
422 #if TBB_INTERFACE_VERSION <= 12000
423 class __task
: public tbb::task
426 template <typename _Fn
>
428 make_continuation(_Fn
&& __f
)
430 return new (allocate_continuation()) __func_task
<typename
std::decay
<_Fn
>::type
>(std::forward
<_Fn
>(__f
));
433 template <typename _Fn
>
435 make_child_of(__task
* parent
, _Fn
&& __f
)
437 return new (parent
->allocate_child()) __func_task
<typename
std::decay
<_Fn
>::type
>(std::forward
<_Fn
>(__f
));
440 template <typename _Fn
>
442 make_additional_child_of(tbb::task
* parent
, _Fn
&& __f
)
444 return new (tbb::task::allocate_additional_child_of(*parent
))
445 __func_task
<typename
std::decay
<_Fn
>::type
>(std::forward
<_Fn
>(__f
));
449 recycle_as_continuation()
451 tbb::task::recycle_as_continuation();
455 recycle_as_child_of(__task
* parent
)
457 tbb::task::recycle_as_child_of(*parent
);
463 tbb::task::spawn(*__t
);
466 template <typename _Fn
>
468 spawn_root_and_wait(__root_task
<_Fn
>& __root
)
470 tbb::task::spawn_root_and_wait(*__root
._M_task
);
474 template <typename _Func
>
475 class __func_task
: public __task
482 return _M_func(this);
486 template <typename _Fn
>
487 __func_task(_Fn
&& __f
) : _M_func
{std::forward
<_Fn
>(__f
)}
498 template <typename _Func
>
504 template <typename
... Args
>
505 __root_task(Args
&&... args
)
506 : _M_task
{new (tbb::task::allocate_root()) __func_task
<_Func
>{_Func(std::forward
<Args
>(args
)...)}}
511 friend class __func_task
<_Func
>;
514 #else // TBB_INTERFACE_VERSION <= 12000
515 class __task
: public tbb::detail::d1::task
518 tbb::detail::d1::small_object_allocator _M_allocator
{};
519 tbb::detail::d1::execution_data
* _M_execute_data
{};
521 std::atomic
<int> _M_refcount
{};
524 template <typename _Fn
>
526 allocate_func_task(_Fn
&& __f
)
528 _PSTL_ASSERT(_M_execute_data
!= nullptr);
529 tbb::detail::d1::small_object_allocator __alloc
{};
531 __alloc
.new_object
<__func_task
<typename
std::decay
<_Fn
>::type
>>(*_M_execute_data
, std::forward
<_Fn
>(__f
));
532 __t
->_M_allocator
= __alloc
;
544 set_ref_count(int __n
)
546 _M_refcount
.store(__n
, std::memory_order_release
);
549 template <typename _Fn
>
551 make_continuation(_Fn
&& __f
)
553 auto __t
= allocate_func_task(std::forward
<_Fn
&&>(__f
));
554 __t
->_M_parent
= _M_parent
;
559 template <typename _Fn
>
561 make_child_of(__task
* __parent
, _Fn
&& __f
)
563 auto __t
= allocate_func_task(std::forward
<_Fn
&&>(__f
));
564 __t
->_M_parent
= __parent
;
568 template <typename _Fn
>
570 make_additional_child_of(__task
* __parent
, _Fn
&& __f
)
572 auto __t
= make_child_of(__parent
, std::forward
<_Fn
>(__f
));
573 _PSTL_ASSERT(__parent
->_M_refcount
.load(std::memory_order_relaxed
) > 0);
574 ++__parent
->_M_refcount
;
579 recycle_as_continuation()
585 recycle_as_child_of(__task
* parent
)
594 _PSTL_ASSERT(_M_execute_data
!= nullptr);
595 tbb::detail::d1::spawn(*__t
, *_M_execute_data
->context
);
598 template <typename _Fn
>
600 spawn_root_and_wait(__root_task
<_Fn
>& __root
)
602 tbb::detail::d1::execute_and_wait(*__root
._M_func_task
, __root
._M_context
, __root
._M_wait_object
,
606 template <typename _Func
>
607 friend class __func_task
;
610 template <typename _Func
>
611 class __func_task
: public __task
616 execute(tbb::detail::d1::execution_data
& __ed
) override
618 _M_execute_data
= &__ed
;
620 __task
* __next
= _M_func(this);
621 return finalize(__next
);
625 cancel(tbb::detail::d1::execution_data
& __ed
) override
627 return finalize(nullptr);
631 finalize(__task
* __next
)
633 bool __recycle
= _M_recycle
;
641 auto __parent
= _M_parent
;
642 auto __alloc
= _M_allocator
;
643 auto __ed
= _M_execute_data
;
645 this->~__func_task();
647 _PSTL_ASSERT(__parent
!= nullptr);
648 _PSTL_ASSERT(__parent
->_M_refcount
.load(std::memory_order_relaxed
) > 0);
649 if (--__parent
->_M_refcount
== 0)
651 _PSTL_ASSERT(__next
== nullptr);
652 __alloc
.deallocate(this, *__ed
);
659 friend class __root_task
<_Func
>;
662 template <typename _Fn
>
663 __func_task(_Fn
&& __f
) : _M_func(std::forward
<_Fn
>(__f
))
674 template <typename _Func
>
675 class __root_task
: public __task
678 execute(tbb::detail::d1::execution_data
& __ed
) override
680 _M_wait_object
.release();
685 cancel(tbb::detail::d1::execution_data
& __ed
) override
687 _M_wait_object
.release();
691 __func_task
<_Func
>* _M_func_task
{};
692 tbb::detail::d1::wait_context _M_wait_object
{0};
693 tbb::task_group_context _M_context
{};
696 template <typename
... Args
>
697 __root_task(Args
&&... args
) : _M_wait_object
{1}
699 tbb::detail::d1::small_object_allocator __alloc
{};
700 _M_func_task
= __alloc
.new_object
<__func_task
<_Func
>>(_Func(std::forward
<Args
>(args
)...));
701 _M_func_task
->_M_allocator
= __alloc
;
702 _M_func_task
->_M_parent
= this;
703 _M_refcount
.store(1, std::memory_order_relaxed
);
708 #endif // TBB_INTERFACE_VERSION <= 12000
710 template <typename _RandomAccessIterator1
, typename _RandomAccessIterator2
, typename _Compare
, typename _Cleanup
,
714 typedef typename
std::iterator_traits
<_RandomAccessIterator1
>::difference_type _DifferenceType1
;
715 typedef typename
std::iterator_traits
<_RandomAccessIterator2
>::difference_type _DifferenceType2
;
716 typedef typename
std::common_type
<_DifferenceType1
, _DifferenceType2
>::type _SizeType
;
717 typedef typename
std::iterator_traits
<_RandomAccessIterator1
>::value_type _ValueType
;
719 _RandomAccessIterator1 _M_x_beg
;
720 _RandomAccessIterator2 _M_z_beg
;
722 _SizeType _M_xs
, _M_xe
;
723 _SizeType _M_ys
, _M_ye
;
726 _LeafMerge _M_leaf_merge
;
727 _SizeType _M_nsort
; //number of elements to be sorted for partial_sort alforithm
729 static const _SizeType __merge_cut_off
= _PSTL_MERGE_CUT_OFF
;
731 bool _root
; //means a task is merging root task
732 bool _x_orig
; //"true" means X(or left ) subrange is in the original container; false - in the buffer
733 bool _y_orig
; //"true" means Y(or right) subrange is in the original container; false - in the buffer
734 bool _split
; //"true" means a merge task is a split task for parallel merging, the execution logic differs
744 template <typename Iterator1
, typename Iterator2
>
746 operator()(Iterator1 __x
, Iterator2 __z
)
748 *__z
= std::move(*__x
);
752 struct __move_value_construct
754 template <typename Iterator1
, typename Iterator2
>
756 operator()(Iterator1 __x
, Iterator2 __z
)
758 ::new (std::addressof(*__z
)) _ValueType(std::move(*__x
));
764 template <typename Iterator1
, typename Iterator2
>
766 operator()(Iterator1 __first1
, Iterator1 __last1
, Iterator2 __first2
)
768 if (__last1
- __first1
< __merge_cut_off
)
769 return std::move(__first1
, __last1
, __first2
);
771 auto __n
= __last1
- __first1
;
772 tbb::parallel_for(tbb::blocked_range
<_SizeType
>(0, __n
, __merge_cut_off
),
773 [__first1
, __first2
](const tbb::blocked_range
<_SizeType
>& __range
) {
774 std::move(__first1
+ __range
.begin(), __first1
+ __range
.end(),
775 __first2
+ __range
.begin());
777 return __first2
+ __n
;
781 struct __move_range_construct
783 template <typename Iterator1
, typename Iterator2
>
785 operator()(Iterator1 __first1
, Iterator1 __last1
, Iterator2 __first2
)
787 if (__last1
- __first1
< __merge_cut_off
)
789 for (; __first1
!= __last1
; ++__first1
, ++__first2
)
790 __move_value_construct()(__first1
, __first2
);
794 auto __n
= __last1
- __first1
;
795 tbb::parallel_for(tbb::blocked_range
<_SizeType
>(0, __n
, __merge_cut_off
),
796 [__first1
, __first2
](const tbb::blocked_range
<_SizeType
>& __range
) {
797 for (auto i
= __range
.begin(); i
!= __range
.end(); ++i
)
798 __move_value_construct()(__first1
+ i
, __first2
+ i
);
800 return __first2
+ __n
;
804 struct __cleanup_range
806 template <typename _Iterator
>
808 operator()(_Iterator __first
, _Iterator __last
)
810 if (__last
- __first
< __merge_cut_off
)
811 _Cleanup()(__first
, __last
);
814 auto __n
= __last
- __first
;
815 tbb::parallel_for(tbb::blocked_range
<_SizeType
>(0, __n
, __merge_cut_off
),
816 [__first
](const tbb::blocked_range
<_SizeType
>& __range
) {
817 _Cleanup()(__first
+ __range
.begin(), __first
+ __range
.end());
824 __merge_func(_SizeType __xs
, _SizeType __xe
, _SizeType __ys
, _SizeType __ye
, _SizeType __zs
, _Compare __comp
,
825 _Cleanup
, _LeafMerge __leaf_merge
, _SizeType __nsort
, _RandomAccessIterator1 __x_beg
,
826 _RandomAccessIterator2 __z_beg
, bool __x_orig
, bool __y_orig
, bool __root
)
827 : _M_xs(__xs
), _M_xe(__xe
), _M_ys(__ys
), _M_ye(__ye
), _M_zs(__zs
), _M_x_beg(__x_beg
), _M_z_beg(__z_beg
),
828 _M_comp(__comp
), _M_leaf_merge(__leaf_merge
), _M_nsort(__nsort
), _root(__root
),
829 _x_orig(__x_orig
), _y_orig(__y_orig
), _split(false)
834 is_left(_SizeType __idx
) const
836 return _M_xs
== __idx
;
839 template <typename IndexType
>
841 set_odd(IndexType __idx
, bool __on_off
)
850 operator()(__task
* __self
);
854 parent_merge(__task
* __self
) const
856 return _root
? nullptr : &static_cast<__func_task
<__merge_func
>*>(__self
->parent())->body();
861 const auto __nx
= (_M_xe
- _M_xs
);
862 const auto __ny
= (_M_ye
- _M_ys
);
863 _PSTL_ASSERT(__nx
> 0 && __ny
> 0);
865 _PSTL_ASSERT(_x_orig
== _y_orig
);
866 _PSTL_ASSERT(!is_partial());
870 _PSTL_ASSERT(std::is_sorted(_M_x_beg
+ _M_xs
, _M_x_beg
+ _M_xe
, _M_comp
));
871 _PSTL_ASSERT(std::is_sorted(_M_x_beg
+ _M_ys
, _M_x_beg
+ _M_ye
, _M_comp
));
872 return !_M_comp(*(_M_x_beg
+ _M_ys
), *(_M_x_beg
+ _M_xe
- 1));
875 _PSTL_ASSERT(std::is_sorted(_M_z_beg
+ _M_xs
, _M_z_beg
+ _M_xe
, _M_comp
));
876 _PSTL_ASSERT(std::is_sorted(_M_z_beg
+ _M_ys
, _M_z_beg
+ _M_ye
, _M_comp
));
877 return !_M_comp(*(_M_z_beg
+ _M_zs
+ __nx
), *(_M_z_beg
+ _M_zs
+ __nx
- 1));
882 const auto __nx
= (_M_xe
- _M_xs
);
883 const auto __ny
= (_M_ye
- _M_ys
);
884 _PSTL_ASSERT(__nx
> 0 && __ny
> 0);
887 __move_range_construct()(_M_x_beg
+ _M_xs
, _M_x_beg
+ _M_xe
, _M_z_beg
+ _M_zs
);
890 __move_range()(_M_z_beg
+ _M_zs
, _M_z_beg
+ _M_zs
+ __nx
, _M_x_beg
+ _M_xs
);
891 __cleanup_range()(_M_z_beg
+ _M_zs
, _M_z_beg
+ _M_zs
+ __nx
);
899 const auto __nx
= (_M_xe
- _M_xs
);
900 const auto __ny
= (_M_ye
- _M_ys
);
903 __move_range_construct()(_M_x_beg
+ _M_ys
, _M_x_beg
+ _M_ye
, _M_z_beg
+ _M_zs
+ __nx
);
906 __move_range()(_M_z_beg
+ _M_zs
+ __nx
, _M_z_beg
+ _M_zs
+ __nx
+ __ny
, _M_x_beg
+ _M_ys
);
907 __cleanup_range()(_M_z_beg
+ _M_zs
+ __nx
, _M_z_beg
+ _M_zs
+ __nx
+ __ny
);
913 merge_ranges(__task
* __self
)
915 _PSTL_ASSERT(_x_orig
== _y_orig
); //two merged subrange must be lie into the same buffer
917 const auto __nx
= (_M_xe
- _M_xs
);
918 const auto __ny
= (_M_ye
- _M_ys
);
919 const auto __n
= __nx
+ __ny
;
921 // need to merge {x} and {y}
922 if (__n
> __merge_cut_off
)
923 return split_merging(__self
);
928 _M_leaf_merge(_M_x_beg
+ _M_xs
, _M_x_beg
+ _M_xe
, _M_x_beg
+ _M_ys
, _M_x_beg
+ _M_ye
, _M_z_beg
+ _M_zs
,
929 _M_comp
, __move_value_construct(), __move_value_construct(), __move_range_construct(),
930 __move_range_construct());
931 _PSTL_ASSERT(parent_merge(__self
)); //not root merging task
936 _PSTL_ASSERT(_x_orig
== _y_orig
);
938 _PSTL_ASSERT(is_partial() || std::is_sorted(_M_z_beg
+ _M_xs
, _M_z_beg
+ _M_xe
, _M_comp
));
939 _PSTL_ASSERT(is_partial() || std::is_sorted(_M_z_beg
+ _M_ys
, _M_z_beg
+ _M_ye
, _M_comp
));
941 const auto __nx
= (_M_xe
- _M_xs
);
942 const auto __ny
= (_M_ye
- _M_ys
);
944 _M_leaf_merge(_M_z_beg
+ _M_xs
, _M_z_beg
+ _M_xe
, _M_z_beg
+ _M_ys
, _M_z_beg
+ _M_ye
, _M_x_beg
+ _M_zs
,
945 _M_comp
, __move_value(), __move_value(), __move_range(), __move_range());
947 __cleanup_range()(_M_z_beg
+ _M_xs
, _M_z_beg
+ _M_xe
);
948 __cleanup_range()(_M_z_beg
+ _M_ys
, _M_z_beg
+ _M_ye
);
954 process_ranges(__task
* __self
)
956 _PSTL_ASSERT(_x_orig
== _y_orig
);
957 _PSTL_ASSERT(!_split
);
959 auto p
= parent_merge(__self
);
962 { //root merging task
964 //optimization, just for sort algorithm, //{x} <= {y}
965 if (!is_partial() && x_less_y()) //we have a solution
968 { //we have to move the solution to the origin
969 move_x_range(); //parallel moving
970 move_y_range(); //parallel moving
974 //else: if we have data in the origin,
975 //we have to move data to the buffer for final merging into the origin.
978 move_x_range(); //parallel moving
979 move_y_range(); //parallel moving
981 // need to merge {x} and {y}.
982 return merge_ranges(__self
);
984 //else: not root merging task (parent_merge() == NULL)
985 //optimization, just for sort algorithm, //{x} <= {y}
986 if (!is_partial() && x_less_y())
988 const auto id_range
= _M_zs
;
989 p
->set_odd(id_range
, _x_orig
);
992 //else: we have to revert "_x(y)_orig" flag of the parent merging task
993 const auto id_range
= _M_zs
;
994 p
->set_odd(id_range
, !_x_orig
);
996 return merge_ranges(__self
);
999 //splitting as merge task into 2 of the same level
1001 split_merging(__task
* __self
)
1003 _PSTL_ASSERT(_x_orig
== _y_orig
);
1004 const auto __nx
= (_M_xe
- _M_xs
);
1005 const auto __ny
= (_M_ye
- _M_ys
);
1011 __ym
= _M_ys
+ __ny
/ 2;
1014 __xm
= std::upper_bound(_M_x_beg
+ _M_xs
, _M_x_beg
+ _M_xe
, *(_M_x_beg
+ __ym
), _M_comp
) - _M_x_beg
;
1016 __xm
= std::upper_bound(_M_z_beg
+ _M_xs
, _M_z_beg
+ _M_xe
, *(_M_z_beg
+ __ym
), _M_comp
) - _M_z_beg
;
1020 __xm
= _M_xs
+ __nx
/ 2;
1023 __ym
= std::lower_bound(_M_x_beg
+ _M_ys
, _M_x_beg
+ _M_ye
, *(_M_x_beg
+ __xm
), _M_comp
) - _M_x_beg
;
1025 __ym
= std::lower_bound(_M_z_beg
+ _M_ys
, _M_z_beg
+ _M_ye
, *(_M_z_beg
+ __xm
), _M_comp
) - _M_z_beg
;
1028 auto __zm
= _M_zs
+ ((__xm
- _M_xs
) + (__ym
- _M_ys
));
1029 __merge_func
__right_func(__xm
, _M_xe
, __ym
, _M_ye
, __zm
, _M_comp
, _Cleanup(), _M_leaf_merge
, _M_nsort
,
1030 _M_x_beg
, _M_z_beg
, _x_orig
, _y_orig
, _root
);
1031 __right_func
._split
= true;
1032 auto __merge_task
= __self
->make_additional_child_of(__self
->parent(), std::move(__right_func
));
1033 __self
->spawn(__merge_task
);
1034 __self
->recycle_as_continuation();
1044 template <typename _RandomAccessIterator1
, typename _RandomAccessIterator2
, typename __M_Compare
, typename _Cleanup
,
1045 typename _LeafMerge
>
1047 __merge_func
<_RandomAccessIterator1
, _RandomAccessIterator2
, __M_Compare
, _Cleanup
, _LeafMerge
>::
1048 operator()(__task
* __self
)
1050 //a. split merge task into 2 of the same level; the special logic,
1051 //without processing(process_ranges) adjacent sub-ranges x and y
1053 return merge_ranges(__self
);
1055 //b. General merging of adjacent sub-ranges x and y (with optimization in case of {x} <= {y} )
1057 //1. x and y are in the even buffer
1058 //2. x and y are in the odd buffer
1059 if (_x_orig
== _y_orig
)
1060 return process_ranges(__self
);
1062 //3. x is in even buffer, y is in the odd buffer
1063 //4. x is in odd buffer, y is in the even buffer
1064 if (!parent_merge(__self
))
1073 const _SizeType __nx
= (_M_xe
- _M_xs
);
1074 const _SizeType __ny
= (_M_ye
- _M_ys
);
1075 _PSTL_ASSERT(__nx
> 0);
1076 _PSTL_ASSERT(__nx
> 0);
1084 return process_ranges(__self
);
1087 template <typename _RandomAccessIterator1
, typename _RandomAccessIterator2
, typename _Compare
, typename _LeafSort
>
1088 class __stable_sort_func
1091 typedef typename
std::iterator_traits
<_RandomAccessIterator1
>::difference_type _DifferenceType1
;
1092 typedef typename
std::iterator_traits
<_RandomAccessIterator2
>::difference_type _DifferenceType2
;
1093 typedef typename
std::common_type
<_DifferenceType1
, _DifferenceType2
>::type _SizeType
;
1096 _RandomAccessIterator1 _M_xs
, _M_xe
, _M_x_beg
;
1097 _RandomAccessIterator2 _M_zs
, _M_z_beg
;
1099 _LeafSort _M_leaf_sort
;
1101 _SizeType _M_nsort
; //zero or number of elements to be sorted for partial_sort alforithm
1104 __stable_sort_func(_RandomAccessIterator1 __xs
, _RandomAccessIterator1 __xe
, _RandomAccessIterator2 __zs
,
1105 bool __root
, _Compare __comp
, _LeafSort __leaf_sort
, _SizeType __nsort
,
1106 _RandomAccessIterator1 __x_beg
, _RandomAccessIterator2 __z_beg
)
1107 : _M_xs(__xs
), _M_xe(__xe
), _M_x_beg(__x_beg
), _M_zs(__zs
), _M_z_beg(__z_beg
), _M_comp(__comp
),
1108 _M_leaf_sort(__leaf_sort
), _M_root(__root
), _M_nsort(__nsort
)
1113 operator()(__task
* __self
);
1116 #define _PSTL_STABLE_SORT_CUT_OFF 500
1118 template <typename _RandomAccessIterator1
, typename _RandomAccessIterator2
, typename _Compare
, typename _LeafSort
>
1120 __stable_sort_func
<_RandomAccessIterator1
, _RandomAccessIterator2
, _Compare
, _LeafSort
>::operator()(__task
* __self
)
1122 typedef __merge_func
<_RandomAccessIterator1
, _RandomAccessIterator2
, _Compare
, __utils::__serial_destroy
,
1123 __utils::__serial_move_merge
>
1126 const _SizeType __n
= _M_xe
- _M_xs
;
1127 const _SizeType __nmerge
= _M_nsort
> 0 ? _M_nsort
: __n
;
1128 const _SizeType __sort_cut_off
= _PSTL_STABLE_SORT_CUT_OFF
;
1129 if (__n
<= __sort_cut_off
)
1131 _M_leaf_sort(_M_xs
, _M_xe
, _M_comp
);
1132 _PSTL_ASSERT(!_M_root
);
1136 const _RandomAccessIterator1 __xm
= _M_xs
+ __n
/ 2;
1137 const _RandomAccessIterator2 __zm
= _M_zs
+ (__xm
- _M_xs
);
1138 const _RandomAccessIterator2 __ze
= _M_zs
+ __n
;
1139 _MergeTaskType
__m(_MergeTaskType(_M_xs
- _M_x_beg
, __xm
- _M_x_beg
, __xm
- _M_x_beg
, _M_xe
- _M_x_beg
,
1140 _M_zs
- _M_z_beg
, _M_comp
, __utils::__serial_destroy(),
1141 __utils::__serial_move_merge(__nmerge
), _M_nsort
, _M_x_beg
, _M_z_beg
,
1142 /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root
));
1143 auto __parent
= __self
->make_continuation(std::move(__m
));
1144 __parent
->set_ref_count(2);
1145 auto __right
= __self
->make_child_of(
1146 __parent
, __stable_sort_func(__xm
, _M_xe
, __zm
, false, _M_comp
, _M_leaf_sort
, _M_nsort
, _M_x_beg
, _M_z_beg
));
1147 __self
->spawn(__right
);
1148 __self
->recycle_as_child_of(__parent
);
1155 template <class _ExecutionPolicy
, typename _RandomAccessIterator
, typename _Compare
, typename _LeafSort
>
1157 __parallel_stable_sort(__pstl::__internal::__tbb_backend_tag
, _ExecutionPolicy
&&, _RandomAccessIterator __xs
,
1158 _RandomAccessIterator __xe
, _Compare __comp
, _LeafSort __leaf_sort
, std::size_t __nsort
= 0)
1160 tbb::this_task_arena::isolate([=, &__nsort
]() {
1161 //sorting based on task tree and parallel merge
1162 typedef typename
std::iterator_traits
<_RandomAccessIterator
>::value_type _ValueType
;
1163 typedef typename
std::iterator_traits
<_RandomAccessIterator
>::difference_type _DifferenceType
;
1164 const _DifferenceType __n
= __xe
- __xs
;
1166 __nsort
= 0; // 'partial_sort' becames 'sort'
1168 const _DifferenceType __sort_cut_off
= _PSTL_STABLE_SORT_CUT_OFF
;
1169 if (__n
> __sort_cut_off
)
1171 __buffer
<_ValueType
> __buf(__n
);
1172 __root_task
<__stable_sort_func
<_RandomAccessIterator
, _ValueType
*, _Compare
, _LeafSort
>> __root
{
1173 __xs
, __xe
, __buf
.get(), true, __comp
, __leaf_sort
, __nsort
, __xs
, __buf
.get()};
1174 __task::spawn_root_and_wait(__root
);
1178 __leaf_sort(__xs
, __xe
, __comp
);
1182 //------------------------------------------------------------------------
1184 //------------------------------------------------------------------------
1185 template <typename _RandomAccessIterator1
, typename _RandomAccessIterator2
, typename _RandomAccessIterator3
,
1186 typename _Compare
, typename _LeafMerge
>
1187 class __merge_func_static
1189 _RandomAccessIterator1 _M_xs
, _M_xe
;
1190 _RandomAccessIterator2 _M_ys
, _M_ye
;
1191 _RandomAccessIterator3 _M_zs
;
1193 _LeafMerge _M_leaf_merge
;
1196 __merge_func_static(_RandomAccessIterator1 __xs
, _RandomAccessIterator1 __xe
, _RandomAccessIterator2 __ys
,
1197 _RandomAccessIterator2 __ye
, _RandomAccessIterator3 __zs
, _Compare __comp
,
1198 _LeafMerge __leaf_merge
)
1199 : _M_xs(__xs
), _M_xe(__xe
), _M_ys(__ys
), _M_ye(__ye
), _M_zs(__zs
), _M_comp(__comp
), _M_leaf_merge(__leaf_merge
)
1204 operator()(__task
* __self
);
1207 //TODO: consider usage of parallel_for with a custom blocked_range
1208 template <typename _RandomAccessIterator1
, typename _RandomAccessIterator2
, typename _RandomAccessIterator3
,
1209 typename __M_Compare
, typename _LeafMerge
>
1211 __merge_func_static
<_RandomAccessIterator1
, _RandomAccessIterator2
, _RandomAccessIterator3
, __M_Compare
, _LeafMerge
>::
1212 operator()(__task
* __self
)
1214 typedef typename
std::iterator_traits
<_RandomAccessIterator1
>::difference_type _DifferenceType1
;
1215 typedef typename
std::iterator_traits
<_RandomAccessIterator2
>::difference_type _DifferenceType2
;
1216 typedef typename
std::common_type
<_DifferenceType1
, _DifferenceType2
>::type _SizeType
;
1217 const _SizeType __n
= (_M_xe
- _M_xs
) + (_M_ye
- _M_ys
);
1218 const _SizeType __merge_cut_off
= _PSTL_MERGE_CUT_OFF
;
1219 if (__n
<= __merge_cut_off
)
1221 _M_leaf_merge(_M_xs
, _M_xe
, _M_ys
, _M_ye
, _M_zs
, _M_comp
);
1225 _RandomAccessIterator1 __xm
;
1226 _RandomAccessIterator2 __ym
;
1227 if (_M_xe
- _M_xs
< _M_ye
- _M_ys
)
1229 __ym
= _M_ys
+ (_M_ye
- _M_ys
) / 2;
1230 __xm
= std::upper_bound(_M_xs
, _M_xe
, *__ym
, _M_comp
);
1234 __xm
= _M_xs
+ (_M_xe
- _M_xs
) / 2;
1235 __ym
= std::lower_bound(_M_ys
, _M_ye
, *__xm
, _M_comp
);
1237 const _RandomAccessIterator3 __zm
= _M_zs
+ ((__xm
- _M_xs
) + (__ym
- _M_ys
));
1238 auto __right
= __self
->make_additional_child_of(
1239 __self
->parent(), __merge_func_static(__xm
, _M_xe
, __ym
, _M_ye
, __zm
, _M_comp
, _M_leaf_merge
));
1240 __self
->spawn(__right
);
1241 __self
->recycle_as_continuation();
1248 template <class _ExecutionPolicy
, typename _RandomAccessIterator1
, typename _RandomAccessIterator2
,
1249 typename _RandomAccessIterator3
, typename _Compare
, typename _LeafMerge
>
1251 __parallel_merge(__pstl::__internal::__tbb_backend_tag
, _ExecutionPolicy
&&, _RandomAccessIterator1 __xs
,
1252 _RandomAccessIterator1 __xe
, _RandomAccessIterator2 __ys
, _RandomAccessIterator2 __ye
,
1253 _RandomAccessIterator3 __zs
, _Compare __comp
, _LeafMerge __leaf_merge
)
1255 typedef typename
std::iterator_traits
<_RandomAccessIterator1
>::difference_type _DifferenceType1
;
1256 typedef typename
std::iterator_traits
<_RandomAccessIterator2
>::difference_type _DifferenceType2
;
1257 typedef typename
std::common_type
<_DifferenceType1
, _DifferenceType2
>::type _SizeType
;
1258 const _SizeType __n
= (__xe
- __xs
) + (__ye
- __ys
);
1259 const _SizeType __merge_cut_off
= _PSTL_MERGE_CUT_OFF
;
1260 if (__n
<= __merge_cut_off
)
1262 // Fall back on serial merge
1263 __leaf_merge(__xs
, __xe
, __ys
, __ye
, __zs
, __comp
);
1267 tbb::this_task_arena::isolate([=]() {
1268 typedef __merge_func_static
<_RandomAccessIterator1
, _RandomAccessIterator2
, _RandomAccessIterator3
,
1269 _Compare
, _LeafMerge
>
1271 __root_task
<_TaskType
> __root
{__xs
, __xe
, __ys
, __ye
, __zs
, __comp
, __leaf_merge
};
1272 __task::spawn_root_and_wait(__root
);
1277 //------------------------------------------------------------------------
1279 //------------------------------------------------------------------------
1280 template <class _ExecutionPolicy
, typename _F1
, typename _F2
>
1282 __parallel_invoke(__pstl::__internal::__tbb_backend_tag
, _ExecutionPolicy
&&, _F1
&& __f1
, _F2
&& __f2
)
1284 //TODO: a version of tbb::this_task_arena::isolate with variadic arguments pack should be added in the future
1285 tbb::this_task_arena::isolate([&]() { tbb::parallel_invoke(std::forward
<_F1
>(__f1
), std::forward
<_F2
>(__f2
)); });
1288 } // namespace __tbb_backend
1289 } // namespace __pstl
1291 #endif /* _PSTL_PARALLEL_BACKEND_TBB_H */