18 #ifndef viskores_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
19 #define viskores_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
52 #include <viskores/exec/cuda/internal/ExecutionPolicy.h>
54 #include <cooperative_groups.h>
56 #include <thrust/advance.h>
57 #include <thrust/binary_search.h>
58 #include <thrust/copy.h>
59 #include <thrust/count.h>
60 #include <thrust/iterator/counting_iterator.h>
61 #include <thrust/scan.h>
62 #include <thrust/sort.h>
63 #include <thrust/system/cpp/memory.h>
64 #include <thrust/system/cuda/vector.h>
65 #include <thrust/unique.h>
139 int multiProcessorCount,
140 int maxThreadsPerMultiProcessor,
141 int maxThreadsPerBlock));
146 #if (defined(VISKORES_GCC) || defined(VISKORES_CLANG))
147 #pragma GCC diagnostic push
148 #pragma GCC diagnostic ignored "-Wunused-parameter"
151 template <
typename TaskType>
152 __global__
void TaskStrided1DLaunch(TaskType task,
viskores::Id size)
156 const viskores::Id start = blockIdx.x * blockDim.x + threadIdx.x;
158 task(start, size, inc);
161 template <
typename TaskType>
162 __global__
void TaskStrided3DLaunch(TaskType task,
viskores::Id3 size)
165 const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
166 blockIdx.y * blockDim.y + threadIdx.y,
167 blockIdx.z * blockDim.z + threadIdx.z);
168 const dim3 inc(blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z);
174 task(size, start.x, size[0], inc.x, j, k);
179 template <
typename T,
typename BinaryOperationType>
180 __global__
void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
182 result = binary_op(a, b);
185 #if (defined(VISKORES_GCC) || defined(VISKORES_CLANG))
186 #pragma GCC diagnostic pop
189 template <
typename FunctorType,
typename ArgType>
190 struct FunctorSupportsUnaryImpl
192 template <typename F, typename A, typename = decltype(std::declval<F>()(std::declval<A>()))>
193 static std::true_type has(
int);
194 template <
typename F,
typename A>
195 static std::false_type has(...);
196 using type = decltype(has<FunctorType, ArgType>(0));
198 template <
typename FunctorType,
typename ArgType>
199 using FunctorSupportsUnary =
typename FunctorSupportsUnaryImpl<FunctorType, ArgType>::type;
201 template <
typename PortalType,
202 typename BinaryAndUnaryFunctor,
203 typename = FunctorSupportsUnary<BinaryAndUnaryFunctor, typename PortalType::ValueType>>
206 template <
typename PortalType,
typename BinaryAndUnaryFunctor>
207 struct CastPortal<PortalType, BinaryAndUnaryFunctor, std::true_type>
209 using InputType =
typename PortalType::ValueType;
210 using ValueType = decltype(std::declval<BinaryAndUnaryFunctor>()(std::declval<InputType>()));
213 BinaryAndUnaryFunctor Functor;
216 CastPortal(
const PortalType& portal,
const BinaryAndUnaryFunctor& functor)
223 viskores::Id GetNumberOfValues()
const {
return this->Portal.GetNumberOfValues(); }
226 ValueType
Get(
viskores::Id index)
const {
return this->Functor(this->Portal.Get(index)); }
229 template <
typename PortalType,
typename BinaryFunctor>
230 struct CastPortal<PortalType, BinaryFunctor, std::false_type>
232 using InputType =
typename PortalType::ValueType;
234 decltype(std::declval<BinaryFunctor>()(std::declval<InputType>(), std::declval<InputType>()));
239 CastPortal(
const PortalType& portal,
const BinaryFunctor&)
245 viskores::Id GetNumberOfValues()
const {
return this->Portal.GetNumberOfValues(); }
250 return static_cast<ValueType
>(this->Portal.Get(index));
254 struct CudaFreeFunctor
259 template <
typename T>
260 using CudaUniquePtr = std::unique_ptr<T, CudaFreeFunctor>;
262 template <
typename T>
263 CudaUniquePtr<T> make_CudaUniquePtr(std::size_t numElements)
267 return CudaUniquePtr<T>(ptr);
274 : viskores::cont::internal::DeviceAdapterAlgorithmGeneral<
275 viskores::cont::DeviceAdapterAlgorithm<viskores::cont::DeviceAdapterTagCuda>,
276 viskores::cont::DeviceAdapterTagCuda>
280 #ifndef VISKORES_CUDA
284 using Superclass = viskores::cont::internal::DeviceAdapterAlgorithmGeneral<
288 template <
typename BitsPortal,
typename IndicesPortal,
typename GlobalPopCountType>
293 std::is_same<GlobalPopCountType, viskores::UInt32>::value ||
294 std::is_same<GlobalPopCountType, viskores::UInt64>::value),
295 "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
306 const IndicesPortal& output,
307 GlobalPopCountType* globalPopCount)
310 , GlobalPopCount{ globalPopCount }
311 , FinalWordIndex{ input.GetNumberOfWords() - 1 }
312 , FinalWordMask(input.GetFinalWordMask())
320 assert(this->GlobalPopCount !=
nullptr);
327 Word word = this->Input.GetWord(wordIdx);
330 const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~
Word{ 0 };
337 this->ReduceAllocate();
346 this->Output.Set(outIdx, firstBitIdx + bit);
354 assert(this->GlobalPopCount !=
nullptr);
355 GlobalPopCountType result;
357 &result, this->GlobalPopCount,
sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
369 const auto activeLanes = cooperative_groups::coalesced_threads();
370 const int activeRank = activeLanes.thread_rank();
371 const int activeSize = activeLanes.size();
375 for (
int delta = 1; delta < activeSize; delta *= 2)
378 if (activeRank + delta < activeSize)
386 this->AllocationHead =
387 atomicAdd(this->GlobalPopCount,
static_cast<GlobalPopCountType
>(rVal));
390 this->AllocationHead = activeLanes.shfl(this->AllocationHead, 0);
400 const auto activeLanes = cooperative_groups::coalesced_threads();
401 const int activeRank = activeLanes.thread_rank();
402 const int activeSize = activeLanes.size();
405 this->AllocationHead += activeSize;
420 template <
class InputPortal,
class OutputPortal>
425 ::thrust::copy(ThrustCudaPolicyPerThread,
426 cuda::internal::IteratorBegin(input),
427 cuda::internal::IteratorEnd(input),
428 cuda::internal::IteratorBegin(output));
432 cuda::internal::throwAsViskoresException();
436 template <
class ValueIterator,
class StencilPortal,
class OutputPortal,
class UnaryPredicate>
438 ValueIterator valuesEnd,
439 StencilPortal stencil,
441 UnaryPredicate unary_predicate)
443 auto outputBegin = cuda::internal::IteratorBegin(output);
445 using ValueType =
typename StencilPortal::ValueType;
447 viskores::exec::cuda::internal::WrappedUnaryPredicate<ValueType, UnaryPredicate> up(
452 auto newLast = ::thrust::copy_if(ThrustCudaPolicyPerThread,
455 cuda::internal::IteratorBegin(stencil),
458 return static_cast<viskores::Id>(::thrust::distance(outputBegin, newLast));
462 cuda::internal::throwAsViskoresException();
467 template <
class ValuePortal,
class StencilPortal,
class OutputPortal,
class UnaryPredicate>
469 StencilPortal stencil,
471 UnaryPredicate unary_predicate)
473 return CopyIfPortal(cuda::internal::IteratorBegin(values),
474 cuda::internal::IteratorEnd(values),
480 template <
class InputPortal,
class OutputPortal>
484 const OutputPortal& output,
489 ::thrust::copy_n(ThrustCudaPolicyPerThread,
490 cuda::internal::IteratorBegin(input) + inputOffset,
491 static_cast<std::size_t
>(size),
492 cuda::internal::IteratorBegin(output) + outputOffset);
496 cuda::internal::throwAsViskoresException();
501 template <
typename BitsPortal,
typename GlobalPopCountType>
506 std::is_same<GlobalPopCountType, viskores::UInt32>::value ||
507 std::is_same<GlobalPopCountType, viskores::UInt64>::value),
508 "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
517 , GlobalPopCount{ globalPopCount }
518 , FinalWordIndex{ portal.GetNumberOfWords() - 1 }
519 , FinalWordMask{ portal.GetFinalWordMask() }
527 assert(this->GlobalPopCount !=
nullptr);
534 Word word = this->Portal.GetWord(wordIdx);
537 const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~
Word{ 0 };
550 assert(this->GlobalPopCount !=
nullptr);
551 GlobalPopCountType result;
553 &result, this->GlobalPopCount,
sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
564 const auto activeLanes = cooperative_groups::coalesced_threads();
565 const int activeRank = activeLanes.thread_rank();
566 const int activeSize = activeLanes.size();
570 for (
int delta = 1; delta < activeSize; delta *= 2)
573 if (activeRank + delta < activeSize)
581 atomicAdd(this->GlobalPopCount,
static_cast<GlobalPopCountType
>(rVal));
593 template <
class InputPortal,
class ValuesPortal,
class OutputPortal>
595 const ValuesPortal& values,
596 const OutputPortal& output)
598 using ValueType =
typename ValuesPortal::ValueType;
599 LowerBoundsPortal(input, values, output, ::thrust::less<ValueType>());
602 template <
class InputPortal,
class OutputPortal>
604 const OutputPortal& values_output)
606 using ValueType =
typename InputPortal::ValueType;
607 LowerBoundsPortal(input, values_output, values_output, ::thrust::less<ValueType>());
610 template <
class InputPortal,
class ValuesPortal,
class OutputPortal,
class BinaryCompare>
612 const ValuesPortal& values,
613 const OutputPortal& output,
614 BinaryCompare binary_compare)
616 using ValueType =
typename InputPortal::ValueType;
617 viskores::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
622 ::thrust::lower_bound(ThrustCudaPolicyPerThread,
623 cuda::internal::IteratorBegin(input),
624 cuda::internal::IteratorEnd(input),
625 cuda::internal::IteratorBegin(values),
626 cuda::internal::IteratorEnd(values),
627 cuda::internal::IteratorBegin(output),
632 cuda::internal::throwAsViskoresException();
636 template <
class InputPortal,
typename T>
639 return ReducePortal(input, initialValue, ::thrust::plus<T>());
642 template <
class InputPortal,
typename T,
class BinaryFunctor>
645 BinaryFunctor binary_functor)
647 using fast_path = std::is_same<typename InputPortal::ValueType, T>;
648 return ReducePortalImpl(input, initialValue, binary_functor, fast_path());
651 template <
class InputPortal,
typename T,
class BinaryFunctor>
654 BinaryFunctor binary_functor,
659 viskores::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
663 return ::thrust::reduce(ThrustCudaPolicyPerThread,
664 cuda::internal::IteratorBegin(input),
665 cuda::internal::IteratorEnd(input),
671 cuda::internal::throwAsViskoresException();
677 template <
class InputPortal,
typename T,
class BinaryFunctor>
680 BinaryFunctor binary_functor,
686 viskores::cont::cuda::internal::CastPortal<InputPortal, BinaryFunctor> castPortal(
687 input, binary_functor);
689 viskores::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
693 return ::thrust::reduce(ThrustCudaPolicyPerThread,
694 cuda::internal::IteratorBegin(castPortal),
695 cuda::internal::IteratorEnd(castPortal),
701 cuda::internal::throwAsViskoresException();
707 template <
class KeysPortal,
709 class KeysOutputPortal,
710 class ValueOutputPortal,
713 const ValuesPortal& values,
714 const KeysOutputPortal& keys_output,
715 const ValueOutputPortal& values_output,
716 BinaryFunctor binary_functor)
718 auto keys_out_begin = cuda::internal::IteratorBegin(keys_output);
719 auto values_out_begin = cuda::internal::IteratorBegin(values_output);
721 ::thrust::pair<decltype(keys_out_begin), decltype(values_out_begin)> result_iterators;
723 ::thrust::equal_to<typename KeysPortal::ValueType> binaryPredicate;
725 using ValueType =
typename ValuesPortal::ValueType;
726 viskores::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(
731 result_iterators = ::thrust::reduce_by_key(viskores_cuda_policy(),
732 cuda::internal::IteratorBegin(keys),
733 cuda::internal::IteratorEnd(keys),
734 cuda::internal::IteratorBegin(values),
742 cuda::internal::throwAsViskoresException();
745 return static_cast<viskores::Id>(::thrust::distance(keys_out_begin, result_iterators.first));
748 template <
class InputPortal,
class OutputPortal>
750 const InputPortal& input,
751 const OutputPortal& output)
753 using ValueType =
typename OutputPortal::ValueType;
755 return ScanExclusivePortal(input,
757 (::thrust::plus<ValueType>()),
761 template <
class InputPortal,
class OutputPortal,
class BinaryFunctor>
763 const InputPortal& input,
764 const OutputPortal& output,
765 BinaryFunctor binaryOp,
766 typename InputPortal::ValueType initialValue)
770 using ValueType =
typename OutputPortal::ValueType;
774 ::thrust::system::cuda::vector<ValueType> sum(3);
782 ThrustCudaPolicyPerThread, cuda::internal::IteratorEnd(input) - 1, 1, sum.begin());
784 viskores::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
786 auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
787 cuda::internal::IteratorBegin(input),
788 cuda::internal::IteratorEnd(input),
789 cuda::internal::IteratorBegin(output),
796 ::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);
799 cuda::internal::SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(
800 sum[0], sum[1], sum[2], bop);
804 cuda::internal::throwAsViskoresException();
809 template <
class InputPortal,
class OutputPortal>
811 const InputPortal& input,
812 const OutputPortal& output)
814 using ValueType =
typename OutputPortal::ValueType;
815 return ScanInclusivePortal(input, output, ::thrust::plus<ValueType>());
818 template <
class InputPortal,
class OutputPortal,
class BinaryFunctor>
820 const InputPortal& input,
821 const OutputPortal& output,
822 BinaryFunctor binary_functor)
824 using ValueType =
typename OutputPortal::ValueType;
825 viskores::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(
830 ::thrust::system::cuda::vector<ValueType> result(1);
831 auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
832 cuda::internal::IteratorBegin(input),
833 cuda::internal::IteratorEnd(input),
834 cuda::internal::IteratorBegin(output),
837 ::thrust::copy_n(ThrustCudaPolicyPerThread, end - 1, 1, result.begin());
842 cuda::internal::throwAsViskoresException();
843 return typename InputPortal::ValueType();
849 template <
typename KeysPortal,
typename ValuesPortal,
typename OutputPortal>
851 const ValuesPortal& values,
852 const OutputPortal& output)
854 using KeyType =
typename KeysPortal::ValueType;
855 using ValueType =
typename OutputPortal::ValueType;
856 ScanInclusiveByKeyPortal(
857 keys, values, output, ::thrust::equal_to<KeyType>(), ::thrust::plus<ValueType>());
860 template <
typename KeysPortal,
861 typename ValuesPortal,
862 typename OutputPortal,
863 typename BinaryPredicate,
864 typename AssociativeOperator>
866 const ValuesPortal& values,
867 const OutputPortal& output,
868 BinaryPredicate binary_predicate,
869 AssociativeOperator binary_operator)
871 using KeyType =
typename KeysPortal::ValueType;
872 viskores::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
874 using ValueType =
typename OutputPortal::ValueType;
875 viskores::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
880 ::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
881 cuda::internal::IteratorBegin(keys),
882 cuda::internal::IteratorEnd(keys),
883 cuda::internal::IteratorBegin(values),
884 cuda::internal::IteratorBegin(output),
890 cuda::internal::throwAsViskoresException();
894 template <
typename KeysPortal,
typename ValuesPortal,
typename OutputPortal>
896 const ValuesPortal& values,
897 const OutputPortal& output)
899 using KeyType =
typename KeysPortal::ValueType;
900 using ValueType =
typename OutputPortal::ValueType;
901 ScanExclusiveByKeyPortal(keys,
905 ::thrust::equal_to<KeyType>(),
906 ::thrust::plus<ValueType>());
909 template <
typename KeysPortal,
910 typename ValuesPortal,
911 typename OutputPortal,
913 typename BinaryPredicate,
914 typename AssociativeOperator>
916 const ValuesPortal& values,
917 const OutputPortal& output,
919 BinaryPredicate binary_predicate,
920 AssociativeOperator binary_operator)
922 using KeyType =
typename KeysPortal::ValueType;
923 viskores::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
925 using ValueType =
typename OutputPortal::ValueType;
926 viskores::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
930 ::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
931 cuda::internal::IteratorBegin(keys),
932 cuda::internal::IteratorEnd(keys),
933 cuda::internal::IteratorBegin(values),
934 cuda::internal::IteratorBegin(output),
941 cuda::internal::throwAsViskoresException();
945 template <
class ValuesPortal>
948 using ValueType =
typename ValuesPortal::ValueType;
949 SortPortal(values, ::thrust::less<ValueType>());
952 template <
class ValuesPortal,
class BinaryCompare>
955 using ValueType =
typename ValuesPortal::ValueType;
956 viskores::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
960 ::thrust::sort(viskores_cuda_policy(),
961 cuda::internal::IteratorBegin(values),
962 cuda::internal::IteratorEnd(values),
967 cuda::internal::throwAsViskoresException();
971 template <
class KeysPortal,
class ValuesPortal>
974 using ValueType =
typename KeysPortal::ValueType;
975 SortByKeyPortal(keys, values, ::thrust::less<ValueType>());
978 template <
class KeysPortal,
class ValuesPortal,
class BinaryCompare>
980 const ValuesPortal& values,
981 BinaryCompare binary_compare)
983 using ValueType =
typename KeysPortal::ValueType;
984 viskores::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
988 ::thrust::sort_by_key(viskores_cuda_policy(),
989 cuda::internal::IteratorBegin(keys),
990 cuda::internal::IteratorEnd(keys),
991 cuda::internal::IteratorBegin(values),
996 cuda::internal::throwAsViskoresException();
1000 template <
class ValuesPortal>
1005 auto begin = cuda::internal::IteratorBegin(values);
1007 ::thrust::unique(ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values));
1008 return static_cast<viskores::Id>(::thrust::distance(begin, newLast));
1012 cuda::internal::throwAsViskoresException();
1017 template <
class ValuesPortal,
class BinaryCompare>
1019 BinaryCompare binary_compare)
1021 using ValueType =
typename ValuesPortal::ValueType;
1022 viskores::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
1026 auto begin = cuda::internal::IteratorBegin(values);
1027 auto newLast = ::thrust::unique(
1028 ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values), bop);
1029 return static_cast<viskores::Id>(::thrust::distance(begin, newLast));
1033 cuda::internal::throwAsViskoresException();
1038 template <
class InputPortal,
class ValuesPortal,
class OutputPortal>
1040 const ValuesPortal& values,
1041 const OutputPortal& output)
1045 ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1046 cuda::internal::IteratorBegin(input),
1047 cuda::internal::IteratorEnd(input),
1048 cuda::internal::IteratorBegin(values),
1049 cuda::internal::IteratorEnd(values),
1050 cuda::internal::IteratorBegin(output));
1054 cuda::internal::throwAsViskoresException();
1058 template <
class InputPortal,
class ValuesPortal,
class OutputPortal,
class BinaryCompare>
1060 const ValuesPortal& values,
1061 const OutputPortal& output,
1062 BinaryCompare binary_compare)
1064 using ValueType =
typename OutputPortal::ValueType;
1066 viskores::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
1070 ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1071 cuda::internal::IteratorBegin(input),
1072 cuda::internal::IteratorEnd(input),
1073 cuda::internal::IteratorBegin(values),
1074 cuda::internal::IteratorEnd(values),
1075 cuda::internal::IteratorBegin(output),
1080 cuda::internal::throwAsViskoresException();
1084 template <
class InputPortal,
class OutputPortal>
1086 const OutputPortal& values_output)
1090 ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1091 cuda::internal::IteratorBegin(input),
1092 cuda::internal::IteratorEnd(input),
1093 cuda::internal::IteratorBegin(values_output),
1094 cuda::internal::IteratorEnd(values_output),
1095 cuda::internal::IteratorBegin(values_output));
1099 cuda::internal::throwAsViskoresException();
1103 template <
typename GlobalPopCountType,
typename BitsPortal,
typename IndicesPortal>
1105 const IndicesPortal& indices)
1107 using Functor = BitFieldToUnorderedSetFunctor<BitsPortal, IndicesPortal, GlobalPopCountType>;
1110 auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
1111 Functor functor{ bits, indices, globalCount.get() };
1113 functor.Initialize();
1114 Schedule(functor, bits.GetNumberOfWords());
1116 return functor.Finalize();
1119 template <
typename GlobalPopCountType,
typename BitsPortal>
1122 using Functor = CountSetBitsFunctor<BitsPortal, GlobalPopCountType>;
1125 auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
1126 Functor functor{ bits, globalCount.get() };
1128 functor.Initialize();
1129 Schedule(functor, bits.GetNumberOfWords());
1131 return functor.Finalize();
1137 template <
typename IndicesStorage>
1152 numBits = BitFieldToUnorderedSetPortal<viskores::UInt64>(bitsPortal, indicesPortal);
1159 template <
typename T,
typename U,
class SIn,
class SOut>
1176 template <
typename T,
typename U,
class SIn,
class SStencil,
class SOut>
1204 template <
typename T,
typename U,
class SIn,
class SStencil,
class SOut,
class UnaryPredicate>
1208 UnaryPredicate unary_predicate)
1232 template <
typename T,
typename U,
class SIn,
class SOut>
1244 if (input == output &&
1245 ((outputIndex >= inputStartIndex &&
1246 outputIndex < inputStartIndex + numberOfElementsToCopy) ||
1247 (inputStartIndex >= outputIndex &&
1248 inputStartIndex < outputIndex + numberOfElementsToCopy)))
1253 if (inputStartIndex < 0 || numberOfElementsToCopy < 0 || outputIndex < 0 ||
1254 inputStartIndex >= inSize)
1260 if (inSize < (inputStartIndex + numberOfElementsToCopy))
1262 numberOfElementsToCopy = (inSize - inputStartIndex);
1266 const viskores::Id copyOutEnd = outputIndex + numberOfElementsToCopy;
1267 if (outSize < copyOutEnd)
1285 numberOfElementsToCopy,
1297 return CountSetBitsPortal<viskores::UInt64>(bitsPortal);
1300 template <
typename T,
class SIn,
class SVal,
class SOut>
1314 template <
typename T,
class SIn,
class SVal,
class SOut,
class BinaryCompare>
1318 BinaryCompare binary_compare)
1330 template <
class SIn,
class SOut>
1342 template <
typename T,
typename U,
class SIn>
1348 if (numberOfValues <= 0)
1350 return initialValue;
1356 template <
typename T,
typename U,
class SIn,
class BinaryFunctor>
1359 BinaryFunctor binary_functor)
1364 if (numberOfValues <= 0)
1366 return initialValue;
1369 return ReducePortal(
1376 template <
typename T,
typename U,
typename... SIns>
1381 return Superclass::Reduce(input, initialValue);
1383 template <
typename T,
typename U,
typename BinaryFunctor,
typename... SIns>
1387 BinaryFunctor binary_functor)
1389 return Superclass::Reduce(input, initialValue, binary_functor);
1392 template <
typename T,
1398 class BinaryFunctor>
1403 BinaryFunctor binary_functor)
1410 if (numberOfValues <= 0)
1418 reduced_size = ReduceByKeyPortal(
1430 template <
typename T,
class SIn,
class SOut>
1437 if (numberOfValues <= 0)
1449 return ScanExclusivePortal(
1453 template <
typename T,
class SIn,
class SOut,
class BinaryFunctor>
1456 BinaryFunctor binary_functor,
1457 const T& initialValue)
1462 if (numberOfValues <= 0)
1474 return ScanExclusivePortal(
1481 template <
typename T,
class SIn,
class SOut>
1488 if (numberOfValues <= 0)
1500 return ScanInclusivePortal(
1504 template <
typename T,
class SIn,
class SOut,
class BinaryFunctor>
1507 BinaryFunctor binary_functor)
1512 if (numberOfValues <= 0)
1524 return ScanInclusivePortal(
1530 template <
typename T,
typename U,
typename KIn,
typename VIn,
typename VOut>
1538 if (numberOfValues <= 0)
1551 ScanInclusiveByKeyPortal(
1557 template <
typename T,
1562 typename BinaryFunctor>
1566 BinaryFunctor binary_functor)
1571 if (numberOfValues <= 0)
1584 ScanInclusiveByKeyPortal(keysPortal,
1587 ::thrust::equal_to<T>(),
1591 template <
typename T,
typename U,
typename KIn,
typename VIn,
typename VOut>
1599 if (numberOfValues <= 0)
1612 ScanExclusiveByKeyPortal(keysPortal,
1616 ::thrust::equal_to<T>(),
1620 template <
typename T,
1625 typename BinaryFunctor>
1629 const U& initialValue,
1630 BinaryFunctor binary_functor)
1635 if (numberOfValues <= 0)
1648 ScanExclusiveByKeyPortal(keysPortal,
1652 ::thrust::equal_to<T>(),
1658 struct VISKORES_CONT_EXPORT PinnedErrorArray
1660 char* HostPtr =
nullptr;
1661 char* DevicePtr =
nullptr;
1665 VISKORES_CONT_EXPORT
1666 static const PinnedErrorArray& GetPinnedErrorArray();
1668 VISKORES_CONT_EXPORT
1669 static void CheckForErrors();
1671 VISKORES_CONT_EXPORT
1672 static void SetupErrorBuffer(viskores::exec::cuda::internal::TaskStrided& functor);
1674 VISKORES_CONT_EXPORT
1680 VISKORES_CONT_EXPORT
1682 dim3& threadsPerBlock,
1686 template <
typename... Hints,
typename... Args>
1689 using ThreadsPerBlock =
1690 viskores::cont::internal::HintFind<viskores::cont::internal::HintList<Hints...>,
1691 viskores::cont::internal::HintThreadsPerBlock<0>,
1693 GetBlocksAndThreads(std::forward<Args>(args)..., ThreadsPerBlock::MaxThreads);
1696 VISKORES_CONT_EXPORT
1697 static void LogKernelLaunch(
const cudaFuncAttributes& func_attrs,
1698 const std::type_info& worklet_info,
1703 VISKORES_CONT_EXPORT
1704 static void LogKernelLaunch(
const cudaFuncAttributes& func_attrs,
1705 const std::type_info& worklet_info,
1707 dim3 threadsPerBlock,
1711 template <
typename WType,
typename IType,
typename H
ints>
1713 viskores::exec::cuda::internal::TaskStrided1D<WType, IType, Hints>& functor,
1719 if (numInstances < 1)
1726 SetupErrorBuffer(functor);
1729 GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, numInstances);
1731 #ifdef VISKORES_ENABLE_LOGGING
1734 using FunctorType = std::decay_t<decltype(functor)>;
1735 cudaFuncAttributes empty_kernel_attrs;
1737 cuda::internal::TaskStrided1DLaunch<FunctorType>));
1738 LogKernelLaunch(empty_kernel_attrs,
typeid(WType), blocks, threadsPerBlock, numInstances);
1742 cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
1743 functor, numInstances);
1746 template <
typename WType,
typename IType,
typename H
ints>
1748 viskores::exec::cuda::internal::TaskStrided3D<WType, IType, Hints>& functor,
1753 VISKORES_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0));
1754 if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1))
1761 SetupErrorBuffer(functor);
1768 dim3 threadsPerBlock;
1769 GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, ranges);
1771 #ifdef VISKORES_ENABLE_LOGGING
1774 using FunctorType = std::decay_t<decltype(functor)>;
1775 cudaFuncAttributes empty_kernel_attrs;
1777 cuda::internal::TaskStrided3DLaunch<FunctorType>));
1778 LogKernelLaunch(empty_kernel_attrs,
typeid(WType), blocks, threadsPerBlock, ranges);
1782 cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
1786 template <
typename H
ints,
typename Functor>
1791 viskores::exec::cuda::internal::TaskStrided1D<Functor, viskores::internal::NullType, Hints>
1794 ScheduleTask(kernel, numInstances);
1797 template <
typename FunctorType>
1800 Schedule(viskores::cont::internal::HintList<>{}, functor, numInstances);
1803 template <
typename H
ints,
typename Functor>
1808 viskores::exec::cuda::internal::TaskStrided3D<Functor, viskores::internal::NullType, Hints>
1810 ScheduleTask(kernel, rangeMax);
1813 template <
typename FunctorType>
1816 Schedule(viskores::cont::internal::HintList<>{}, functor, rangeMax);
1819 template <
typename T,
class Storage>
1828 template <
typename T,
class Storage,
class BinaryCompare>
1830 BinaryCompare binary_compare)
1838 template <
typename T,
typename U,
class StorageT,
class StorageU>
1849 template <
typename T,
typename U,
class StorageT,
class StorageU,
class BinaryCompare>
1852 BinaryCompare binary_compare)
1862 template <
typename T,
class Storage>
1877 template <
typename T,
class Storage,
class BinaryCompare>
1879 BinaryCompare binary_compare)
1893 template <
typename T,
class SIn,
class SVal,
class SOut>
1907 template <
typename T,
class SIn,
class SVal,
class SOut,
class BinaryCompare>
1911 BinaryCompare binary_compare)
1923 template <
class SIn,
class SOut>
1948 template <
typename H
ints,
typename WorkletType,
typename InvocationType>
1949 static viskores::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType, Hints>
1952 return { worklet, invocation };
1955 template <
typename H
ints,
typename WorkletType,
typename InvocationType>
1956 static viskores::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType, Hints>
1959 return { worklet, invocation };
1962 template <
typename WorkletType,
typename InvocationType,
typename RangeType>
1964 InvocationType& invocation,
1965 const RangeType& range)
1967 return MakeTask<viskores::cont::internal::HintList<>>(worklet, invocation, range);
1973 #endif //viskores_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h