Viskores  1.0
DeviceAdapterAlgorithmCuda.h
Go to the documentation of this file.
1 //============================================================================
2 // The contents of this file are covered by the Viskores license. See
3 // LICENSE.txt for details.
4 //
5 // By contributing to this file, all contributors agree to the Developer
6 // Certificate of Origin Version 1.1 (DCO 1.1) as stated in DCO.txt.
7 //============================================================================
8 
9 //============================================================================
10 // Copyright (c) Kitware, Inc.
11 // All rights reserved.
12 // See LICENSE.txt for details.
13 //
14 // This software is distributed WITHOUT ANY WARRANTY; without even
15 // the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
16 // PURPOSE. See the above copyright notice for more information.
17 //============================================================================
18 #ifndef viskores_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
19 #define viskores_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
20 
21 #include <viskores/Math.h>
22 #include <viskores/TypeTraits.h>
23 #include <viskores/Types.h>
25 
28 #include <viskores/cont/BitField.h>
31 #include <viskores/cont/Logging.h>
32 #include <viskores/cont/Token.h>
34 
36 
44 
47 
48 // Disable warnings we check viskores for but Thrust does not.
51 //needs to be first
52 #include <viskores/exec/cuda/internal/ExecutionPolicy.h>
53 
54 #include <cooperative_groups.h>
55 #include <cuda.h>
56 #include <thrust/advance.h>
57 #include <thrust/binary_search.h>
58 #include <thrust/copy.h>
59 #include <thrust/count.h>
60 #include <thrust/iterator/counting_iterator.h>
61 #include <thrust/scan.h>
62 #include <thrust/sort.h>
63 #include <thrust/system/cpp/memory.h>
64 #include <thrust/system/cuda/vector.h>
65 #include <thrust/unique.h>
66 
68 
69 #include <limits>
70 #include <memory>
71 
72 namespace viskores
73 {
74 namespace cont
75 {
76 namespace cuda
77 {
78 
89 struct VISKORES_CONT_EXPORT ScheduleParameters
90 {
93 
96 
99 };
100 
135 VISKORES_CONT_EXPORT void InitScheduleParameters(
136  viskores::cont::cuda::ScheduleParameters (*)(char const* name,
137  int major,
138  int minor,
139  int multiProcessorCount,
140  int maxThreadsPerMultiProcessor,
141  int maxThreadsPerBlock));
142 
143 namespace internal
144 {
145 
146 #if (defined(VISKORES_GCC) || defined(VISKORES_CLANG))
147 #pragma GCC diagnostic push
148 #pragma GCC diagnostic ignored "-Wunused-parameter"
149 #endif
150 
151 template <typename TaskType>
152 __global__ void TaskStrided1DLaunch(TaskType task, viskores::Id size)
153 {
154  //see https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
155  //for why our inc is grid-stride
156  const viskores::Id start = blockIdx.x * blockDim.x + threadIdx.x;
157  const viskores::Id inc = blockDim.x * gridDim.x;
158  task(start, size, inc);
159 }
160 
161 template <typename TaskType>
162 __global__ void TaskStrided3DLaunch(TaskType task, viskores::Id3 size)
163 {
164  //This is the 3D version of executing in a grid-stride manner
165  const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
166  blockIdx.y * blockDim.y + threadIdx.y,
167  blockIdx.z * blockDim.z + threadIdx.z);
168  const dim3 inc(blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z);
169 
170  for (viskores::Id k = start.z; k < size[2]; k += inc.z)
171  {
172  for (viskores::Id j = start.y; j < size[1]; j += inc.y)
173  {
174  task(size, start.x, size[0], inc.x, j, k);
175  }
176  }
177 }
178 
179 template <typename T, typename BinaryOperationType>
180 __global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
181 {
182  result = binary_op(a, b);
183 }
184 
185 #if (defined(VISKORES_GCC) || defined(VISKORES_CLANG))
186 #pragma GCC diagnostic pop
187 #endif
188 
189 template <typename FunctorType, typename ArgType>
190 struct FunctorSupportsUnaryImpl
191 {
192  template <typename F, typename A, typename = decltype(std::declval<F>()(std::declval<A>()))>
193  static std::true_type has(int);
194  template <typename F, typename A>
195  static std::false_type has(...);
196  using type = decltype(has<FunctorType, ArgType>(0));
197 };
198 template <typename FunctorType, typename ArgType>
199 using FunctorSupportsUnary = typename FunctorSupportsUnaryImpl<FunctorType, ArgType>::type;
200 
201 template <typename PortalType,
202  typename BinaryAndUnaryFunctor,
203  typename = FunctorSupportsUnary<BinaryAndUnaryFunctor, typename PortalType::ValueType>>
204 struct CastPortal;
205 
206 template <typename PortalType, typename BinaryAndUnaryFunctor>
207 struct CastPortal<PortalType, BinaryAndUnaryFunctor, std::true_type>
208 {
209  using InputType = typename PortalType::ValueType;
210  using ValueType = decltype(std::declval<BinaryAndUnaryFunctor>()(std::declval<InputType>()));
211 
212  PortalType Portal;
213  BinaryAndUnaryFunctor Functor;
214 
216  CastPortal(const PortalType& portal, const BinaryAndUnaryFunctor& functor)
217  : Portal(portal)
218  , Functor(functor)
219  {
220  }
221 
223  viskores::Id GetNumberOfValues() const { return this->Portal.GetNumberOfValues(); }
224 
226  ValueType Get(viskores::Id index) const { return this->Functor(this->Portal.Get(index)); }
227 };
228 
229 template <typename PortalType, typename BinaryFunctor>
230 struct CastPortal<PortalType, BinaryFunctor, std::false_type>
231 {
232  using InputType = typename PortalType::ValueType;
233  using ValueType =
234  decltype(std::declval<BinaryFunctor>()(std::declval<InputType>(), std::declval<InputType>()));
235 
236  PortalType Portal;
237 
239  CastPortal(const PortalType& portal, const BinaryFunctor&)
240  : Portal(portal)
241  {
242  }
243 
245  viskores::Id GetNumberOfValues() const { return this->Portal.GetNumberOfValues(); }
246 
248  ValueType Get(viskores::Id index) const
249  {
250  return static_cast<ValueType>(this->Portal.Get(index));
251  }
252 };
253 
254 struct CudaFreeFunctor
255 {
256  void operator()(void* ptr) const { VISKORES_CUDA_CALL(cudaFree(ptr)); }
257 };
258 
259 template <typename T>
260 using CudaUniquePtr = std::unique_ptr<T, CudaFreeFunctor>;
261 
262 template <typename T>
263 CudaUniquePtr<T> make_CudaUniquePtr(std::size_t numElements)
264 {
265  T* ptr;
266  VISKORES_CUDA_CALL(cudaMalloc(&ptr, sizeof(T) * numElements));
267  return CudaUniquePtr<T>(ptr);
268 }
269 }
270 } // end namespace cuda::internal
271 
272 template <>
274  : viskores::cont::internal::DeviceAdapterAlgorithmGeneral<
275  viskores::cont::DeviceAdapterAlgorithm<viskores::cont::DeviceAdapterTagCuda>,
276  viskores::cont::DeviceAdapterTagCuda>
277 {
278 // Because of some funny code conversions in nvcc, kernels for devices have to
279 // be public.
280 #ifndef VISKORES_CUDA
281 private:
282 #endif
283 
284  using Superclass = viskores::cont::internal::DeviceAdapterAlgorithmGeneral<
287 
288  template <typename BitsPortal, typename IndicesPortal, typename GlobalPopCountType>
289  struct BitFieldToUnorderedSetFunctor : public viskores::exec::FunctorBase
290  {
292  VISKORES_PASS_COMMAS(std::is_same<GlobalPopCountType, viskores::Int32>::value ||
293  std::is_same<GlobalPopCountType, viskores::UInt32>::value ||
294  std::is_same<GlobalPopCountType, viskores::UInt64>::value),
295  "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
296 
297  //Using typename BitsPortal::WordTypePreferred causes dependent type errors using GCC 4.8.5
298  //which is the GCC required compiler for CUDA 9.2 on summit/power9
300 
302  VISKORES_PASS_COMMAS(std::is_same<typename IndicesPortal::ValueType, viskores::Id>::value));
303 
305  BitFieldToUnorderedSetFunctor(const BitsPortal& input,
306  const IndicesPortal& output,
307  GlobalPopCountType* globalPopCount)
308  : Input{ input }
309  , Output{ output }
310  , GlobalPopCount{ globalPopCount }
311  , FinalWordIndex{ input.GetNumberOfWords() - 1 }
312  , FinalWordMask(input.GetFinalWordMask())
313  {
314  }
315 
317 
319  {
320  assert(this->GlobalPopCount != nullptr);
321  VISKORES_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, sizeof(GlobalPopCountType)));
322  }
323 
325  __device__ void operator()(viskores::Id wordIdx) const
326  {
327  Word word = this->Input.GetWord(wordIdx);
328 
329  // The last word may be partial -- mask out trailing bits if needed.
330  const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~Word{ 0 };
331 
332  word &= mask;
333 
334  if (word != 0)
335  {
336  this->LocalPopCount = viskores::CountSetBits(word);
337  this->ReduceAllocate();
338 
339  viskores::Id firstBitIdx = wordIdx * sizeof(Word) * CHAR_BIT;
340  do
341  {
342  // Find next bit. FindFirstSetBit's result is indexed starting at 1.
344  viskores::Id outIdx = this->GetNextOutputIndex();
345  // Write index of bit
346  this->Output.Set(outIdx, firstBitIdx + bit);
347  word ^= (1 << bit); // clear bit
348  } while (word != 0); // have bits
349  }
350  }
351 
353  {
354  assert(this->GlobalPopCount != nullptr);
355  GlobalPopCountType result;
356  VISKORES_CUDA_CALL(cudaMemcpy(
357  &result, this->GlobalPopCount, sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
358  return static_cast<viskores::Id>(result);
359  }
360 
361  private:
362  // Every thread with a non-zero local popcount calls this function, which
363  // computes the total popcount for the coalesced threads and allocates
364  // a contiguous block in the output by atomically increasing the global
365  // popcount.
367  __device__ void ReduceAllocate() const
368  {
369  const auto activeLanes = cooperative_groups::coalesced_threads();
370  const int activeRank = activeLanes.thread_rank();
371  const int activeSize = activeLanes.size();
372 
373  // Reduction value:
374  viskores::Int32 rVal = this->LocalPopCount;
375  for (int delta = 1; delta < activeSize; delta *= 2)
376  {
377  const viskores::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
378  if (activeRank + delta < activeSize)
379  {
380  rVal += shflVal;
381  }
382  }
383 
384  if (activeRank == 0)
385  {
386  this->AllocationHead =
387  atomicAdd(this->GlobalPopCount, static_cast<GlobalPopCountType>(rVal));
388  }
389 
390  this->AllocationHead = activeLanes.shfl(this->AllocationHead, 0);
391  }
392 
393  // The global output allocation is written to by striding the writes across
394  // the warp lanes, allowing the writes to global memory to be coalesced.
396  __device__ viskores::Id GetNextOutputIndex() const
397  {
398  // Only lanes with unwritten output indices left will call this method,
399  // so just check the coalesced threads:
400  const auto activeLanes = cooperative_groups::coalesced_threads();
401  const int activeRank = activeLanes.thread_rank();
402  const int activeSize = activeLanes.size();
403 
404  viskores::Id nextIdx = static_cast<viskores::Id>(this->AllocationHead + activeRank);
405  this->AllocationHead += activeSize;
406 
407  return nextIdx;
408  }
409 
410  const BitsPortal Input;
411  const IndicesPortal Output;
412  GlobalPopCountType* GlobalPopCount;
413  mutable viskores::UInt64 AllocationHead{ 0 };
414  mutable viskores::Int32 LocalPopCount{ 0 };
415  // Used to mask trailing bits the in last word.
416  viskores::Id FinalWordIndex{ 0 };
417  Word FinalWordMask{ 0 };
418  };
419 
420  template <class InputPortal, class OutputPortal>
421  VISKORES_CONT static void CopyPortal(const InputPortal& input, const OutputPortal& output)
422  {
423  try
424  {
425  ::thrust::copy(ThrustCudaPolicyPerThread,
426  cuda::internal::IteratorBegin(input),
427  cuda::internal::IteratorEnd(input),
428  cuda::internal::IteratorBegin(output));
429  }
430  catch (...)
431  {
432  cuda::internal::throwAsViskoresException();
433  }
434  }
435 
436  template <class ValueIterator, class StencilPortal, class OutputPortal, class UnaryPredicate>
437  VISKORES_CONT static viskores::Id CopyIfPortal(ValueIterator valuesBegin,
438  ValueIterator valuesEnd,
439  StencilPortal stencil,
440  OutputPortal output,
441  UnaryPredicate unary_predicate)
442  {
443  auto outputBegin = cuda::internal::IteratorBegin(output);
444 
445  using ValueType = typename StencilPortal::ValueType;
446 
447  viskores::exec::cuda::internal::WrappedUnaryPredicate<ValueType, UnaryPredicate> up(
448  unary_predicate);
449 
450  try
451  {
452  auto newLast = ::thrust::copy_if(ThrustCudaPolicyPerThread,
453  valuesBegin,
454  valuesEnd,
455  cuda::internal::IteratorBegin(stencil),
456  outputBegin,
457  up);
458  return static_cast<viskores::Id>(::thrust::distance(outputBegin, newLast));
459  }
460  catch (...)
461  {
462  cuda::internal::throwAsViskoresException();
463  return viskores::Id(0);
464  }
465  }
466 
467  template <class ValuePortal, class StencilPortal, class OutputPortal, class UnaryPredicate>
468  VISKORES_CONT static viskores::Id CopyIfPortal(ValuePortal values,
469  StencilPortal stencil,
470  OutputPortal output,
471  UnaryPredicate unary_predicate)
472  {
473  return CopyIfPortal(cuda::internal::IteratorBegin(values),
474  cuda::internal::IteratorEnd(values),
475  stencil,
476  output,
477  unary_predicate);
478  }
479 
480  template <class InputPortal, class OutputPortal>
481  VISKORES_CONT static void CopySubRangePortal(const InputPortal& input,
482  viskores::Id inputOffset,
483  viskores::Id size,
484  const OutputPortal& output,
485  viskores::Id outputOffset)
486  {
487  try
488  {
489  ::thrust::copy_n(ThrustCudaPolicyPerThread,
490  cuda::internal::IteratorBegin(input) + inputOffset,
491  static_cast<std::size_t>(size),
492  cuda::internal::IteratorBegin(output) + outputOffset);
493  }
494  catch (...)
495  {
496  cuda::internal::throwAsViskoresException();
497  }
498  }
499 
500 
501  template <typename BitsPortal, typename GlobalPopCountType>
502  struct CountSetBitsFunctor : public viskores::exec::FunctorBase
503  {
505  VISKORES_PASS_COMMAS(std::is_same<GlobalPopCountType, viskores::Int32>::value ||
506  std::is_same<GlobalPopCountType, viskores::UInt32>::value ||
507  std::is_same<GlobalPopCountType, viskores::UInt64>::value),
508  "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
509 
510  //Using typename BitsPortal::WordTypePreferred causes dependent type errors using GCC 4.8.5
511  //which is the GCC required compiler for CUDA 9.2 on summit/power9
513 
515  CountSetBitsFunctor(const BitsPortal& portal, GlobalPopCountType* globalPopCount)
516  : Portal{ portal }
517  , GlobalPopCount{ globalPopCount }
518  , FinalWordIndex{ portal.GetNumberOfWords() - 1 }
519  , FinalWordMask{ portal.GetFinalWordMask() }
520  {
521  }
522 
524 
526  {
527  assert(this->GlobalPopCount != nullptr);
528  VISKORES_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, sizeof(GlobalPopCountType)));
529  }
530 
532  __device__ void operator()(viskores::Id wordIdx) const
533  {
534  Word word = this->Portal.GetWord(wordIdx);
535 
536  // The last word may be partial -- mask out trailing bits if needed.
537  const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~Word{ 0 };
538 
539  word &= mask;
540 
541  if (word != 0)
542  {
543  this->LocalPopCount = viskores::CountSetBits(word);
544  this->Reduce();
545  }
546  }
547 
549  {
550  assert(this->GlobalPopCount != nullptr);
551  GlobalPopCountType result;
552  VISKORES_CUDA_CALL(cudaMemcpy(
553  &result, this->GlobalPopCount, sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
554  return static_cast<viskores::Id>(result);
555  }
556 
557  private:
558  // Every thread with a non-zero local popcount calls this function, which
559  // computes the total popcount for the coalesced threads and atomically
560  // increasing the global popcount.
562  __device__ void Reduce() const
563  {
564  const auto activeLanes = cooperative_groups::coalesced_threads();
565  const int activeRank = activeLanes.thread_rank();
566  const int activeSize = activeLanes.size();
567 
568  // Reduction value:
569  viskores::Int32 rVal = this->LocalPopCount;
570  for (int delta = 1; delta < activeSize; delta *= 2)
571  {
572  const viskores::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
573  if (activeRank + delta < activeSize)
574  {
575  rVal += shflVal;
576  }
577  }
578 
579  if (activeRank == 0)
580  {
581  atomicAdd(this->GlobalPopCount, static_cast<GlobalPopCountType>(rVal));
582  }
583  }
584 
585  const BitsPortal Portal;
586  GlobalPopCountType* GlobalPopCount;
587  mutable viskores::Int32 LocalPopCount{ 0 };
588  // Used to mask trailing bits the in last word.
589  viskores::Id FinalWordIndex{ 0 };
590  Word FinalWordMask{ 0 };
591  };
592 
593  template <class InputPortal, class ValuesPortal, class OutputPortal>
594  VISKORES_CONT static void LowerBoundsPortal(const InputPortal& input,
595  const ValuesPortal& values,
596  const OutputPortal& output)
597  {
598  using ValueType = typename ValuesPortal::ValueType;
599  LowerBoundsPortal(input, values, output, ::thrust::less<ValueType>());
600  }
601 
602  template <class InputPortal, class OutputPortal>
603  VISKORES_CONT static void LowerBoundsPortal(const InputPortal& input,
604  const OutputPortal& values_output)
605  {
606  using ValueType = typename InputPortal::ValueType;
607  LowerBoundsPortal(input, values_output, values_output, ::thrust::less<ValueType>());
608  }
609 
610  template <class InputPortal, class ValuesPortal, class OutputPortal, class BinaryCompare>
611  VISKORES_CONT static void LowerBoundsPortal(const InputPortal& input,
612  const ValuesPortal& values,
613  const OutputPortal& output,
614  BinaryCompare binary_compare)
615  {
616  using ValueType = typename InputPortal::ValueType;
617  viskores::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
618  binary_compare);
619 
620  try
621  {
622  ::thrust::lower_bound(ThrustCudaPolicyPerThread,
623  cuda::internal::IteratorBegin(input),
624  cuda::internal::IteratorEnd(input),
625  cuda::internal::IteratorBegin(values),
626  cuda::internal::IteratorEnd(values),
627  cuda::internal::IteratorBegin(output),
628  bop);
629  }
630  catch (...)
631  {
632  cuda::internal::throwAsViskoresException();
633  }
634  }
635 
636  template <class InputPortal, typename T>
637  VISKORES_CONT static T ReducePortal(const InputPortal& input, T initialValue)
638  {
639  return ReducePortal(input, initialValue, ::thrust::plus<T>());
640  }
641 
642  template <class InputPortal, typename T, class BinaryFunctor>
643  VISKORES_CONT static T ReducePortal(const InputPortal& input,
644  T initialValue,
645  BinaryFunctor binary_functor)
646  {
647  using fast_path = std::is_same<typename InputPortal::ValueType, T>;
648  return ReducePortalImpl(input, initialValue, binary_functor, fast_path());
649  }
650 
651  template <class InputPortal, typename T, class BinaryFunctor>
652  VISKORES_CONT static T ReducePortalImpl(const InputPortal& input,
653  T initialValue,
654  BinaryFunctor binary_functor,
655  std::true_type)
656  {
657  //The portal type and the initial value are the same so we can use
658  //the thrust reduction algorithm
659  viskores::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
660 
661  try
662  {
663  return ::thrust::reduce(ThrustCudaPolicyPerThread,
664  cuda::internal::IteratorBegin(input),
665  cuda::internal::IteratorEnd(input),
666  initialValue,
667  bop);
668  }
669  catch (...)
670  {
671  cuda::internal::throwAsViskoresException();
672  }
673 
674  return initialValue;
675  }
676 
677  template <class InputPortal, typename T, class BinaryFunctor>
678  VISKORES_CONT static T ReducePortalImpl(const InputPortal& input,
679  T initialValue,
680  BinaryFunctor binary_functor,
681  std::false_type)
682  {
683  //The portal type and the initial value AREN'T the same type so we have
684  //to a slower approach, where we wrap the input portal inside a cast
685  //portal
686  viskores::cont::cuda::internal::CastPortal<InputPortal, BinaryFunctor> castPortal(
687  input, binary_functor);
688 
689  viskores::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
690 
691  try
692  {
693  return ::thrust::reduce(ThrustCudaPolicyPerThread,
694  cuda::internal::IteratorBegin(castPortal),
695  cuda::internal::IteratorEnd(castPortal),
696  initialValue,
697  bop);
698  }
699  catch (...)
700  {
701  cuda::internal::throwAsViskoresException();
702  }
703 
704  return initialValue;
705  }
706 
707  template <class KeysPortal,
708  class ValuesPortal,
709  class KeysOutputPortal,
710  class ValueOutputPortal,
711  class BinaryFunctor>
712  VISKORES_CONT static viskores::Id ReduceByKeyPortal(const KeysPortal& keys,
713  const ValuesPortal& values,
714  const KeysOutputPortal& keys_output,
715  const ValueOutputPortal& values_output,
716  BinaryFunctor binary_functor)
717  {
718  auto keys_out_begin = cuda::internal::IteratorBegin(keys_output);
719  auto values_out_begin = cuda::internal::IteratorBegin(values_output);
720 
721  ::thrust::pair<decltype(keys_out_begin), decltype(values_out_begin)> result_iterators;
722 
723  ::thrust::equal_to<typename KeysPortal::ValueType> binaryPredicate;
724 
725  using ValueType = typename ValuesPortal::ValueType;
726  viskores::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(
727  binary_functor);
728 
729  try
730  {
731  result_iterators = ::thrust::reduce_by_key(viskores_cuda_policy(),
732  cuda::internal::IteratorBegin(keys),
733  cuda::internal::IteratorEnd(keys),
734  cuda::internal::IteratorBegin(values),
735  keys_out_begin,
736  values_out_begin,
737  binaryPredicate,
738  bop);
739  }
740  catch (...)
741  {
742  cuda::internal::throwAsViskoresException();
743  }
744 
745  return static_cast<viskores::Id>(::thrust::distance(keys_out_begin, result_iterators.first));
746  }
747 
748  template <class InputPortal, class OutputPortal>
749  VISKORES_CONT static typename InputPortal::ValueType ScanExclusivePortal(
750  const InputPortal& input,
751  const OutputPortal& output)
752  {
753  using ValueType = typename OutputPortal::ValueType;
754 
755  return ScanExclusivePortal(input,
756  output,
757  (::thrust::plus<ValueType>()),
759  }
760 
761  template <class InputPortal, class OutputPortal, class BinaryFunctor>
762  VISKORES_CONT static typename InputPortal::ValueType ScanExclusivePortal(
763  const InputPortal& input,
764  const OutputPortal& output,
765  BinaryFunctor binaryOp,
766  typename InputPortal::ValueType initialValue)
767  {
768  // Use iterator to get value so that thrust device_ptr has chance to handle
769  // data on device.
770  using ValueType = typename OutputPortal::ValueType;
771 
772  //we have size three so that we can store the origin end value, the
773  //new end value, and the sum of those two
774  ::thrust::system::cuda::vector<ValueType> sum(3);
775  try
776  {
777 
778  //store the current value of the last position array in a separate cuda
779  //memory location since the exclusive_scan will overwrite that value
780  //once run
781  ::thrust::copy_n(
782  ThrustCudaPolicyPerThread, cuda::internal::IteratorEnd(input) - 1, 1, sum.begin());
783 
784  viskores::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
785 
786  auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
787  cuda::internal::IteratorBegin(input),
788  cuda::internal::IteratorEnd(input),
789  cuda::internal::IteratorBegin(output),
790  initialValue,
791  bop);
792 
793  //Store the new value for the end of the array. This is done because
794  //with items such as the transpose array it is unsafe to pass the
795  //portal to the SumExclusiveScan
796  ::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);
797 
798  //execute the binaryOp one last time on the device.
799  cuda::internal::SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(
800  sum[0], sum[1], sum[2], bop);
801  }
802  catch (...)
803  {
804  cuda::internal::throwAsViskoresException();
805  }
806  return sum[2];
807  }
808 
809  template <class InputPortal, class OutputPortal>
810  VISKORES_CONT static typename InputPortal::ValueType ScanInclusivePortal(
811  const InputPortal& input,
812  const OutputPortal& output)
813  {
814  using ValueType = typename OutputPortal::ValueType;
815  return ScanInclusivePortal(input, output, ::thrust::plus<ValueType>());
816  }
817 
818  template <class InputPortal, class OutputPortal, class BinaryFunctor>
819  VISKORES_CONT static typename InputPortal::ValueType ScanInclusivePortal(
820  const InputPortal& input,
821  const OutputPortal& output,
822  BinaryFunctor binary_functor)
823  {
824  using ValueType = typename OutputPortal::ValueType;
825  viskores::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(
826  binary_functor);
827 
828  try
829  {
830  ::thrust::system::cuda::vector<ValueType> result(1);
831  auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
832  cuda::internal::IteratorBegin(input),
833  cuda::internal::IteratorEnd(input),
834  cuda::internal::IteratorBegin(output),
835  bop);
836 
837  ::thrust::copy_n(ThrustCudaPolicyPerThread, end - 1, 1, result.begin());
838  return result[0];
839  }
840  catch (...)
841  {
842  cuda::internal::throwAsViskoresException();
843  return typename InputPortal::ValueType();
844  }
845 
846  //return the value at the last index in the array, as that is the sum
847  }
848 
849  template <typename KeysPortal, typename ValuesPortal, typename OutputPortal>
850  VISKORES_CONT static void ScanInclusiveByKeyPortal(const KeysPortal& keys,
851  const ValuesPortal& values,
852  const OutputPortal& output)
853  {
854  using KeyType = typename KeysPortal::ValueType;
855  using ValueType = typename OutputPortal::ValueType;
856  ScanInclusiveByKeyPortal(
857  keys, values, output, ::thrust::equal_to<KeyType>(), ::thrust::plus<ValueType>());
858  }
859 
860  template <typename KeysPortal,
861  typename ValuesPortal,
862  typename OutputPortal,
863  typename BinaryPredicate,
864  typename AssociativeOperator>
865  VISKORES_CONT static void ScanInclusiveByKeyPortal(const KeysPortal& keys,
866  const ValuesPortal& values,
867  const OutputPortal& output,
868  BinaryPredicate binary_predicate,
869  AssociativeOperator binary_operator)
870  {
871  using KeyType = typename KeysPortal::ValueType;
872  viskores::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
873  binary_predicate);
874  using ValueType = typename OutputPortal::ValueType;
875  viskores::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
876  binary_operator);
877 
878  try
879  {
880  ::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
881  cuda::internal::IteratorBegin(keys),
882  cuda::internal::IteratorEnd(keys),
883  cuda::internal::IteratorBegin(values),
884  cuda::internal::IteratorBegin(output),
885  bpred,
886  bop);
887  }
888  catch (...)
889  {
890  cuda::internal::throwAsViskoresException();
891  }
892  }
893 
894  template <typename KeysPortal, typename ValuesPortal, typename OutputPortal>
895  VISKORES_CONT static void ScanExclusiveByKeyPortal(const KeysPortal& keys,
896  const ValuesPortal& values,
897  const OutputPortal& output)
898  {
899  using KeyType = typename KeysPortal::ValueType;
900  using ValueType = typename OutputPortal::ValueType;
901  ScanExclusiveByKeyPortal(keys,
902  values,
903  output,
905  ::thrust::equal_to<KeyType>(),
906  ::thrust::plus<ValueType>());
907  }
908 
909  template <typename KeysPortal,
910  typename ValuesPortal,
911  typename OutputPortal,
912  typename T,
913  typename BinaryPredicate,
914  typename AssociativeOperator>
915  VISKORES_CONT static void ScanExclusiveByKeyPortal(const KeysPortal& keys,
916  const ValuesPortal& values,
917  const OutputPortal& output,
918  T initValue,
919  BinaryPredicate binary_predicate,
920  AssociativeOperator binary_operator)
921  {
922  using KeyType = typename KeysPortal::ValueType;
923  viskores::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
924  binary_predicate);
925  using ValueType = typename OutputPortal::ValueType;
926  viskores::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
927  binary_operator);
928  try
929  {
930  ::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
931  cuda::internal::IteratorBegin(keys),
932  cuda::internal::IteratorEnd(keys),
933  cuda::internal::IteratorBegin(values),
934  cuda::internal::IteratorBegin(output),
935  initValue,
936  bpred,
937  bop);
938  }
939  catch (...)
940  {
941  cuda::internal::throwAsViskoresException();
942  }
943  }
944 
945  template <class ValuesPortal>
946  VISKORES_CONT static void SortPortal(const ValuesPortal& values)
947  {
948  using ValueType = typename ValuesPortal::ValueType;
949  SortPortal(values, ::thrust::less<ValueType>());
950  }
951 
952  template <class ValuesPortal, class BinaryCompare>
953  VISKORES_CONT static void SortPortal(const ValuesPortal& values, BinaryCompare binary_compare)
954  {
955  using ValueType = typename ValuesPortal::ValueType;
956  viskores::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
957  binary_compare);
958  try
959  {
960  ::thrust::sort(viskores_cuda_policy(),
961  cuda::internal::IteratorBegin(values),
962  cuda::internal::IteratorEnd(values),
963  bop);
964  }
965  catch (...)
966  {
967  cuda::internal::throwAsViskoresException();
968  }
969  }
970 
971  template <class KeysPortal, class ValuesPortal>
972  VISKORES_CONT static void SortByKeyPortal(const KeysPortal& keys, const ValuesPortal& values)
973  {
974  using ValueType = typename KeysPortal::ValueType;
975  SortByKeyPortal(keys, values, ::thrust::less<ValueType>());
976  }
977 
978  template <class KeysPortal, class ValuesPortal, class BinaryCompare>
979  VISKORES_CONT static void SortByKeyPortal(const KeysPortal& keys,
980  const ValuesPortal& values,
981  BinaryCompare binary_compare)
982  {
983  using ValueType = typename KeysPortal::ValueType;
984  viskores::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
985  binary_compare);
986  try
987  {
988  ::thrust::sort_by_key(viskores_cuda_policy(),
989  cuda::internal::IteratorBegin(keys),
990  cuda::internal::IteratorEnd(keys),
991  cuda::internal::IteratorBegin(values),
992  bop);
993  }
994  catch (...)
995  {
996  cuda::internal::throwAsViskoresException();
997  }
998  }
999 
1000  template <class ValuesPortal>
1001  VISKORES_CONT static viskores::Id UniquePortal(const ValuesPortal values)
1002  {
1003  try
1004  {
1005  auto begin = cuda::internal::IteratorBegin(values);
1006  auto newLast =
1007  ::thrust::unique(ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values));
1008  return static_cast<viskores::Id>(::thrust::distance(begin, newLast));
1009  }
1010  catch (...)
1011  {
1012  cuda::internal::throwAsViskoresException();
1013  return viskores::Id(0);
1014  }
1015  }
1016 
1017  template <class ValuesPortal, class BinaryCompare>
1018  VISKORES_CONT static viskores::Id UniquePortal(const ValuesPortal values,
1019  BinaryCompare binary_compare)
1020  {
1021  using ValueType = typename ValuesPortal::ValueType;
1022  viskores::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
1023  binary_compare);
1024  try
1025  {
1026  auto begin = cuda::internal::IteratorBegin(values);
1027  auto newLast = ::thrust::unique(
1028  ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values), bop);
1029  return static_cast<viskores::Id>(::thrust::distance(begin, newLast));
1030  }
1031  catch (...)
1032  {
1033  cuda::internal::throwAsViskoresException();
1034  return viskores::Id(0);
1035  }
1036  }
1037 
1038  template <class InputPortal, class ValuesPortal, class OutputPortal>
1039  VISKORES_CONT static void UpperBoundsPortal(const InputPortal& input,
1040  const ValuesPortal& values,
1041  const OutputPortal& output)
1042  {
1043  try
1044  {
1045  ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1046  cuda::internal::IteratorBegin(input),
1047  cuda::internal::IteratorEnd(input),
1048  cuda::internal::IteratorBegin(values),
1049  cuda::internal::IteratorEnd(values),
1050  cuda::internal::IteratorBegin(output));
1051  }
1052  catch (...)
1053  {
1054  cuda::internal::throwAsViskoresException();
1055  }
1056  }
1057 
1058  template <class InputPortal, class ValuesPortal, class OutputPortal, class BinaryCompare>
1059  VISKORES_CONT static void UpperBoundsPortal(const InputPortal& input,
1060  const ValuesPortal& values,
1061  const OutputPortal& output,
1062  BinaryCompare binary_compare)
1063  {
1064  using ValueType = typename OutputPortal::ValueType;
1065 
1066  viskores::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
1067  binary_compare);
1068  try
1069  {
1070  ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1071  cuda::internal::IteratorBegin(input),
1072  cuda::internal::IteratorEnd(input),
1073  cuda::internal::IteratorBegin(values),
1074  cuda::internal::IteratorEnd(values),
1075  cuda::internal::IteratorBegin(output),
1076  bop);
1077  }
1078  catch (...)
1079  {
1080  cuda::internal::throwAsViskoresException();
1081  }
1082  }
1083 
1084  template <class InputPortal, class OutputPortal>
1085  VISKORES_CONT static void UpperBoundsPortal(const InputPortal& input,
1086  const OutputPortal& values_output)
1087  {
1088  try
1089  {
1090  ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1091  cuda::internal::IteratorBegin(input),
1092  cuda::internal::IteratorEnd(input),
1093  cuda::internal::IteratorBegin(values_output),
1094  cuda::internal::IteratorEnd(values_output),
1095  cuda::internal::IteratorBegin(values_output));
1096  }
1097  catch (...)
1098  {
1099  cuda::internal::throwAsViskoresException();
1100  }
1101  }
1102 
1103  template <typename GlobalPopCountType, typename BitsPortal, typename IndicesPortal>
1105  const IndicesPortal& indices)
1106  {
1107  using Functor = BitFieldToUnorderedSetFunctor<BitsPortal, IndicesPortal, GlobalPopCountType>;
1108 
1109  // RAII for the global atomic counter.
1110  auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
1111  Functor functor{ bits, indices, globalCount.get() };
1112 
1113  functor.Initialize();
1114  Schedule(functor, bits.GetNumberOfWords());
1115  Synchronize(); // Ensure kernel is done before checking final atomic count
1116  return functor.Finalize();
1117  }
1118 
1119  template <typename GlobalPopCountType, typename BitsPortal>
1120  VISKORES_CONT static viskores::Id CountSetBitsPortal(const BitsPortal& bits)
1121  {
1122  using Functor = CountSetBitsFunctor<BitsPortal, GlobalPopCountType>;
1123 
1124  // RAII for the global atomic counter.
1125  auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
1126  Functor functor{ bits, globalCount.get() };
1127 
1128  functor.Initialize();
1129  Schedule(functor, bits.GetNumberOfWords());
1130  Synchronize(); // Ensure kernel is done before checking final atomic count
1131  return functor.Finalize();
1132  }
1133 
1134  //-----------------------------------------------------------------------------
1135 
1136 public:
1137  template <typename IndicesStorage>
1139  const viskores::cont::BitField& bits,
1141  {
1143 
1144  viskores::Id numBits = bits.GetNumberOfBits();
1145 
1146  {
1147  viskores::cont::Token token;
1148  auto bitsPortal = bits.PrepareForInput(DeviceAdapterTagCuda{}, token);
1149  auto indicesPortal = indices.PrepareForOutput(numBits, DeviceAdapterTagCuda{}, token);
1150 
1151  // Use a uint64 for accumulator, as atomicAdd does not support signed int64.
1152  numBits = BitFieldToUnorderedSetPortal<viskores::UInt64>(bitsPortal, indicesPortal);
1153  }
1154 
1155  indices.Allocate(numBits, viskores::CopyFlag::On);
1156  return numBits;
1157  }
1158 
1159  template <typename T, typename U, class SIn, class SOut>
1162  {
1164 
1165  const viskores::Id inSize = input.GetNumberOfValues();
1166  if (inSize <= 0)
1167  {
1168  output.Allocate(inSize, viskores::CopyFlag::On);
1169  return;
1170  }
1171  viskores::cont::Token token;
1172  CopyPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1173  output.PrepareForOutput(inSize, DeviceAdapterTagCuda(), token));
1174  }
1175 
1176  template <typename T, typename U, class SIn, class SStencil, class SOut>
1180  {
1182 
1183  viskores::Id size = stencil.GetNumberOfValues();
1184  if (size <= 0)
1185  {
1186  output.Allocate(size, viskores::CopyFlag::On);
1187  return;
1188  }
1189 
1190  viskores::Id newSize;
1191 
1192  {
1193  viskores::cont::Token token;
1194 
1195  newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1196  stencil.PrepareForInput(DeviceAdapterTagCuda(), token),
1197  output.PrepareForOutput(size, DeviceAdapterTagCuda(), token),
1198  ::viskores::NotZeroInitialized()); //yes on the stencil
1199  }
1200 
1201  output.Allocate(newSize, viskores::CopyFlag::On);
1202  }
1203 
1204  template <typename T, typename U, class SIn, class SStencil, class SOut, class UnaryPredicate>
1208  UnaryPredicate unary_predicate)
1209  {
1211 
1212  viskores::Id size = stencil.GetNumberOfValues();
1213  if (size <= 0)
1214  {
1215  output.Allocate(size, viskores::CopyFlag::On);
1216  return;
1217  }
1218 
1219  viskores::Id newSize;
1220 
1221  {
1222  viskores::cont::Token token;
1223  newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1224  stencil.PrepareForInput(DeviceAdapterTagCuda(), token),
1225  output.PrepareForOutput(size, DeviceAdapterTagCuda(), token),
1226  unary_predicate);
1227  }
1228 
1229  output.Allocate(newSize, viskores::CopyFlag::On);
1230  }
1231 
1232  template <typename T, typename U, class SIn, class SOut>
1234  viskores::Id inputStartIndex,
1235  viskores::Id numberOfElementsToCopy,
1237  viskores::Id outputIndex = 0)
1238  {
1240 
1241  const viskores::Id inSize = input.GetNumberOfValues();
1242 
1243  // Check if the ranges overlap and fail if they do.
1244  if (input == output &&
1245  ((outputIndex >= inputStartIndex &&
1246  outputIndex < inputStartIndex + numberOfElementsToCopy) ||
1247  (inputStartIndex >= outputIndex &&
1248  inputStartIndex < outputIndex + numberOfElementsToCopy)))
1249  {
1250  return false;
1251  }
1252 
1253  if (inputStartIndex < 0 || numberOfElementsToCopy < 0 || outputIndex < 0 ||
1254  inputStartIndex >= inSize)
1255  { //invalid parameters
1256  return false;
1257  }
1258 
1259  //determine if the numberOfElementsToCopy needs to be reduced
1260  if (inSize < (inputStartIndex + numberOfElementsToCopy))
1261  { //adjust the size
1262  numberOfElementsToCopy = (inSize - inputStartIndex);
1263  }
1264 
1265  const viskores::Id outSize = output.GetNumberOfValues();
1266  const viskores::Id copyOutEnd = outputIndex + numberOfElementsToCopy;
1267  if (outSize < copyOutEnd)
1268  { //output is not large enough
1269  if (outSize == 0)
1270  { //since output has nothing, just need to allocate to correct length
1271  output.Allocate(copyOutEnd);
1272  }
1273  else
1274  { //we currently have data in this array, so preserve it in the new
1275  //resized array
1277  temp.Allocate(copyOutEnd);
1278  CopySubRange(output, 0, outSize, temp);
1279  output = temp;
1280  }
1281  }
1282  viskores::cont::Token token;
1283  CopySubRangePortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1284  inputStartIndex,
1285  numberOfElementsToCopy,
1286  output.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1287  outputIndex);
1288  return true;
1289  }
1290 
1292  {
1294  viskores::cont::Token token;
1295  auto bitsPortal = bits.PrepareForInput(DeviceAdapterTagCuda{}, token);
1296  // Use a uint64 for accumulator, as atomicAdd does not support signed int64.
1297  return CountSetBitsPortal<viskores::UInt64>(bitsPortal);
1298  }
1299 
1300  template <typename T, class SIn, class SVal, class SOut>
1304  {
1306 
1307  viskores::Id numberOfValues = values.GetNumberOfValues();
1308  viskores::cont::Token token;
1309  LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1310  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1311  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1312  }
1313 
1314  template <typename T, class SIn, class SVal, class SOut, class BinaryCompare>
1318  BinaryCompare binary_compare)
1319  {
1321 
1322  viskores::Id numberOfValues = values.GetNumberOfValues();
1323  viskores::cont::Token token;
1324  LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1325  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1326  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1327  binary_compare);
1328  }
1329 
1330  template <class SIn, class SOut>
1334  {
1336 
1337  viskores::cont::Token token;
1338  LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1339  values_output.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1340  }
1341 
1342  template <typename T, typename U, class SIn>
1343  VISKORES_CONT static U Reduce(const viskores::cont::ArrayHandle<T, SIn>& input, U initialValue)
1344  {
1346 
1347  const viskores::Id numberOfValues = input.GetNumberOfValues();
1348  if (numberOfValues <= 0)
1349  {
1350  return initialValue;
1351  }
1352  viskores::cont::Token token;
1353  return ReducePortal(input.PrepareForInput(DeviceAdapterTagCuda(), token), initialValue);
1354  }
1355 
1356  template <typename T, typename U, class SIn, class BinaryFunctor>
1358  U initialValue,
1359  BinaryFunctor binary_functor)
1360  {
1362 
1363  const viskores::Id numberOfValues = input.GetNumberOfValues();
1364  if (numberOfValues <= 0)
1365  {
1366  return initialValue;
1367  }
1368  viskores::cont::Token token;
1369  return ReducePortal(
1370  input.PrepareForInput(DeviceAdapterTagCuda(), token), initialValue, binary_functor);
1371  }
1372 
1373  // At least some versions of Thrust/nvcc result in compile errors when calling Thrust's
1374  // reduce with sufficiently complex iterators, which can happen with some versions of
1375  // ArrayHandleMultiplexer. Thus, don't use the Thrust version for ArrayHandleMultiplexer.
1376  template <typename T, typename U, typename... SIns>
1379  U initialValue)
1380  {
1381  return Superclass::Reduce(input, initialValue);
1382  }
1383  template <typename T, typename U, typename BinaryFunctor, typename... SIns>
1386  U initialValue,
1387  BinaryFunctor binary_functor)
1388  {
1389  return Superclass::Reduce(input, initialValue, binary_functor);
1390  }
1391 
1392  template <typename T,
1393  typename U,
1394  class KIn,
1395  class VIn,
1396  class KOut,
1397  class VOut,
1398  class BinaryFunctor>
1402  viskores::cont::ArrayHandle<U, VOut>& values_output,
1403  BinaryFunctor binary_functor)
1404  {
1406 
1407  //there is a concern that by default we will allocate too much
1408  //space for the keys/values output. 1 option is to
1409  const viskores::Id numberOfValues = keys.GetNumberOfValues();
1410  if (numberOfValues <= 0)
1411  {
1412  return;
1413  }
1414 
1415  viskores::Id reduced_size;
1416  {
1417  viskores::cont::Token token;
1418  reduced_size = ReduceByKeyPortal(
1419  keys.PrepareForInput(DeviceAdapterTagCuda(), token),
1420  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1421  keys_output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1422  values_output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1423  binary_functor);
1424  }
1425 
1426  keys_output.Allocate(reduced_size, viskores::CopyFlag::On);
1427  values_output.Allocate(reduced_size, viskores::CopyFlag::On);
1428  }
1429 
1430  template <typename T, class SIn, class SOut>
1433  {
1435 
1436  const viskores::Id numberOfValues = input.GetNumberOfValues();
1437  if (numberOfValues <= 0)
1438  {
1439  output.Allocate(0);
1441  }
1442 
1443  //We need call PrepareForInput on the input argument before invoking a
1444  //function. The order of execution of parameters of a function is undefined
1445  //so we need to make sure input is called before output, or else in-place
1446  //use case breaks.
1447  viskores::cont::Token token;
1448  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1449  return ScanExclusivePortal(
1450  inputPortal, output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1451  }
1452 
1453  template <typename T, class SIn, class SOut, class BinaryFunctor>
1456  BinaryFunctor binary_functor,
1457  const T& initialValue)
1458  {
1460 
1461  const viskores::Id numberOfValues = input.GetNumberOfValues();
1462  if (numberOfValues <= 0)
1463  {
1464  output.Allocate(0);
1466  }
1467 
1468  //We need call PrepareForInput on the input argument before invoking a
1469  //function. The order of execution of parameters of a function is undefined
1470  //so we need to make sure input is called before output, or else in-place
1471  //use case breaks.
1472  viskores::cont::Token token;
1473  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1474  return ScanExclusivePortal(
1475  inputPortal,
1476  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1477  binary_functor,
1478  initialValue);
1479  }
1480 
1481  template <typename T, class SIn, class SOut>
1484  {
1486 
1487  const viskores::Id numberOfValues = input.GetNumberOfValues();
1488  if (numberOfValues <= 0)
1489  {
1490  output.Allocate(0);
1492  }
1493 
1494  //We need call PrepareForInput on the input argument before invoking a
1495  //function. The order of execution of parameters of a function is undefined
1496  //so we need to make sure input is called before output, or else in-place
1497  //use case breaks.
1498  viskores::cont::Token token;
1499  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1500  return ScanInclusivePortal(
1501  inputPortal, output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1502  }
1503 
1504  template <typename T, class SIn, class SOut, class BinaryFunctor>
1507  BinaryFunctor binary_functor)
1508  {
1510 
1511  const viskores::Id numberOfValues = input.GetNumberOfValues();
1512  if (numberOfValues <= 0)
1513  {
1514  output.Allocate(0);
1516  }
1517 
1518  //We need call PrepareForInput on the input argument before invoking a
1519  //function. The order of execution of parameters of a function is undefined
1520  //so we need to make sure input is called before output, or else in-place
1521  //use case breaks.
1522  viskores::cont::Token token;
1523  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1524  return ScanInclusivePortal(
1525  inputPortal,
1526  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1527  binary_functor);
1528  }
1529 
1530  template <typename T, typename U, typename KIn, typename VIn, typename VOut>
1534  {
1536 
1537  const viskores::Id numberOfValues = keys.GetNumberOfValues();
1538  if (numberOfValues <= 0)
1539  {
1540  output.Allocate(0);
1541  return;
1542  }
1543 
1544  //We need call PrepareForInput on the input argument before invoking a
1545  //function. The order of execution of parameters of a function is undefined
1546  //so we need to make sure input is called before output, or else in-place
1547  //use case breaks.
1548  viskores::cont::Token token;
1549  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1550  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1551  ScanInclusiveByKeyPortal(
1552  keysPortal,
1553  valuesPortal,
1554  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1555  }
1556 
1557  template <typename T,
1558  typename U,
1559  typename KIn,
1560  typename VIn,
1561  typename VOut,
1562  typename BinaryFunctor>
1566  BinaryFunctor binary_functor)
1567  {
1569 
1570  const viskores::Id numberOfValues = keys.GetNumberOfValues();
1571  if (numberOfValues <= 0)
1572  {
1573  output.Allocate(0);
1574  return;
1575  }
1576 
1577  //We need call PrepareForInput on the input argument before invoking a
1578  //function. The order of execution of parameters of a function is undefined
1579  //so we need to make sure input is called before output, or else in-place
1580  //use case breaks.
1581  viskores::cont::Token token;
1582  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1583  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1584  ScanInclusiveByKeyPortal(keysPortal,
1585  valuesPortal,
1586  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1587  ::thrust::equal_to<T>(),
1588  binary_functor);
1589  }
1590 
1591  template <typename T, typename U, typename KIn, typename VIn, typename VOut>
1595  {
1597 
1598  const viskores::Id numberOfValues = keys.GetNumberOfValues();
1599  if (numberOfValues <= 0)
1600  {
1601  output.Allocate(0);
1602  return;
1603  }
1604 
1605  //We need call PrepareForInput on the input argument before invoking a
1606  //function. The order of execution of parameters of a function is undefined
1607  //so we need to make sure input is called before output, or else in-place
1608  //use case breaks.
1609  viskores::cont::Token token;
1610  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1611  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1612  ScanExclusiveByKeyPortal(keysPortal,
1613  valuesPortal,
1614  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1616  ::thrust::equal_to<T>(),
1617  viskores::Add());
1618  }
1619 
1620  template <typename T,
1621  typename U,
1622  typename KIn,
1623  typename VIn,
1624  typename VOut,
1625  typename BinaryFunctor>
1629  const U& initialValue,
1630  BinaryFunctor binary_functor)
1631  {
1633 
1634  const viskores::Id numberOfValues = keys.GetNumberOfValues();
1635  if (numberOfValues <= 0)
1636  {
1637  output.Allocate(0);
1638  return;
1639  }
1640 
1641  //We need call PrepareForInput on the input argument before invoking a
1642  //function. The order of execution of parameters of a function is undefined
1643  //so we need to make sure input is called before output, or else in-place
1644  //use case breaks.
1645  viskores::cont::Token token;
1646  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1647  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1648  ScanExclusiveByKeyPortal(keysPortal,
1649  valuesPortal,
1650  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1651  initialValue,
1652  ::thrust::equal_to<T>(),
1653  binary_functor);
1654  }
1655 
1656  // we use cuda pinned memory to reduce the amount of synchronization
1657  // and mem copies between the host and device.
1658  struct VISKORES_CONT_EXPORT PinnedErrorArray
1659  {
1660  char* HostPtr = nullptr;
1661  char* DevicePtr = nullptr;
1662  viskores::Id Size = 0;
1663  };
1664 
1665  VISKORES_CONT_EXPORT
1666  static const PinnedErrorArray& GetPinnedErrorArray();
1667 
1668  VISKORES_CONT_EXPORT
1669  static void CheckForErrors(); // throws viskores::cont::ErrorExecution
1670 
1671  VISKORES_CONT_EXPORT
1672  static void SetupErrorBuffer(viskores::exec::cuda::internal::TaskStrided& functor);
1673 
1674  VISKORES_CONT_EXPORT
1675  static void GetBlocksAndThreads(viskores::UInt32& blocks,
1676  viskores::UInt32& threadsPerBlock,
1677  viskores::Id size,
1678  viskores::IdComponent maxThreadsPerBlock);
1679 
1680  VISKORES_CONT_EXPORT
1681  static void GetBlocksAndThreads(viskores::UInt32& blocks,
1682  dim3& threadsPerBlock,
1683  const dim3& size,
1684  viskores::IdComponent maxThreadsPerBlock);
1685 
1686  template <typename... Hints, typename... Args>
1687  static void GetBlocksAndThreads(viskores::cont::internal::HintList<Hints...>, Args&&... args)
1688  {
1689  using ThreadsPerBlock =
1690  viskores::cont::internal::HintFind<viskores::cont::internal::HintList<Hints...>,
1691  viskores::cont::internal::HintThreadsPerBlock<0>,
1693  GetBlocksAndThreads(std::forward<Args>(args)..., ThreadsPerBlock::MaxThreads);
1694  }
1695 
1696  VISKORES_CONT_EXPORT
1697  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
1698  const std::type_info& worklet_info,
1699  viskores::UInt32 blocks,
1700  viskores::UInt32 threadsPerBlock,
1701  viskores::Id size);
1702 
1703  VISKORES_CONT_EXPORT
1704  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
1705  const std::type_info& worklet_info,
1706  viskores::UInt32 blocks,
1707  dim3 threadsPerBlock,
1708  const dim3& size);
1709 
1710 public:
1711  template <typename WType, typename IType, typename Hints>
1712  static void ScheduleTask(
1713  viskores::exec::cuda::internal::TaskStrided1D<WType, IType, Hints>& functor,
1714  viskores::Id numInstances)
1715  {
1717 
1718  VISKORES_ASSERT(numInstances >= 0);
1719  if (numInstances < 1)
1720  {
1721  // No instances means nothing to run. Just return.
1722  return;
1723  }
1724 
1725  CheckForErrors();
1726  SetupErrorBuffer(functor);
1727 
1728  viskores::UInt32 blocks, threadsPerBlock;
1729  GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, numInstances);
1730 
1731 #ifdef VISKORES_ENABLE_LOGGING
1733  {
1734  using FunctorType = std::decay_t<decltype(functor)>;
1735  cudaFuncAttributes empty_kernel_attrs;
1736  VISKORES_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
1737  cuda::internal::TaskStrided1DLaunch<FunctorType>));
1738  LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, numInstances);
1739  }
1740 #endif
1741 
1742  cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
1743  functor, numInstances);
1744  }
1745 
1746  template <typename WType, typename IType, typename Hints>
1747  static void ScheduleTask(
1748  viskores::exec::cuda::internal::TaskStrided3D<WType, IType, Hints>& functor,
1749  viskores::Id3 rangeMax)
1750  {
1752 
1753  VISKORES_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0));
1754  if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1))
1755  {
1756  // No instances means nothing to run. Just return.
1757  return;
1758  }
1759 
1760  CheckForErrors();
1761  SetupErrorBuffer(functor);
1762 
1763  const dim3 ranges(static_cast<viskores::UInt32>(rangeMax[0]),
1764  static_cast<viskores::UInt32>(rangeMax[1]),
1765  static_cast<viskores::UInt32>(rangeMax[2]));
1766 
1767  viskores::UInt32 blocks;
1768  dim3 threadsPerBlock;
1769  GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, ranges);
1770 
1771 #ifdef VISKORES_ENABLE_LOGGING
1773  {
1774  using FunctorType = std::decay_t<decltype(functor)>;
1775  cudaFuncAttributes empty_kernel_attrs;
1776  VISKORES_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
1777  cuda::internal::TaskStrided3DLaunch<FunctorType>));
1778  LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, ranges);
1779  }
1780 #endif
1781 
1782  cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
1783  functor, rangeMax);
1784  }
1785 
1786  template <typename Hints, typename Functor>
1787  VISKORES_CONT static void Schedule(Hints, Functor functor, viskores::Id numInstances)
1788  {
1790 
1791  viskores::exec::cuda::internal::TaskStrided1D<Functor, viskores::internal::NullType, Hints>
1792  kernel(functor);
1793 
1794  ScheduleTask(kernel, numInstances);
1795  }
1796 
1797  template <typename FunctorType>
1798  VISKORES_CONT static inline void Schedule(FunctorType&& functor, viskores::Id numInstances)
1799  {
1800  Schedule(viskores::cont::internal::HintList<>{}, functor, numInstances);
1801  }
1802 
1803  template <typename Hints, typename Functor>
1804  VISKORES_CONT static void Schedule(Hints, Functor functor, const viskores::Id3& rangeMax)
1805  {
1807 
1808  viskores::exec::cuda::internal::TaskStrided3D<Functor, viskores::internal::NullType, Hints>
1809  kernel(functor);
1810  ScheduleTask(kernel, rangeMax);
1811  }
1812 
1813  template <typename FunctorType>
1814  VISKORES_CONT static inline void Schedule(FunctorType&& functor, viskores::Id3 rangeMax)
1815  {
1816  Schedule(viskores::cont::internal::HintList<>{}, functor, rangeMax);
1817  }
1818 
1819  template <typename T, class Storage>
1821  {
1823 
1824  viskores::cont::Token token;
1825  SortPortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1826  }
1827 
1828  template <typename T, class Storage, class BinaryCompare>
1830  BinaryCompare binary_compare)
1831  {
1833 
1834  viskores::cont::Token token;
1835  SortPortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
1836  }
1837 
1838  template <typename T, typename U, class StorageT, class StorageU>
1841  {
1843 
1844  viskores::cont::Token token;
1845  SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1846  values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1847  }
1848 
1849  template <typename T, typename U, class StorageT, class StorageU, class BinaryCompare>
1852  BinaryCompare binary_compare)
1853  {
1855 
1856  viskores::cont::Token token;
1857  SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1858  values.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1859  binary_compare);
1860  }
1861 
1862  template <typename T, class Storage>
1864  {
1866 
1867  viskores::Id newSize;
1868 
1869  {
1870  viskores::cont::Token token;
1871  newSize = UniquePortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1872  }
1873 
1874  values.Allocate(newSize, viskores::CopyFlag::On);
1875  }
1876 
1877  template <typename T, class Storage, class BinaryCompare>
1879  BinaryCompare binary_compare)
1880  {
1882 
1883  viskores::Id newSize;
1884  {
1885  viskores::cont::Token token;
1886  newSize =
1887  UniquePortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
1888  }
1889 
1890  values.Allocate(newSize, viskores::CopyFlag::On);
1891  }
1892 
1893  template <typename T, class SIn, class SVal, class SOut>
1897  {
1899 
1900  viskores::Id numberOfValues = values.GetNumberOfValues();
1901  viskores::cont::Token token;
1902  UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1903  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1904  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1905  }
1906 
1907  template <typename T, class SIn, class SVal, class SOut, class BinaryCompare>
1911  BinaryCompare binary_compare)
1912  {
1914 
1915  viskores::Id numberOfValues = values.GetNumberOfValues();
1916  viskores::cont::Token token;
1917  UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1918  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1919  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1920  binary_compare);
1921  }
1922 
1923  template <class SIn, class SOut>
1927  {
1929 
1930  viskores::cont::Token token;
1931  UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1932  values_output.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1933  }
1934 
1936  {
1938 
1939  VISKORES_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
1940  CheckForErrors();
1941  }
1942 };
1943 
1944 template <>
1946 {
1947 public:
1948  template <typename Hints, typename WorkletType, typename InvocationType>
1949  static viskores::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType, Hints>
1950  MakeTask(WorkletType& worklet, InvocationType& invocation, viskores::Id, Hints = Hints{})
1951  {
1952  return { worklet, invocation };
1953  }
1954 
1955  template <typename Hints, typename WorkletType, typename InvocationType>
1956  static viskores::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType, Hints>
1957  MakeTask(WorkletType& worklet, InvocationType& invocation, viskores::Id3, Hints = Hints{})
1958  {
1959  return { worklet, invocation };
1960  }
1961 
1962  template <typename WorkletType, typename InvocationType, typename RangeType>
1963  VISKORES_CONT static auto MakeTask(WorkletType& worklet,
1964  InvocationType& invocation,
1965  const RangeType& range)
1966  {
1967  return MakeTask<viskores::cont::internal::HintList<>>(worklet, invocation, range);
1968  }
1969 };
1970 }
1971 } // namespace viskores::cont
1972 
1973 #endif //viskores_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CopySubRange
static bool CopySubRange(const viskores::cont::ArrayHandle< T, SIn > &input, viskores::Id inputStartIndex, viskores::Id numberOfElementsToCopy, viskores::cont::ArrayHandle< U, SOut > &output, viskores::Id outputIndex=0)
Definition: DeviceAdapterAlgorithmCuda.h:1233
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScheduleTask
static void ScheduleTask(viskores::exec::cuda::internal::TaskStrided1D< WType, IType, Hints > &functor, viskores::Id numInstances)
Definition: DeviceAdapterAlgorithmCuda.h:1712
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::UpperBounds
static void UpperBounds(const viskores::cont::ArrayHandle< T, SIn > &input, const viskores::cont::ArrayHandle< T, SVal > &values, viskores::cont::ArrayHandle< viskores::Id, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1894
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanInclusiveByKeyPortal
static void ScanInclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:850
viskores::cont::DeviceAdapterAlgorithm
Struct containing device adapter algorithms.
Definition: DeviceAdapterAlgorithm.h:49
ArrayHandle.h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScheduleTask
static void ScheduleTask(viskores::exec::cuda::internal::TaskStrided3D< WType, IType, Hints > &functor, viskores::Id3 rangeMax)
Definition: DeviceAdapterAlgorithmCuda.h:1747
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::LowerBounds
static void LowerBounds(const viskores::cont::ArrayHandle< viskores::Id, SIn > &input, viskores::cont::ArrayHandle< viskores::Id, SOut > &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:1331
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CopyIfPortal
static viskores::Id CopyIfPortal(ValuePortal values, StencilPortal stencil, OutputPortal output, UnaryPredicate unary_predicate)
Definition: DeviceAdapterAlgorithmCuda.h:468
viskores::exec::FunctorBase
Base class for all user worklets invoked in the execution environment from a call to viskores::cont::...
Definition: FunctorBase.h:38
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Superclass
viskores::cont::internal::DeviceAdapterAlgorithmGeneral< viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >, viskores::cont::DeviceAdapterTagCuda > Superclass
Definition: DeviceAdapterAlgorithmCuda.h:286
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Schedule
static void Schedule(Hints, Functor functor, viskores::Id numInstances)
Definition: DeviceAdapterAlgorithmCuda.h:1787
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::SortPortal
static void SortPortal(const ValuesPortal &values)
Definition: DeviceAdapterAlgorithmCuda.h:946
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::UniquePortal
static viskores::Id UniquePortal(const ValuesPortal values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1018
Types.h
viskores::cont::ArrayHandle::PrepareForInput
ReadPortalType PrepareForInput(viskores::cont::DeviceAdapterId device, viskores::cont::Token &token) const
Prepares this array to be used as an input to an operation in the execution environment.
Definition: ArrayHandle.h:615
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ReducePortal
static T ReducePortal(const InputPortal &input, T initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:643
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanInclusivePortal
static InputPortal::ValueType ScanInclusivePortal(const InputPortal &input, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:810
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >
Definition: DeviceAdapterAlgorithmCuda.h:273
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Reduce
static U Reduce(const viskores::cont::ArrayHandle< T, viskores::cont::StorageTagMultiplexer< SIns... >> &input, U initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:1377
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanExclusive
static T ScanExclusive(const viskores::cont::ArrayHandle< T, SIn > &input, viskores::cont::ArrayHandle< T, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1431
viskores::cont::DeviceTaskTypes< viskores::cont::DeviceAdapterTagCuda >::MakeTask
static viskores::exec::cuda::internal::TaskStrided3D< WorkletType, InvocationType, Hints > MakeTask(WorkletType &worklet, InvocationType &invocation, viskores::Id3, Hints=Hints{})
Definition: DeviceAdapterAlgorithmCuda.h:1957
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanInclusiveByKeyPortal
static void ScanInclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output, BinaryPredicate binary_predicate, AssociativeOperator binary_operator)
Definition: DeviceAdapterAlgorithmCuda.h:865
VISKORES_CUDA_CALL
#define VISKORES_CUDA_CALL(command)
A macro that can be wrapped around a CUDA command and will throw an ErrorCuda exception if the CUDA c...
Definition: ErrorCuda.h:46
UnaryPredicates.h
viskores::AtomicTypePreferred
viskores::UInt32 AtomicTypePreferred
The preferred type to use for atomic operations.
Definition: Atomic.h:794
viskores::cont::cuda::ScheduleParameters::three_d_threads_per_block
dim3 three_d_threads_per_block
Definition: DeviceAdapterAlgorithmCuda.h:98
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::UpperBoundsPortal
static void UpperBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:1039
VISKORES_THIRDPARTY_POST_INCLUDE
#define VISKORES_THIRDPARTY_POST_INCLUDE
Definition: Configure.h:200
viskores::TypeTraits::ZeroInitialization
static T ZeroInitialization()
A static function that returns 0 (or the closest equivalent to it) for the given type.
Definition: TypeTraits.h:85
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ReducePortalImpl
static T ReducePortalImpl(const InputPortal &input, T initialValue, BinaryFunctor binary_functor, std::false_type)
Definition: DeviceAdapterAlgorithmCuda.h:678
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::LowerBoundsPortal
static void LowerBoundsPortal(const InputPortal &input, const OutputPortal &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:603
BitField.h
viskores::cont::DeviceAdapterTagCuda
Tag for a device adapter that uses a CUDA capable GPU device.
Definition: DeviceAdapterTagCuda.h:41
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Finalize
viskores::Id Finalize() const
Definition: DeviceAdapterAlgorithmCuda.h:352
DeviceAdapterAlgorithmGeneral.h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanInclusive
static T ScanInclusive(const viskores::cont::ArrayHandle< T, SIn > &input, viskores::cont::ArrayHandle< T, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1482
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::BitFieldToUnorderedSetFunctor
BitFieldToUnorderedSetFunctor(const BitsPortal &input, const IndicesPortal &output, GlobalPopCountType *globalPopCount)
Definition: DeviceAdapterAlgorithmCuda.h:305
viskores::cont::GetStderrLogLevel
viskores::cont::LogLevel GetStderrLogLevel()
Get the active highest log level that will be printed to stderr.
viskores::cont::BitField::GetNumberOfBits
viskores::Id GetNumberOfBits() const
Return the number of bits stored by this BitField.
VISKORES_SUPPRESS_EXEC_WARNINGS
#define VISKORES_SUPPRESS_EXEC_WARNINGS
Definition: ExportMacros.h:61
viskores::cont::ArrayHandle
Manages an array-worth of data.
Definition: ArrayHandle.h:313
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CopyPortal
static void CopyPortal(const InputPortal &input, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:421
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ReduceByKey
static void ReduceByKey(const viskores::cont::ArrayHandle< T, KIn > &keys, const viskores::cont::ArrayHandle< U, VIn > &values, viskores::cont::ArrayHandle< T, KOut > &keys_output, viskores::cont::ArrayHandle< U, VOut > &values_output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1399
viskores::IdComponent
viskores::Int32 IdComponent
Base type to use to index small lists.
Definition: Types.h:202
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::GlobalPopCount
GlobalPopCountType * GlobalPopCount
Definition: DeviceAdapterAlgorithmCuda.h:412
viskores::cont::DeviceAdapterAlgorithm::VIn
static T VIn
Definition: DeviceAdapterAlgorithm.h:360
viskores::FindFirstSetBit
viskores::Int32 FindFirstSetBit(viskores::UInt32 word)
Bitwise operations.
Definition: Math.h:2852
viskores::cont::cuda::ScheduleParameters::one_d_threads_per_block
int one_d_threads_per_block
Definition: DeviceAdapterAlgorithmCuda.h:92
viskores::cont::DeviceAdapterAlgorithm::U
static T U
Definition: DeviceAdapterAlgorithm.h:358
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Reduce
static U Reduce(const viskores::cont::ArrayHandle< T, SIn > &input, U initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1357
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CopyIfPortal
static viskores::Id CopyIfPortal(ValueIterator valuesBegin, ValueIterator valuesEnd, StencilPortal stencil, OutputPortal output, UnaryPredicate unary_predicate)
Definition: DeviceAdapterAlgorithmCuda.h:437
DeviceAdapterAlgorithm.h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::GetNextOutputIndex
__device__ viskores::Id GetNextOutputIndex() const
Definition: DeviceAdapterAlgorithmCuda.h:396
DeviceAdapterTimerImplementationCuda.h
viskores::cont::DeviceAdapterAlgorithm::VOut
static T VOut
Definition: DeviceAdapterAlgorithm.h:361
DeviceAdapterRuntimeDetectorCuda.h
viskores::cont::DeviceAdapterAlgorithm::Reduce
static U Reduce(const viskores::cont::ArrayHandle< T, CIn > &input, U initialValue)
Compute a accumulated sum operation on the input ArrayHandle.
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Output
const IndicesPortal Output
Definition: DeviceAdapterAlgorithmCuda.h:411
viskores::cont::DeviceTaskTypes< viskores::cont::DeviceAdapterTagCuda >::MakeTask
static viskores::exec::cuda::internal::TaskStrided1D< WorkletType, InvocationType, Hints > MakeTask(WorkletType &worklet, InvocationType &invocation, viskores::Id, Hints=Hints{})
Definition: DeviceAdapterAlgorithmCuda.h:1950
viskores::Add
Definition: Types.h:268
ThrustPatches.h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Initialize
void Initialize()
Definition: DeviceAdapterAlgorithmCuda.h:525
DeviceAdapterTagCuda.h
viskores::cont::BitField::PrepareForInput
ReadPortalType PrepareForInput(viskores::cont::DeviceAdapterId device, viskores::cont::Token &token) const
Prepares this BitField to be used as an input to an operation in the execution environment.
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanExclusivePortal
static InputPortal::ValueType ScanExclusivePortal(const InputPortal &input, const OutputPortal &output, BinaryFunctor binaryOp, typename InputPortal::ValueType initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:762
viskores::cont::cuda::InitScheduleParameters
void InitScheduleParameters(viskores::cont::cuda::ScheduleParameters(*)(char const *name, int major, int minor, int multiProcessorCount, int maxThreadsPerMultiProcessor, int maxThreadsPerBlock))
Specify the custom scheduling to use for Viskores CUDA kernel launches.
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::SortByKeyPortal
static void SortByKeyPortal(const KeysPortal &keys, const ValuesPortal &values)
Definition: DeviceAdapterAlgorithmCuda.h:972
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::LowerBounds
static void LowerBounds(const viskores::cont::ArrayHandle< T, SIn > &input, const viskores::cont::ArrayHandle< T, SVal > &values, viskores::cont::ArrayHandle< viskores::Id, SOut > &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1315
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Word
viskores::AtomicTypePreferred Word
Definition: DeviceAdapterAlgorithmCuda.h:299
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanInclusive
static T ScanInclusive(const viskores::cont::ArrayHandle< T, SIn > &input, viskores::cont::ArrayHandle< T, SOut > &output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1505
viskores::cont::DeviceTaskTypes< viskores::cont::DeviceAdapterTagCuda >::MakeTask
static auto MakeTask(WorkletType &worklet, InvocationType &invocation, const RangeType &range)
Definition: DeviceAdapterAlgorithmCuda.h:1963
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::UpperBoundsPortal
static void UpperBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1059
TypeTraits.h
viskores::cont::cuda::ScheduleParameters::three_d_blocks
int three_d_blocks
Definition: DeviceAdapterAlgorithmCuda.h:97
viskores::TypeTraits
The TypeTraits class provides helpful compile-time information about the basic types used in Viskores...
Definition: TypeTraits.h:69
ThrustExceptionHandler.h
viskores::Id
viskores::Int64 Id
Base type to use to index arrays.
Definition: Types.h:235
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ReduceByKeyPortal
static viskores::Id ReduceByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const KeysOutputPortal &keys_output, const ValueOutputPortal &values_output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:712
viskores::cont::DeviceAdapterAlgorithm::CopySubRange
static bool CopySubRange(const viskores::cont::ArrayHandle< T, CIn > &input, viskores::Id inputStartIndex, viskores::Id numberOfElementsToCopy, viskores::cont::ArrayHandle< U, COut > &output, viskores::Id outputIndex=0)
Copy the contents of a section of one ArrayHandle to another.
VISKORES_CONT
#define VISKORES_CONT
Definition: ExportMacros.h:65
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::~BitFieldToUnorderedSetFunctor
~BitFieldToUnorderedSetFunctor()
Definition: DeviceAdapterAlgorithmCuda.h:316
viskores::cont::BitField
Definition: BitField.h:507
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Schedule
static void Schedule(FunctorType &&functor, viskores::Id numInstances)
Definition: DeviceAdapterAlgorithmCuda.h:1798
viskores
Groups connected points that have the same field value.
Definition: Atomic.h:27
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::SortByKey
static void SortByKey(viskores::cont::ArrayHandle< T, StorageT > &keys, viskores::cont::ArrayHandle< U, StorageU > &values)
Definition: DeviceAdapterAlgorithmCuda.h:1839
Math.h
viskores::NotZeroInitialized
Predicate that takes a single argument x, and returns True if it isn't the identity of the Type T.
Definition: UnaryPredicates.h:40
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Unique
static void Unique(viskores::cont::ArrayHandle< T, Storage > &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1878
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBits
static viskores::Id CountSetBits(const viskores::cont::BitField &bits)
Definition: DeviceAdapterAlgorithmCuda.h:1291
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBitsPortal
static viskores::Id CountSetBitsPortal(const BitsPortal &bits)
Definition: DeviceAdapterAlgorithmCuda.h:1120
viskores::cont::ArrayHandle::PrepareForInPlace
WritePortalType PrepareForInPlace(viskores::cont::DeviceAdapterId device, viskores::cont::Token &token) const
Prepares this array to be used in an in-place operation (both as input and output) in the execution e...
Definition: ArrayHandle.h:634
viskores::CopyFlag::On
@ On
ErrorMessageBuffer.h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanExclusiveByKey
static void ScanExclusiveByKey(const viskores::cont::ArrayHandle< T, KIn > &keys, const viskores::cont::ArrayHandle< U, VIn > &values, viskores::cont::ArrayHandle< U, VOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1592
viskores::cont::cuda::ScheduleParameters::one_d_blocks
int one_d_blocks
Definition: DeviceAdapterAlgorithmCuda.h:91
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Finalize
viskores::Id Finalize() const
Definition: DeviceAdapterAlgorithmCuda.h:548
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Portal
const BitsPortal Portal
Definition: DeviceAdapterAlgorithmCuda.h:585
viskores::CountSetBits
viskores::Int32 CountSetBits(viskores::UInt32 word)
Count the total number of bits set in word.
Definition: Math.h:2948
viskores::cont::ArrayHandle::Allocate
void Allocate(viskores::Id numberOfValues, viskores::CopyFlag preserve, viskores::cont::Token &token) const
Allocates an array large enough to hold the given number of values.
Definition: ArrayHandle.h:504
MakeThrustIterator.h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::CountSetBitsFunctor
CountSetBitsFunctor(const BitsPortal &portal, GlobalPopCountType *globalPopCount)
Definition: DeviceAdapterAlgorithmCuda.h:515
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::ReduceAllocate
__device__ void ReduceAllocate() const
Definition: DeviceAdapterAlgorithmCuda.h:367
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::UpperBoundsPortal
static void UpperBoundsPortal(const InputPortal &input, const OutputPortal &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:1085
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CopyIf
static void CopyIf(const viskores::cont::ArrayHandle< U, SIn > &input, const viskores::cont::ArrayHandle< T, SStencil > &stencil, viskores::cont::ArrayHandle< U, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1177
viskores::cont::ArrayHandle::GetNumberOfValues
viskores::Id GetNumberOfValues() const
Returns the number of entries in the array.
Definition: ArrayHandle.h:482
viskores::cont::LogLevel::KernelLaunches
@ KernelLaunches
Details on device-side kernel launches.
viskores::cont::ArrayHandle::PrepareForOutput
WritePortalType PrepareForOutput(viskores::Id numberOfValues, viskores::cont::DeviceAdapterId device, viskores::cont::Token &token) const
Prepares (allocates) this array to be used as an output from an operation in the execution environmen...
Definition: ArrayHandle.h:654
VISKORES_ASSERT
#define VISKORES_ASSERT(condition)
Definition: Assert.h:51
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Initialize
void Initialize()
Definition: DeviceAdapterAlgorithmCuda.h:318
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Input
const BitsPortal Input
Definition: DeviceAdapterAlgorithmCuda.h:410
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSet
static viskores::Id BitFieldToUnorderedSet(const viskores::cont::BitField &bits, viskores::cont::ArrayHandle< Id, IndicesStorage > &indices)
Definition: DeviceAdapterAlgorithmCuda.h:1138
VISKORES_PASS_COMMAS
#define VISKORES_PASS_COMMAS(...)
Definition: Configure.h:372
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::operator()
__device__ void operator()(viskores::Id wordIdx) const
Definition: DeviceAdapterAlgorithmCuda.h:532
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::UniquePortal
static viskores::Id UniquePortal(const ValuesPortal values)
Definition: DeviceAdapterAlgorithmCuda.h:1001
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Word
viskores::AtomicTypePreferred Word
Definition: DeviceAdapterAlgorithmCuda.h:512
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::UpperBounds
static void UpperBounds(const viskores::cont::ArrayHandle< T, SIn > &input, const viskores::cont::ArrayHandle< T, SVal > &values, viskores::cont::ArrayHandle< viskores::Id, SOut > &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1908
viskores::cont::DeviceAdapterAlgorithm::KIn
static T KIn
Definition: DeviceAdapterAlgorithm.h:359
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::GlobalPopCount
GlobalPopCountType * GlobalPopCount
Definition: DeviceAdapterAlgorithmCuda.h:586
ErrorExecution.h
viskores::cont::cuda::ScheduleParameters::two_d_blocks
int two_d_blocks
Definition: DeviceAdapterAlgorithmCuda.h:94
VISKORES_STATIC_ASSERT_MSG
#define VISKORES_STATIC_ASSERT_MSG(condition, message)
Definition: StaticAssert.h:26
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::~CountSetBitsFunctor
~CountSetBitsFunctor()
Definition: DeviceAdapterAlgorithmCuda.h:523
viskores::UInt64
unsigned long long UInt64
Base type to use for 64-bit signed integer numbers.
Definition: Types.h:215
viskores::Int32
int32_t Int32
Base type to use for 32-bit signed integer numbers.
Definition: Types.h:189
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanExclusiveByKey
static void ScanExclusiveByKey(const viskores::cont::ArrayHandle< T, KIn > &keys, const viskores::cont::ArrayHandle< U, VIn > &values, viskores::cont::ArrayHandle< U, VOut > &output, const U &initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1626
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanInclusivePortal
static InputPortal::ValueType ScanInclusivePortal(const InputPortal &input, const OutputPortal &output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:819
viskores::cont::cuda::ScheduleParameters::two_d_threads_per_block
dim3 two_d_threads_per_block
Definition: DeviceAdapterAlgorithmCuda.h:95
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::SortByKeyPortal
static void SortByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:979
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Schedule
static void Schedule(Hints, Functor functor, const viskores::Id3 &rangeMax)
Definition: DeviceAdapterAlgorithmCuda.h:1804
ArrayHandleMultiplexer.h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanExclusive
static T ScanExclusive(const viskores::cont::ArrayHandle< T, SIn > &input, viskores::cont::ArrayHandle< T, SOut > &output, BinaryFunctor binary_functor, const T &initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:1454
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Sort
static void Sort(viskores::cont::ArrayHandle< T, Storage > &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1829
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Copy
static void Copy(const viskores::cont::ArrayHandle< T, SIn > &input, viskores::cont::ArrayHandle< U, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1160
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::UpperBounds
static void UpperBounds(const viskores::cont::ArrayHandle< viskores::Id, SIn > &input, viskores::cont::ArrayHandle< viskores::Id, SOut > &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:1924
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Reduce
static U Reduce(const viskores::cont::ArrayHandle< T, SIn > &input, U initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:1343
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanExclusiveByKeyPortal
static void ScanExclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:895
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Reduce
__device__ void Reduce() const
Definition: DeviceAdapterAlgorithmCuda.h:562
VISKORES_LOG_SCOPE_FUNCTION
#define VISKORES_LOG_SCOPE_FUNCTION(level)
Definition: Logging.h:225
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Sort
static void Sort(viskores::cont::ArrayHandle< T, Storage > &values)
Definition: DeviceAdapterAlgorithmCuda.h:1820
viskores::cont::LogLevel::Perf
@ Perf
General timing data and algorithm flow information, such as filter execution, worklet dispatches,...
viskores::cont::DeviceTaskTypes
Class providing a device-specific support for selecting the optimal Task type for a given worklet.
Definition: DeviceAdapterAlgorithm.h:757
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanExclusivePortal
static InputPortal::ValueType ScanExclusivePortal(const InputPortal &input, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:749
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Unique
static void Unique(viskores::cont::ArrayHandle< T, Storage > &values)
Definition: DeviceAdapterAlgorithmCuda.h:1863
viskores::Get
auto Get(const viskores::Tuple< Ts... > &tuple)
Retrieve the object from a viskores::Tuple at the given index.
Definition: Tuple.h:89
viskores::cont::StorageTagMultiplexer
Definition: ArrayHandleMultiplexer.h:149
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanInclusiveByKey
static void ScanInclusiveByKey(const viskores::cont::ArrayHandle< T, KIn > &keys, const viskores::cont::ArrayHandle< U, VIn > &values, viskores::cont::ArrayHandle< U, VOut > &output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1563
Logging.h
Logging utilities.
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ReducePortalImpl
static T ReducePortalImpl(const InputPortal &input, T initialValue, BinaryFunctor binary_functor, std::true_type)
Definition: DeviceAdapterAlgorithmCuda.h:652
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::operator()
__device__ void operator()(viskores::Id wordIdx) const
Definition: DeviceAdapterAlgorithmCuda.h:325
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanExclusiveByKeyPortal
static void ScanExclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output, T initValue, BinaryPredicate binary_predicate, AssociativeOperator binary_operator)
Definition: DeviceAdapterAlgorithmCuda.h:915
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::SortPortal
static void SortPortal(const ValuesPortal &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:953
ErrorCuda.h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Reduce
static U Reduce(const viskores::cont::ArrayHandle< T, viskores::cont::StorageTagMultiplexer< SIns... >> &input, U initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1384
Token.h
VISKORES_THIRDPARTY_PRE_INCLUDE
#define VISKORES_THIRDPARTY_PRE_INCLUDE
Definition: Configure.h:199
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Schedule
static void Schedule(FunctorType &&functor, viskores::Id3 rangeMax)
Definition: DeviceAdapterAlgorithmCuda.h:1814
viskores::Vec< viskores::Id, 3 >
viskores::cont::Token
A token to hold the scope of an ArrayHandle or other object.
Definition: Token.h:43
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::LowerBounds
static void LowerBounds(const viskores::cont::ArrayHandle< T, SIn > &input, const viskores::cont::ArrayHandle< T, SVal > &values, viskores::cont::ArrayHandle< viskores::Id, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1301
TaskStrided.h
viskores::UInt32
uint32_t UInt32
Base type to use for 32-bit unsigned integer numbers.
Definition: Types.h:193
VISKORES_EXEC
#define VISKORES_EXEC
Definition: ExportMacros.h:59
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CopyIf
static void CopyIf(const viskores::cont::ArrayHandle< U, SIn > &input, const viskores::cont::ArrayHandle< T, SStencil > &stencil, viskores::cont::ArrayHandle< U, SOut > &output, UnaryPredicate unary_predicate)
Definition: DeviceAdapterAlgorithmCuda.h:1205
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ReducePortal
static T ReducePortal(const InputPortal &input, T initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:637
viskores::cont::cuda::ScheduleParameters
Represents how to schedule 1D, 2D, and 3D Cuda kernels.
Definition: DeviceAdapterAlgorithmCuda.h:89
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::Synchronize
static void Synchronize()
Definition: DeviceAdapterAlgorithmCuda.h:1935
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::ScanInclusiveByKey
static void ScanInclusiveByKey(const viskores::cont::ArrayHandle< T, KIn > &keys, const viskores::cont::ArrayHandle< U, VIn > &values, viskores::cont::ArrayHandle< U, VOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1531
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::LowerBoundsPortal
static void LowerBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:594
VISKORES_STATIC_ASSERT
#define VISKORES_STATIC_ASSERT(condition)
Definition: StaticAssert.h:24
WrappedOperators.h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetPortal
static viskores::Id BitFieldToUnorderedSetPortal(const BitsPortal &bits, const IndicesPortal &indices)
Definition: DeviceAdapterAlgorithmCuda.h:1104
viskores::cont::DeviceAdapterAlgorithm::Schedule
static void Schedule(Functor functor, viskores::Id numInstances)
Schedule many instances of a function to run on concurrent threads.
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::CopySubRangePortal
static void CopySubRangePortal(const InputPortal &input, viskores::Id inputOffset, viskores::Id size, const OutputPortal &output, viskores::Id outputOffset)
Definition: DeviceAdapterAlgorithmCuda.h:481
viskores_cont_export.h
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::LowerBoundsPortal
static void LowerBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:611
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::GetBlocksAndThreads
static void GetBlocksAndThreads(viskores::cont::internal::HintList< Hints... >, Args &&... args)
Definition: DeviceAdapterAlgorithmCuda.h:1687
viskores::cont::DeviceAdapterAlgorithm< viskores::cont::DeviceAdapterTagCuda >::SortByKey
static void SortByKey(viskores::cont::ArrayHandle< T, StorageT > &keys, viskores::cont::ArrayHandle< U, StorageU > &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1850