18 #ifndef viskores_cont_openmp_internal_FunctorsOpenMP_h
19 #define viskores_cont_openmp_internal_FunctorsOpenMP_h
36 #include <type_traits>
// Helper for VISKORES_OPENMP_DIRECTIVE: stringifies the complete directive
// text and hands the resulting string to the _Pragma operator.
42 #define VISKORES_OPENMP_DIRECTIVE_IMPL(fullDir) _Pragma(#fullDir)
// Prepends `omp` to the given directive, so call sites only spell the part
// that would follow `#pragma omp`.
43 #define VISKORES_OPENMP_DIRECTIVE(dir) VISKORES_OPENMP_DIRECTIVE_IMPL(omp dir)
// Alternate definition that swallows its argument and expands to nothing.
// NOTE(review): the preprocessor conditional that selects between the two
// definitions is not visible in this excerpt.
45 #define VISKORES_OPENMP_DIRECTIVE(directive)
// When building with GCC older than major version 9, the macro swallows its
// arguments and expands to nothing.
// NOTE(review): the reason for excluding those compilers is not stated in the
// visible lines — confirm before changing the version check.
54 #if defined(VISKORES_GCC) && (__GNUC__ < 9)
55 #define VISKORES_OPENMP_SHARED_CONST(...)
// Otherwise the arguments are forwarded verbatim into a `shared(...)` clause.
// NOTE(review): the #else/#endif lines pairing with the #if above are not
// visible in this excerpt.
57 #define VISKORES_OPENMP_SHARED_CONST(...) shared(__VA_ARGS__)
// Compile-time constant for the cache line size.
// NOTE(review): presumably expressed in bytes — confirm against its users.
74 constexpr
static viskores::Id VISKORES_CACHE_LINE_SIZE = 64;
// Integer division that rounds up instead of truncating: adding
// (denominator - 1) to the numerator before dividing makes any non-zero
// remainder bump the quotient by one.
// NOTE(review): the formula assumes non-negative operands and a positive
// denominator, and `numerator + denominator - 1` can overflow when numerator
// is close to the maximum of T.
// NOTE(review): the template header declaring T is not visible in this excerpt.
79 static constexpr T CeilDivide(
const T& numerator,
const T& denominator)
81 return (numerator + denominator - 1) / denominator;
// NOTE(review): the enclosing function's signature is not visible in this
// excerpt; the variable names match the ComputeChunkSize calls made elsewhere
// in this file. numChunks and valuesPerChunk are assigned here without a
// visible declaration, so they are presumably output parameters.
// Number of pages needed to hold bytesIn, rounded up.
95 const viskores::Id pagesIn = CeilDivide(bytesIn, VISKORES_PAGE_SIZE);
// Use chunksPerThread chunks for every thread only when there are more pages
// than that many chunks; otherwise fall back to one chunk per thread.
97 numChunks = (pagesIn > numThreads * chunksPerThread) ? numThreads * chunksPerThread : numThreads;
// Spread the pages over the chunks, rounding up so that no page is dropped.
98 const viskores::Id pagesPerChunk = CeilDivide(pagesIn, numChunks);
// Convert the per-chunk page count into a per-chunk value count, rounding up.
// Because of the two round-ups, numChunks * valuesPerChunk can exceed the
// number of input values.
99 valuesPerChunk = CeilDivide(pagesPerChunk * VISKORES_PAGE_SIZE, bytesPerValue);
102 template <
typename T>
// Exposes the portal's ValueType under the member alias `type`.
// NOTE(review): the declaration that owns this alias (between the template
// header and the `using`) is not visible in this excerpt.
108 template <
typename PortalType>
111 using type =
typename PortalType::ValueType;
114 template <
typename T>
// DoCopy overload selected by a std::true_type tag. The three-argument DoCopy
// in this file builds that tag with std::is_same on the two value types, so
// this overload handles matching value types: a plain std::copy of numVals
// elements from src to dst, with no per-element conversion.
117 template <
typename T,
typename U>
118 static void DoCopy(T src, U dst,
viskores::Id numVals, std::true_type)
122 std::copy(src, src + numVals, dst);
// DoCopy overload selected by a std::false_type tag, i.e. the input and output
// value types differ and every element needs a conversion.
// NOTE(review): the loop that drives the assignment below is not visible in
// this excerpt.
127 template <
typename InIterT,
typename OutIterT>
128 static void DoCopy(InIterT inIter, OutIterT outIter,
viskores::Id numVals, std::false_type)
// Value types of the two iterators, passed through CleanArrayRef.
130 using InValueType = CleanArrayRef<typename std::iterator_traits<InIterT>::value_type>;
131 using OutValueType = CleanArrayRef<typename std::iterator_traits<OutIterT>::value_type>;
// Read one element, cast it first to the input value type and then to the
// output value type, store it, and advance both iterators.
139 *(outIter++) =
static_cast<OutValueType
>(
static_cast<InValueType
>(*(inIter++)));
// Entry-point DoCopy: works out whether the two iterators share a value type
// and forwards to the tagged overload that matches.
143 template <
typename InIterT,
typename OutIterT>
144 static void DoCopy(InIterT inIter, OutIterT outIter,
viskores::Id numVals)
146 using InValueType = CleanArrayRef<typename std::iterator_traits<InIterT>::value_type>;
147 using OutValueType = CleanArrayRef<typename std::iterator_traits<OutIterT>::value_type>;
// The std::is_same object converts to std::true_type or std::false_type and so
// selects the plain-copy or the converting overload at compile time.
149 DoCopy(inIter, outIter, numVals, std::is_same<InValueType, OutValueType>());
// Copies values chunk by chunk via DoCopy.
// NOTE(review): only part of this function is visible in this excerpt. The
// remaining parameters, the declarations of numVals / numThreads / numChunks /
// valuesPerChunk / inIter / outIter, and any directive attached to the loop
// are on lines not shown; inIter and outIter are presumably iterators over the
// two portals.
153 template <
typename InPortalT,
typename OutPortalT>
154 static void CopyHelper(InPortalT inPortal,
155 OutPortalT outPortal,
160 using InValueT =
typename InPortalT::ValueType;
161 using OutValueT =
typename OutPortalT::ValueType;
// Tag object handed to DoCopy below; it says whether both portals share one
// value type.
162 constexpr
auto isSame = std::is_same<InValueT, OutValueT>();
// NOTE(review): the object this call is made on is on a line not shown here;
// numThreads is passed to it.
180 .GetThreads(numThreads);
// Derive numChunks and valuesPerChunk from the input size, the thread count
// and the size of one input value. The literal 8 is the third argument
// (presumably the chunks-per-thread setting — confirm against the signature).
181 ComputeChunkSize(numVals, numThreads, 8,
sizeof(InValueT), numChunks, valuesPerChunk);
// Walk the range in steps of valuesPerChunk.
185 for (
viskores::
Id i = 0; i < numVals; i += valuesPerChunk)
// The final chunk may be shorter than valuesPerChunk, so clamp its length to
// what is left.
187 viskores::Id chunkSize = std::min(numVals - i, valuesPerChunk);
188 DoCopy(inIter + i, outIter + i, chunkSize, isSame);
// NOTE(review): the signature of the enclosing member function is not visible
// in this excerpt; the lines below store its numValues / valueSize arguments
// and size the per-chunk bookkeeping.
207 this->NumValues = numValues;
// NOTE(review): the object this call is made on is on a line not shown here;
// this->NumThreads is passed to it.
210 .GetThreads(this->NumThreads);
211 this->ValueSize = valueSize;
// Argument list of a call whose callee is on a line not shown. It receives the
// value count, thread count, the literal 8, the value size, and the NumChunks /
// ChunkSize members.
216 this->NumValues, this->NumThreads, 8, valueSize, this->NumChunks, this->ChunkSize);
// One EndIds entry per chunk.
218 this->EndIds.resize(
static_cast<std::size_t
>(this->NumChunks));
// Filters one chunk: values whose stencil entry satisfies the predicate are
// written to the output, and the position after the last written value is
// recorded for the chunk.
// NOTE(review): the function name, the remaining parameters and the
// declaration/initial value of outPos are on lines not shown in this excerpt.
221 template <
typename InIterT,
typename StencilIterT,
typename OutIterT,
typename PredicateT>
223 StencilIterT stencilIter,
// Half-open input range of this chunk, clamped so it never runs past NumValues.
228 viskores::Id startPos = std::min(chunk * this->ChunkSize, this->NumValues);
229 viskores::Id endPos = std::min((chunk + 1) * this->ChunkSize, this->NumValues);
232 for (
viskores::Id inPos = startPos; inPos < endPos; ++inPos)
// Keep the input value only when the predicate accepts the matching stencil
// entry.
234 if (pred(stencilIter[inPos]))
236 outIter[outPos++] = inIter[inPos];
// Remember where this chunk's output stopped.
240 this->EndIds[
static_cast<std::size_t
>(chunk)] = outPos;
// Compaction step over per-chunk results: each chunk's kept values are moved
// to the running position endPos.
// NOTE(review): the function name, its parameters, the loop over i and the
// declarations of endPos and numValuesToCopy are on lines not shown in this
// excerpt.
243 template <
typename OutIterT>
// First index of chunk i, clamped to NumValues.
249 viskores::Id chunkStart = std::min(i * this->ChunkSize, this->NumValues);
// End of the values recorded for chunk i in EndIds.
250 viskores::Id chunkEnd = this->EndIds[
static_cast<std::size_t
>(i)];
// Skip the copy when the chunk has nothing to move or already sits at endPos.
252 if (numValuesToCopy > 0 && chunkStart != endPos)
254 std::copy(data + chunkStart, data + chunkEnd, data + endPos);
256 endPos += numValuesToCopy;
262 #ifdef VISKORES_OPENMP_USE_NATIVE_REDUCTION
265 template <
typename T>
310 template <
typename T>
312 #endif // VISKORES_OPENMP_USE_NATIVE_REDUCTION
317 template <
typename T>
322 template <
typename T, viskores::IdComponent Size>
// Boolean trait over a pair of types: its value is true only when both T and U
// are integral types.
// NOTE(review): the name of the struct that owns this base-class list is on a
// line not shown in this excerpt.
327 template <
typename T,
typename U>
329 :
public std::integral_constant<bool, std::is_integral<T>::value && std::is_integral<U>::value>
// Reduction overload selected by a std::false_type tag. Each thread produces a
// partial result, and the partial results are then folded into `init`.
// NOTE(review): the parallel-region directives, the declarations of numThreads
// and data, and several control-flow lines are not visible in this excerpt, so
// the notes below describe only the statements that are shown.
334 template <
typename PortalT,
typename ReturnType,
typename Functor>
335 static ReturnType
Execute(PortalT portal, ReturnType init, Functor functorIn, std::false_type)
// Wrap the user functor in WrappedBinaryOperator, parameterised on ReturnType.
337 internal::WrappedBinaryOperator<ReturnType, Functor> f(functorIn);
339 const viskores::Id numVals = portal.GetNumberOfValues();
// Starts out false; the lines that may set it are not shown here.
342 bool doParallel =
false;
347 .GetThreads(numThreads);
// Per-thread partial results; allocated below only on the guarded path.
349 std::unique_ptr<ReturnType[]> threadData;
354 int tid = omp_get_thread_num();
// Requires at least two input values per thread. The DoParallelReduction
// overloads in this file seed each thread's accumulator from
// data[2 * tid] and data[2 * tid + 1].
358 if (numVals >= numThreads * 2)
361 threadData.reset(
new ReturnType[
static_cast<std::size_t
>(numThreads)]);
368 const ReturnType localResult = ReduceHelper::DoParallelReduction<ReturnType>(
// Publish this thread's partial result in its own slot.
371 threadData[
static_cast<std::size_t
>(tid)] = localResult;
// Fold every thread's partial result into init.
378 for (
size_t i = 0; i < static_cast<size_t>(numThreads); ++i)
380 init = f(init, threadData[i]);
// Folds data[i] directly into init. NOTE(review): presumably the
// non-parallel path — its loop header is not visible here.
388 init = f(init, data[i]);
// Per-thread reduction that consumes four values per loop iteration.
// NOTE(review): the function name, most parameters, the declarations of
// `offset` and `i`, and any directive between the diagnostic push/pop lines
// are not visible in this excerpt; the template parameter list matches the
// ReduceHelper::DoParallelReduction call made elsewhere in this file.
397 template <
typename ReturnType,
typename IterType,
typename FunctorType>
401 const int& numThreads,
// Seed the accumulator from the two elements reserved for this thread id.
406 ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
// Upper bound for the unrolled loop: numVals rounded down to a multiple of
// four minus four, but never below offset.
409 const viskores::Id end = std::max(((numVals / 4) * 4) - 4, offset);
// Trim `end` so that (unrollEnd - offset) is a multiple of four; the loop
// below then never reads past unrollEnd - 1.
410 const viskores::Id unrollEnd = end - ((end - offset) % 4);
// Suppress GCC's -Wsign-conversion warning for the loop statement only.
415 #pragma GCC diagnostic push
416 #pragma GCC diagnostic ignored "-Wsign-conversion"
418 for (i = offset; i < unrollEnd; i += 4)
419 #pragma GCC diagnostic pop
// Combine the four values as two pairs, then fold both pair results into the
// accumulator.
421 const auto t1 = f(data[i], data[i + 1]);
422 const auto t2 = f(data[i + 2], data[i + 3]);
423 accum = f(accum, t1);
424 accum = f(accum, t2);
// Only the last thread handles the tail [unrollEnd, numVals) that the unrolled
// loop left over.
429 if (tid == numThreads - 1)
431 for (i = unrollEnd; i < numVals; ++i)
433 accum = f(accum, data[i]);
// Per-thread reduction that folds one value at a time into the accumulator.
// NOTE(review): the function name, most parameters and the loop statement
// between the diagnostic push/pop lines are not visible in this excerpt.
442 template <
typename ReturnType,
typename IterType,
typename FunctorType>
446 const int& numThreads,
// Seed the accumulator from the two elements reserved for this thread id.
451 ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
// Suppress GCC's -Wsign-conversion warning for the statement in between.
454 #pragma GCC diagnostic push
455 #pragma GCC diagnostic ignored "-Wsign-conversion"
458 #pragma GCC diagnostic pop
460 accum = f(accum, data[i]);
466 #ifdef VISKORES_OPENMP_USE_NATIVE_REDUCTION
// VISKORES_OPENMP_SPECIALIZE_REDUCE1(FunctorType, PragmaString) generates an
// Execute overload for one concrete FunctorType, selected by a std::true_type
// tag. Its loop is preceded by a _Pragma built from PragmaString and
// accumulates `value = f(value, portal.Get(i))` over all values of the portal.
// VISKORES_OPENMP_SPECIALIZE_REDUCE(FunctorType, Operator) supplies the text
// "omp parallel for reduction(<Operator>:value)" as that PragmaString.
// NOTE(review): PragmaString arrives as string literals and is stringified
// again by `#PragmaString` before reaching _Pragma; confirm the directive text
// survives that second round of quoting.
// NOTE(review): the braces and return statement of the generated function are
// on continuation lines not shown in this excerpt.
469 #define VISKORES_OPENMP_SPECIALIZE_REDUCE1(FunctorType, PragmaString) \
470 template <typename PortalT, typename ReturnType> \
471 static ReturnType Execute( \
472 PortalT portal, ReturnType value, FunctorType functorIn, std::true_type) \
474 const viskores::Id numValues = portal.GetNumberOfValues(); \
475 internal::WrappedBinaryOperator<ReturnType, FunctorType> f(functorIn); \
476 _Pragma(#PragmaString) for (viskores::Id i = 0; i < numValues; ++i) \
478 value = f(value, portal.Get(i)); \
485 #define VISKORES_OPENMP_SPECIALIZE_REDUCE(FunctorType, Operator) \
486 VISKORES_OPENMP_SPECIALIZE_REDUCE1(FunctorType, "omp parallel for reduction(" #Operator ":value)")
// Generate the overload for viskores::Sum using the `+` reduction operator.
490 VISKORES_OPENMP_SPECIALIZE_REDUCE(
viskores::
Sum, +)
// The generator macros are only needed for the instantiations above.
511 #undef VISKORES_OPENMP_SPECIALIZE_REDUCE
512 #undef VISKORES_OPENMP_SPECIALIZE_REDUCE1
514 #endif // VISKORES_OPENMP_USE_NATIVE_REDUCTION
// Reduce-by-key over chunks: every thread collapses runs of equal keys inside
// its own slice of the input, and the per-thread outputs are then stitched
// together into one compact result.
// NOTE(review): the function name, its first parameter, the OpenMP directives,
// the declarations of keysIn / valuesIn / keysOut / valuesOut / rangeKey /
// outIdx / numThreads and several loop and increment lines are not visible in
// this excerpt. The notes below describe only the statements that are shown.
517 template <
typename KeysInArray,
518 typename ValuesInArray,
519 typename KeysOutArray,
520 typename ValuesOutArray,
521 typename BinaryFunctor>
523 ValuesInArray valuesInArray,
524 KeysOutArray keysOutArray,
525 ValuesOutArray valuesOutArray,
526 BinaryFunctor functor)
528 using KeyType =
typename KeysInArray::ValueType;
529 using ValueType =
typename ValuesInArray::ValueType;
533 const viskores::Id numValues = keysInArray.GetNumberOfValues();
540 auto valuesOutPortal =
// Wrap the user functor in WrappedBinaryOperator, parameterised on ValueType.
545 internal::WrappedBinaryOperator<ValueType, BinaryFunctor> f(functor);
551 .GetThreads(numThreads);
// firstprivate clause: each thread gets its own copy of the iterators and of f,
// initialised from the values they hold before the region.
554 firstprivate(keysIn, valuesIn, keysOut, valuesOut, f)
557 int tid = omp_get_thread_num();
// Split the input evenly across the threads (rounding up) and clamp this
// thread's [scanIdx, scanEnd) slice to the input size.
560 viskores::Id chunkSize = (numValues + numThreads - 1) / numThreads;
561 viskores::Id scanIdx = std::min(tid * chunkSize, numValues);
562 viskores::Id scanEnd = std::min(scanIdx + chunkSize, numValues);
// Each thread writes its output starting at the beginning of its own slice in
// the output arrays; threadKey / threadValue are the write cursors.
564 auto threadKeysBegin = keysOut + scanIdx;
565 auto threadValuesBegin = valuesOut + scanIdx;
566 auto threadKey = threadKeysBegin;
567 auto threadValue = threadValuesBegin;
571 ValueType rangeValue;
// Start a run from the first key/value of the slice, if the slice is non-empty.
574 if (scanIdx < scanEnd)
576 rangeKey = keysIn[scanIdx];
577 rangeValue = valuesIn[scanIdx];
// While the key stays equal to the run's key, fold the value into rangeValue.
581 while (scanIdx < scanEnd &&
static_cast<KeyType
>(keysIn[scanIdx]) == rangeKey)
583 rangeValue = f(rangeValue, valuesIn[scanIdx]);
// Emit the finished run at the write cursors.
587 *threadKey = rangeKey;
588 *threadValue = rangeValue;
// Number of keys written between threadKeysBegin and the write cursor.
600 outIdx =
static_cast<viskores::Id>(threadKey - threadKeysBegin);
605 for (
int i = 1; i < numThreads; ++i)
// If the last key already in the combined output equals the first key this
// thread produced, merge the two values into the existing output entry.
617 if (outIdx > 0 && threadKeysBegin < threadKey && keysOut[outIdx - 1] == *threadKeysBegin)
619 valuesOut[outIdx - 1] = f(valuesOut[outIdx - 1], *threadValuesBegin);
// Move this thread's output to the compact position, unless it is empty or
// already located there.
625 if (threadKeysBegin < threadKey && threadKeysBegin != keysOut + outIdx)
627 std::copy(threadKeysBegin, threadKey, keysOut + outIdx);
628 std::copy(threadValuesBegin, threadValue, valuesOut + outIdx);
// Advance the combined output size by the number of keys this thread produced.
631 outIdx +=
static_cast<viskores::Id>(threadKey - threadKeysBegin);
// Fragments of a helper that removes adjacent duplicates using a tree of nodes
// laid over chunks of the data.
// NOTE(review): the struct name, its member declarations and the bodies
// surrounding each fragment are not visible in this excerpt; the notes below
// describe only the statements that are shown.
643 template <
typename IterT,
typename RawPredicateT>
646 using ValueType =
typename std::iterator_traits<IterT>::value_type;
// The raw predicate is wrapped in WrappedBinaryOperator with a bool result.
647 using PredicateT = internal::WrappedBinaryOperator<bool, RawPredicateT>;
705 .GetThreads(numThreads);
// Argument list of a call whose callee is on a line not shown; it receives
// the value count, thread count, chunksPerThread and element size, plus
// numChunks and this->LeafSize.
710 this->NumValues, numThreads, chunksPerThread,
sizeof(
ValueType), numChunks, this->LeafSize);
// Count the nodes of the tree: start with one node per chunk, then add each
// higher level, which has half as many nodes (rounded up), until a level with
// a single node has been counted.
713 std::size_t numNodes =
static_cast<std::size_t
>(numChunks);
714 while (numChunks > 1)
716 numChunks = (numChunks + 1) / 2;
717 numNodes +=
static_cast<std::size_t
>(numChunks);
719 this->Nodes.resize(numNodes);
// Suppress GCC's -Wunused-value warning for the statements in between.
728 #pragma GCC diagnostic push
729 #pragma GCC diagnostic ignored "-Wunused-value"
737 #pragma GCC diagnostic pop
// Hands out a pointer to the node stored at nodeIdx.
741 return &this->Nodes[nodeIdx];
// Half of n, rounded up to a multiple of np, offset by range[0].
753 return CeilDivide(n / 2, np) * np + range[0];
// Copy of `this` in a local variable, used for the Uniquify call below.
// NOTE(review): the reason for the indirection is not visible here.
770 auto explicitThis =
this;
776 explicitThis->Uniquify(right);
// Start of this node's input range inside Data.
798 auto start = this->Data + node->
InputRange[0];
// std::unique shifts the kept elements to the front of [start, end) and
// returns the new logical end, which replaces `end`.
800 end = std::unique(start, end, this->Predicate);
810 #endif // viskores_cont_openmp_internal_FunctorsOpenMP_h