Viskores  1.0
FunctorsOpenMP.h
Go to the documentation of this file.
1 //============================================================================
2 // The contents of this file are covered by the Viskores license. See
3 // LICENSE.txt for details.
4 //
5 // By contributing to this file, all contributors agree to the Developer
6 // Certificate of Origin Version 1.1 (DCO 1.1) as stated in DCO.txt.
7 //============================================================================
8 
9 //============================================================================
10 // Copyright (c) Kitware, Inc.
11 // All rights reserved.
12 // See LICENSE.txt for details.
13 //
14 // This software is distributed WITHOUT ANY WARRANTY; without even
15 // the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
16 // PURPOSE. See the above copyright notice for more information.
17 //============================================================================
18 #ifndef viskores_cont_openmp_internal_FunctorsOpenMP_h
19 #define viskores_cont_openmp_internal_FunctorsOpenMP_h
20 
23 
25 
28 #include <viskores/Pair.h>
29 #include <viskores/Types.h>
32 
33 #include <omp.h>
34 
35 #include <algorithm>
36 #include <type_traits>
37 #include <vector>
38 
39 // Wrap all '#pragma omp ...' calls in this macro so we can disable them in
40 // non-omp builds and avoid a multitude of 'ignoring pragma..." warnings.
41 #ifdef _OPENMP
42 #define VISKORES_OPENMP_DIRECTIVE_IMPL(fullDir) _Pragma(#fullDir)
43 #define VISKORES_OPENMP_DIRECTIVE(dir) VISKORES_OPENMP_DIRECTIVE_IMPL(omp dir)
44 #else // _OPENMP
45 #define VISKORES_OPENMP_DIRECTIVE(directive)
46 #endif // _OPENMP
47 
48 // See "OpenMP data sharing" section of
49 // https://www.gnu.org/software/gcc/gcc-9/porting_to.html. OpenMP broke
50 // backwards compatibility regarding const variable handling.
51 // tl;dr, put all const variables accessed from openmp blocks in a
52 // VISKORES_OPENMP_SHARED_CONST(var1, var2, ...) macro. This will do The Right Thing
53 // on all gcc.
54 #if defined(VISKORES_GCC) && (__GNUC__ < 9)
55 #define VISKORES_OPENMP_SHARED_CONST(...)
56 #else
57 #define VISKORES_OPENMP_SHARED_CONST(...) shared(__VA_ARGS__)
58 #endif
59 
60 // When defined, supported type / operator combinations will use the OpenMP
61 // reduction(...) clause. Otherwise, all reductions use the general
62 // implementation with a manual reduction once the threads complete.
63 // I don't know how, but the benchmarks currently perform better without the
64 // specializations.
65 //#define VISKORES_OPENMP_USE_NATIVE_REDUCTION
66 
67 namespace viskores
68 {
69 namespace cont
70 {
71 namespace openmp
72 {
73 
74 constexpr static viskores::Id VISKORES_CACHE_LINE_SIZE = 64;
75 constexpr static viskores::Id VISKORES_PAGE_SIZE = 4096;
76 
77 // Returns ceil(num/den) for integral types
78 template <typename T>
79 static constexpr T CeilDivide(const T& numerator, const T& denominator)
80 {
81  return (numerator + denominator - 1) / denominator;
82 }
83 
84 // Computes the number of values per chunk. Note that numChunks + chunkSize may
85 // exceed numVals, so be sure to check upper limits.
86 static void ComputeChunkSize(const viskores::Id numVals,
87  const viskores::Id numThreads,
88  const viskores::Id chunksPerThread,
89  const viskores::Id bytesPerValue,
90  viskores::Id& numChunks,
91  viskores::Id& valuesPerChunk)
92 {
93  // try to evenly distribute pages across chunks:
94  const viskores::Id bytesIn = numVals * bytesPerValue;
95  const viskores::Id pagesIn = CeilDivide(bytesIn, VISKORES_PAGE_SIZE);
96  // If we don't have enough pages to honor chunksPerThread, ignore it:
97  numChunks = (pagesIn > numThreads * chunksPerThread) ? numThreads * chunksPerThread : numThreads;
98  const viskores::Id pagesPerChunk = CeilDivide(pagesIn, numChunks);
99  valuesPerChunk = CeilDivide(pagesPerChunk * VISKORES_PAGE_SIZE, bytesPerValue);
100 }
101 
102 template <typename T>
104 {
105  using type = T;
106 };
107 
108 template <typename PortalType>
109 struct CleanArrayRefImpl<viskores::internal::ArrayPortalValueReference<PortalType>>
110 {
111  using type = typename PortalType::ValueType;
112 };
113 
114 template <typename T>
116 
117 template <typename T, typename U>
118 static void DoCopy(T src, U dst, viskores::Id numVals, std::true_type)
119 {
120  if (numVals)
121  {
122  std::copy(src, src + numVals, dst);
123  }
124 }
125 
126 // Don't use std::copy when type conversion is required because MSVC.
127 template <typename InIterT, typename OutIterT>
128 static void DoCopy(InIterT inIter, OutIterT outIter, viskores::Id numVals, std::false_type)
129 {
130  using InValueType = CleanArrayRef<typename std::iterator_traits<InIterT>::value_type>;
131  using OutValueType = CleanArrayRef<typename std::iterator_traits<OutIterT>::value_type>;
132 
133  for (viskores::Id i = 0; i < numVals; ++i)
134  {
135  // The conversion to InputType and then OutputType looks weird, but it is necessary.
136  // *inItr actually returns an ArrayPortalValueReference, which can automatically convert
137  // itself to InputType but not necessarily OutputType. Thus, we first convert to
138  // InputType, and then allow the conversion to OutputType.
139  *(outIter++) = static_cast<OutValueType>(static_cast<InValueType>(*(inIter++)));
140  }
141 }
142 
143 template <typename InIterT, typename OutIterT>
144 static void DoCopy(InIterT inIter, OutIterT outIter, viskores::Id numVals)
145 {
146  using InValueType = CleanArrayRef<typename std::iterator_traits<InIterT>::value_type>;
147  using OutValueType = CleanArrayRef<typename std::iterator_traits<OutIterT>::value_type>;
148 
149  DoCopy(inIter, outIter, numVals, std::is_same<InValueType, OutValueType>());
150 }
151 
152 
153 template <typename InPortalT, typename OutPortalT>
154 static void CopyHelper(InPortalT inPortal,
155  OutPortalT outPortal,
156  viskores::Id inStart,
157  viskores::Id outStart,
158  viskores::Id numVals)
159 {
160  using InValueT = typename InPortalT::ValueType;
161  using OutValueT = typename OutPortalT::ValueType;
162  constexpr auto isSame = std::is_same<InValueT, OutValueT>();
163 
164  auto inIter = viskores::cont::ArrayPortalToIteratorBegin(inPortal) + inStart;
165  auto outIter = viskores::cont::ArrayPortalToIteratorBegin(outPortal) + outStart;
166  viskores::Id valuesPerChunk;
167 
168  VISKORES_OPENMP_DIRECTIVE(parallel default(none) shared(inIter, outIter, valuesPerChunk, numVals)
170  {
171 
173  {
174  // Evenly distribute full pages to all threads. We manually chunk the
175  // data here so that we can exploit std::copy's memmove optimizations.
176  viskores::Id numChunks;
177  viskores::Id numThreads;
180  .GetThreads(numThreads);
181  ComputeChunkSize(numVals, numThreads, 8, sizeof(InValueT), numChunks, valuesPerChunk);
182  }
183 
184  VISKORES_OPENMP_DIRECTIVE(for schedule(static))
185  for (viskores::Id i = 0; i < numVals; i += valuesPerChunk)
186  {
187  viskores::Id chunkSize = std::min(numVals - i, valuesPerChunk);
188  DoCopy(inIter + i, outIter + i, chunkSize, isSame);
189  }
190  }
191 }
192 
194 {
198 
201  std::vector<viskores::Id> EndIds;
202 
203  CopyIfHelper() = default;
204 
205  void Initialize(viskores::Id numValues, viskores::Id valueSize)
206  {
207  this->NumValues = numValues;
210  .GetThreads(this->NumThreads);
211  this->ValueSize = valueSize;
212 
213  // Evenly distribute pages across the threads. We manually chunk the
214  // data here so that we can exploit std::copy's memmove optimizations.
215  ComputeChunkSize(
216  this->NumValues, this->NumThreads, 8, valueSize, this->NumChunks, this->ChunkSize);
217 
218  this->EndIds.resize(static_cast<std::size_t>(this->NumChunks));
219  }
220 
221  template <typename InIterT, typename StencilIterT, typename OutIterT, typename PredicateT>
222  void CopyIf(InIterT inIter,
223  StencilIterT stencilIter,
224  OutIterT outIter,
225  PredicateT pred,
226  viskores::Id chunk)
227  {
228  viskores::Id startPos = std::min(chunk * this->ChunkSize, this->NumValues);
229  viskores::Id endPos = std::min((chunk + 1) * this->ChunkSize, this->NumValues);
230 
231  viskores::Id outPos = startPos;
232  for (viskores::Id inPos = startPos; inPos < endPos; ++inPos)
233  {
234  if (pred(stencilIter[inPos]))
235  {
236  outIter[outPos++] = inIter[inPos];
237  }
238  }
239 
240  this->EndIds[static_cast<std::size_t>(chunk)] = outPos;
241  }
242 
243  template <typename OutIterT>
244  viskores::Id Reduce(OutIterT data)
245  {
246  viskores::Id endPos = this->EndIds.front();
247  for (viskores::Id i = 1; i < this->NumChunks; ++i)
248  {
249  viskores::Id chunkStart = std::min(i * this->ChunkSize, this->NumValues);
250  viskores::Id chunkEnd = this->EndIds[static_cast<std::size_t>(i)];
251  viskores::Id numValuesToCopy = chunkEnd - chunkStart;
252  if (numValuesToCopy > 0 && chunkStart != endPos)
253  {
254  std::copy(data + chunkStart, data + chunkEnd, data + endPos);
255  }
256  endPos += numValuesToCopy;
257  }
258  return endPos;
259  }
260 };
261 
262 #ifdef VISKORES_OPENMP_USE_NATIVE_REDUCTION
263 // OpenMP only declares reduction operations for primitive types. This utility
264 // detects if a type T is supported.
265 template <typename T>
266 struct OpenMPReductionSupported : std::false_type
267 {
268 };
269 template <>
270 struct OpenMPReductionSupported<Int8> : std::true_type
271 {
272 };
273 template <>
274 struct OpenMPReductionSupported<UInt8> : std::true_type
275 {
276 };
277 template <>
278 struct OpenMPReductionSupported<Int16> : std::true_type
279 {
280 };
281 template <>
282 struct OpenMPReductionSupported<UInt16> : std::true_type
283 {
284 };
285 template <>
286 struct OpenMPReductionSupported<Int32> : std::true_type
287 {
288 };
289 template <>
290 struct OpenMPReductionSupported<UInt32> : std::true_type
291 {
292 };
293 template <>
294 struct OpenMPReductionSupported<Int64> : std::true_type
295 {
296 };
297 template <>
298 struct OpenMPReductionSupported<UInt64> : std::true_type
299 {
300 };
301 template <>
302 struct OpenMPReductionSupported<Float32> : std::true_type
303 {
304 };
305 template <>
306 struct OpenMPReductionSupported<Float64> : std::true_type
307 {
308 };
309 #else
310 template <typename T>
311 using OpenMPReductionSupported = std::false_type;
312 #endif // VISKORES_OPENMP_USE_NATIVE_REDUCTION
313 
315 {
316  // std::is_integral, but adapted to see through vecs and pairs.
317  template <typename T>
318  struct IsIntegral : public std::is_integral<T>
319  {
320  };
321 
322  template <typename T, viskores::IdComponent Size>
323  struct IsIntegral<viskores::Vec<T, Size>> : public std::is_integral<T>
324  {
325  };
326 
327  template <typename T, typename U>
328  struct IsIntegral<viskores::Pair<T, U>>
329  : public std::integral_constant<bool, std::is_integral<T>::value && std::is_integral<U>::value>
330  {
331  };
332 
333  // Generic implementation:
334  template <typename PortalT, typename ReturnType, typename Functor>
335  static ReturnType Execute(PortalT portal, ReturnType init, Functor functorIn, std::false_type)
336  {
337  internal::WrappedBinaryOperator<ReturnType, Functor> f(functorIn);
338 
339  const viskores::Id numVals = portal.GetNumberOfValues();
340  auto data = viskores::cont::ArrayPortalToIteratorBegin(portal);
341 
342  bool doParallel = false;
343  viskores::Id numThreads = 0;
344 
347  .GetThreads(numThreads);
348 
349  std::unique_ptr<ReturnType[]> threadData;
350 
351  VISKORES_OPENMP_DIRECTIVE(parallel default(none) firstprivate(f) shared(
352  data, doParallel, numThreads, threadData) VISKORES_OPENMP_SHARED_CONST(numVals))
353  {
354  int tid = omp_get_thread_num();
355 
357  {
358  if (numVals >= numThreads * 2)
359  {
360  doParallel = true;
361  threadData.reset(new ReturnType[static_cast<std::size_t>(numThreads)]);
362  }
363  }
364 
365  if (doParallel)
366  {
367  // Static dispatch to unroll non-integral types:
368  const ReturnType localResult = ReduceHelper::DoParallelReduction<ReturnType>(
369  data, numVals, tid, numThreads, f, IsIntegral<ReturnType>{});
370 
371  threadData[static_cast<std::size_t>(tid)] = localResult;
372  }
373  } // end parallel
374 
375  if (doParallel)
376  {
377  // do the final reduction serially:
378  for (size_t i = 0; i < static_cast<size_t>(numThreads); ++i)
379  {
380  init = f(init, threadData[i]);
381  }
382  }
383  else
384  {
385  // Not enough threads. Do the entire reduction in serial:
386  for (viskores::Id i = 0; i < numVals; ++i)
387  {
388  init = f(init, data[i]);
389  }
390  }
391 
392  return init;
393  }
394 
395  // non-integer reduction: unroll loop manually.
396  // This gives faster code for floats and non-trivial types.
397  template <typename ReturnType, typename IterType, typename FunctorType>
398  static ReturnType DoParallelReduction(IterType data,
399  const viskores::Id& numVals,
400  const int& tid,
401  const int& numThreads,
402  FunctorType f,
403  std::false_type /* isIntegral */)
404  {
405  // Use the first (numThreads*2) values for initializing:
406  ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
407 
408  const viskores::Id offset = numThreads * 2;
409  const viskores::Id end = std::max(((numVals / 4) * 4) - 4, offset);
410  const viskores::Id unrollEnd = end - ((end - offset) % 4);
411  viskores::Id i = offset;
412 
413 // When initializing the looping iterator to a non integral type, intel compilers will
414 // convert the iterator type to an unsigned value
415 #pragma GCC diagnostic push
416 #pragma GCC diagnostic ignored "-Wsign-conversion"
417  VISKORES_OPENMP_DIRECTIVE(for schedule(static))
418  for (i = offset; i < unrollEnd; i += 4)
419 #pragma GCC diagnostic pop
420  {
421  const auto t1 = f(data[i], data[i + 1]);
422  const auto t2 = f(data[i + 2], data[i + 3]);
423  accum = f(accum, t1);
424  accum = f(accum, t2);
425  }
426 
427  // Let the last thread mop up any remaining values as it would
428  // have just accessed the adjacent data
429  if (tid == numThreads - 1)
430  {
431  for (i = unrollEnd; i < numVals; ++i)
432  {
433  accum = f(accum, data[i]);
434  }
435  }
436 
437  return accum;
438  }
439 
440  // Integer reduction: no unrolling. Ints vectorize easily and unrolling can
441  // hurt performance.
442  template <typename ReturnType, typename IterType, typename FunctorType>
443  static ReturnType DoParallelReduction(IterType data,
444  const viskores::Id& numVals,
445  const int& tid,
446  const int& numThreads,
447  FunctorType f,
448  std::true_type /* isIntegral */)
449  {
450  // Use the first (numThreads*2) values for initializing:
451  ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
452 
453 // Assign each thread chunks of the remaining values for local reduction
454 #pragma GCC diagnostic push
455 #pragma GCC diagnostic ignored "-Wsign-conversion"
456  VISKORES_OPENMP_DIRECTIVE(for schedule(static))
457  for (viskores::Id i = numThreads * 2; i < numVals; i++)
458 #pragma GCC diagnostic pop
459  {
460  accum = f(accum, data[i]);
461  }
462 
463  return accum;
464  }
465 
466 #ifdef VISKORES_OPENMP_USE_NATIVE_REDUCTION
467 
468 // Specialize for viskores functors with OpenMP special cases:
469 #define VISKORES_OPENMP_SPECIALIZE_REDUCE1(FunctorType, PragmaString) \
470  template <typename PortalT, typename ReturnType> \
471  static ReturnType Execute( \
472  PortalT portal, ReturnType value, FunctorType functorIn, std::true_type) \
473  { \
474  const viskores::Id numValues = portal.GetNumberOfValues(); \
475  internal::WrappedBinaryOperator<ReturnType, FunctorType> f(functorIn); \
476  _Pragma(#PragmaString) for (viskores::Id i = 0; i < numValues; ++i) \
477  { \
478  value = f(value, portal.Get(i)); \
479  } \
480  return value; \
481  }
482 
483 // Constructing the pragma string inside the _Pragma call doesn't work so
484 // we jump through a hoop:
485 #define VISKORES_OPENMP_SPECIALIZE_REDUCE(FunctorType, Operator) \
486  VISKORES_OPENMP_SPECIALIZE_REDUCE1(FunctorType, "omp parallel for reduction(" #Operator ":value)")
487 
488  // + (Add, Sum)
489  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::Add, +)
490  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::Sum, +)
491  // * (Multiply, Product)
492  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::Multiply, *)
493  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::Product, *)
494  // - (Subtract)
495  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::Subtract, -)
496  // & (BitwiseAnd)
497  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::BitwiseAnd, &)
498  // | (BitwiseOr)
499  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::BitwiseOr, |)
500  // ^ (BitwiseXor)
501  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::BitwiseXor, ^)
502  // && (LogicalAnd)
503  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::LogicalAnd, &&)
504  // || (LogicalOr)
505  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::LogicalOr, ||)
506  // min (Minimum)
507  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::Minimum, min)
508  // max (Maximum)
509  VISKORES_OPENMP_SPECIALIZE_REDUCE(viskores::Maximum, max)
510 
511 #undef VISKORES_OPENMP_SPECIALIZE_REDUCE
512 #undef VISKORES_OPENMP_SPECIALIZE_REDUCE1
513 
514 #endif // VISKORES_OPENMP_USE_NATIVE_REDUCTION
515 };
516 
517 template <typename KeysInArray,
518  typename ValuesInArray,
519  typename KeysOutArray,
520  typename ValuesOutArray,
521  typename BinaryFunctor>
522 void ReduceByKeyHelper(KeysInArray keysInArray,
523  ValuesInArray valuesInArray,
524  KeysOutArray keysOutArray,
525  ValuesOutArray valuesOutArray,
526  BinaryFunctor functor)
527 {
528  using KeyType = typename KeysInArray::ValueType;
529  using ValueType = typename ValuesInArray::ValueType;
530 
531  viskores::cont::Token token;
532 
533  const viskores::Id numValues = keysInArray.GetNumberOfValues();
534  auto keysInPortal = keysInArray.PrepareForInput(DeviceAdapterTagOpenMP(), token);
535  auto valuesInPortal = valuesInArray.PrepareForInput(DeviceAdapterTagOpenMP(), token);
536  auto keysIn = viskores::cont::ArrayPortalToIteratorBegin(keysInPortal);
537  auto valuesIn = viskores::cont::ArrayPortalToIteratorBegin(valuesInPortal);
538 
539  auto keysOutPortal = keysOutArray.PrepareForOutput(numValues, DeviceAdapterTagOpenMP(), token);
540  auto valuesOutPortal =
541  valuesOutArray.PrepareForOutput(numValues, DeviceAdapterTagOpenMP(), token);
542  auto keysOut = viskores::cont::ArrayPortalToIteratorBegin(keysOutPortal);
543  auto valuesOut = viskores::cont::ArrayPortalToIteratorBegin(valuesOutPortal);
544 
545  internal::WrappedBinaryOperator<ValueType, BinaryFunctor> f(functor);
546  viskores::Id outIdx = 0;
547  viskores::Id numThreads = 0;
548 
551  .GetThreads(numThreads);
552 
553  VISKORES_OPENMP_DIRECTIVE(parallel default(none)
554  firstprivate(keysIn, valuesIn, keysOut, valuesOut, f)
555  shared(numThreads, outIdx) VISKORES_OPENMP_SHARED_CONST(numValues))
556  {
557  int tid = omp_get_thread_num();
558 
559  // Determine bounds for this thread's scan operation:
560  viskores::Id chunkSize = (numValues + numThreads - 1) / numThreads;
561  viskores::Id scanIdx = std::min(tid * chunkSize, numValues);
562  viskores::Id scanEnd = std::min(scanIdx + chunkSize, numValues);
563 
564  auto threadKeysBegin = keysOut + scanIdx;
565  auto threadValuesBegin = valuesOut + scanIdx;
566  auto threadKey = threadKeysBegin;
567  auto threadValue = threadValuesBegin;
568 
569  // Reduce each thread's partition:
570  KeyType rangeKey;
571  ValueType rangeValue;
572  for (;;)
573  {
574  if (scanIdx < scanEnd)
575  {
576  rangeKey = keysIn[scanIdx];
577  rangeValue = valuesIn[scanIdx];
578  ++scanIdx;
579 
580  // Locate end of current range:
581  while (scanIdx < scanEnd && static_cast<KeyType>(keysIn[scanIdx]) == rangeKey)
582  {
583  rangeValue = f(rangeValue, valuesIn[scanIdx]);
584  ++scanIdx;
585  }
586 
587  *threadKey = rangeKey;
588  *threadValue = rangeValue;
589  ++threadKey;
590  ++threadValue;
591  }
592  else
593  {
594  break;
595  }
596  }
597 
598  if (tid == 0)
599  {
600  outIdx = static_cast<viskores::Id>(threadKey - threadKeysBegin);
601  }
602 
603  // Combine the reduction results. Skip tid == 0, since it's already in
604  // the correct location:
605  for (int i = 1; i < numThreads; ++i)
606  {
607 
608  // This barrier ensures that:
609  // 1) Threads remain synchronized through this final reduction loop.
610  // 2) The outIdx variable is initialized by thread 0.
611  // 3) All threads have reduced their partitions.
613 
614  if (tid == i)
615  {
616  // Check if the previous thread's last key matches our first:
617  if (outIdx > 0 && threadKeysBegin < threadKey && keysOut[outIdx - 1] == *threadKeysBegin)
618  {
619  valuesOut[outIdx - 1] = f(valuesOut[outIdx - 1], *threadValuesBegin);
620  ++threadKeysBegin;
621  ++threadValuesBegin;
622  }
623 
624  // Copy reduced partition to final location (if needed)
625  if (threadKeysBegin < threadKey && threadKeysBegin != keysOut + outIdx)
626  {
627  std::copy(threadKeysBegin, threadKey, keysOut + outIdx);
628  std::copy(threadValuesBegin, threadValue, valuesOut + outIdx);
629  }
630 
631  outIdx += static_cast<viskores::Id>(threadKey - threadKeysBegin);
632 
633  } // end tid == i
634  } // end combine reduction
635  } // end parallel
636 
637  token.DetachFromAll();
638 
639  keysOutArray.Allocate(outIdx, viskores::CopyFlag::On);
640  valuesOutArray.Allocate(outIdx, viskores::CopyFlag::On);
641 }
642 
643 template <typename IterT, typename RawPredicateT>
645 {
646  using ValueType = typename std::iterator_traits<IterT>::value_type;
647  using PredicateT = internal::WrappedBinaryOperator<bool, RawPredicateT>;
648 
649  struct Node
650  {
653 
654  // Pad the node out to the size of a cache line to prevent false sharing:
655  static constexpr size_t DataSize = 2 * sizeof(viskores::Id2);
656  static constexpr size_t NumCacheLines = CeilDivide<size_t>(DataSize, VISKORES_CACHE_LINE_SIZE);
657  static constexpr size_t PaddingSize = NumCacheLines * VISKORES_CACHE_LINE_SIZE - DataSize;
658  unsigned char Padding[PaddingSize];
659  };
660 
661  IterT Data;
665  std::vector<Node> Nodes;
666  size_t NextNode;
667 
668  UniqueHelper(IterT iter, viskores::Id numValues, RawPredicateT pred)
669  : Data(iter)
670  , NumValues(numValues)
671  , Predicate(pred)
672  , LeafSize(0)
673  , NextNode(0)
674  {
675  }
676 
678  {
679  viskores::Id outSize = 0;
680 
681  VISKORES_OPENMP_DIRECTIVE(parallel default(shared))
682  {
684  {
685  this->Prepare();
686 
687  // Kick off task-based divide-and-conquer uniquification:
688  Node* rootNode = this->AllocNode();
689  rootNode->InputRange = viskores::Id2(0, this->NumValues);
690  this->Uniquify(rootNode);
691  outSize = rootNode->OutputRange[1] - rootNode->OutputRange[0];
692  }
693  }
694 
695  return outSize;
696  }
697 
698 private:
699  void Prepare()
700  {
701  // Figure out how many values each thread should handle:
702  viskores::Id numThreads = 0;
705  .GetThreads(numThreads);
706 
707  viskores::Id chunksPerThread = 8;
708  viskores::Id numChunks;
709  ComputeChunkSize(
710  this->NumValues, numThreads, chunksPerThread, sizeof(ValueType), numChunks, this->LeafSize);
711 
712  // Compute an upper-bound of the number of nodes in the tree:
713  std::size_t numNodes = static_cast<std::size_t>(numChunks);
714  while (numChunks > 1)
715  {
716  numChunks = (numChunks + 1) / 2;
717  numNodes += static_cast<std::size_t>(numChunks);
718  }
719  this->Nodes.resize(numNodes);
720  this->NextNode = 0;
721  }
722 
724  {
725  size_t nodeIdx;
726 
727 // GCC emits a false positive "value computed but not used" for this block:
728 #pragma GCC diagnostic push
729 #pragma GCC diagnostic ignored "-Wunused-value"
730 
731  VISKORES_OPENMP_DIRECTIVE(atomic capture)
732  {
733  nodeIdx = this->NextNode;
734  ++this->NextNode;
735  }
736 
737 #pragma GCC diagnostic pop
738 
739  VISKORES_ASSERT(nodeIdx < this->Nodes.size());
740 
741  return &this->Nodes[nodeIdx];
742  }
743 
744  bool IsLeaf(const viskores::Id2& range) { return (range[1] - range[0]) <= this->LeafSize; }
745 
746  // Not an strict midpoint, but ensures that the first range will always be
747  // a multiple of the leaf size.
749  {
750  const viskores::Id n = range[1] - range[0];
751  const viskores::Id np = this->LeafSize;
752 
753  return CeilDivide(n / 2, np) * np + range[0];
754  }
755 
756  void Uniquify(Node* node)
757  {
758  if (!this->IsLeaf(node->InputRange))
759  {
760  viskores::Id midpoint = this->ComputeMidpoint(node->InputRange);
761 
762  Node* right = this->AllocNode();
763  Node* left = this->AllocNode();
764 
765  right->InputRange = viskores::Id2(midpoint, node->InputRange[1]);
766 
767  // Intel compilers seem to have trouble following the 'this' pointer
768  // when launching tasks, resulting in a corrupt task environment.
769  // Explicitly copying the pointer into a local variable seems to fix this.
770  auto explicitThis = this;
771 
772  VISKORES_OPENMP_DIRECTIVE(taskgroup)
773  {
775  {
776  explicitThis->Uniquify(right);
777  }
778 
779  left->InputRange = viskores::Id2(node->InputRange[0], midpoint);
780  this->Uniquify(left);
781 
782  } // end taskgroup. Both sides of the tree will be completed here.
783 
784  // Combine the ranges in the left side:
785  if (this->Predicate(this->Data[left->OutputRange[1] - 1], this->Data[right->OutputRange[0]]))
786  {
787  ++right->OutputRange[0];
788  }
789 
790  viskores::Id numVals = right->OutputRange[1] - right->OutputRange[0];
791  DoCopy(this->Data + right->OutputRange[0], this->Data + left->OutputRange[1], numVals);
792 
793  node->OutputRange[0] = left->OutputRange[0];
794  node->OutputRange[1] = left->OutputRange[1] + numVals;
795  }
796  else
797  {
798  auto start = this->Data + node->InputRange[0];
799  auto end = this->Data + node->InputRange[1];
800  end = std::unique(start, end, this->Predicate);
801  node->OutputRange[0] = node->InputRange[0];
802  node->OutputRange[1] = node->InputRange[0] + static_cast<viskores::Id>(end - start);
803  }
804  }
805 };
806 }
807 }
808 } // end namespace viskores::cont::openmp
809 
810 #endif // viskores_cont_openmp_internal_FunctorsOpenMP_h
viskores::Product
Binary Predicate that takes two arguments argument x, and y and returns product (multiplication) of t...
Definition: BinaryOperators.h:64
viskores::Id2
viskores::Vec< viskores::Id, 2 > Id2
Id2 corresponds to a 2-dimensional index.
Definition: Types.h:935
ArrayHandle.h
viskores::cont::openmp::ReduceHelper::IsIntegral
Definition: FunctorsOpenMP.h:318
FunctorsGeneral.h
viskores::cont::openmp::UniqueHelper::Node::PaddingSize
static constexpr size_t PaddingSize
Definition: FunctorsOpenMP.h:657
viskores::Int16
int16_t Int16
Base type to use for 16-bit signed integer numbers.
Definition: Types.h:181
Types.h
viskores::cont::openmp::UniqueHelper::Node::Padding
unsigned char Padding[PaddingSize]
Definition: FunctorsOpenMP.h:658
viskores::cont::openmp::CopyIfHelper::NumThreads
viskores::Id NumThreads
Definition: FunctorsOpenMP.h:196
viskores::Subtract
Definition: Types.h:288
viskores::cont::openmp::UniqueHelper::NextNode
size_t NextNode
Definition: FunctorsOpenMP.h:666
Pair.h
viskores::cont::RuntimeDeviceInformation
A class that can be used to determine if a given device adapter is supported on the current machine a...
Definition: RuntimeDeviceInformation.h:37
viskores::BitwiseAnd
Binary Predicate that takes two arguments argument x, and y and returns the bitwise operation x&y
Definition: BinaryOperators.h:153
viskores::Int8
int8_t Int8
Base type to use for 8-bit signed integer numbers.
Definition: Types.h:173
viskores::cont::openmp::CleanArrayRefImpl::type
T type
Definition: FunctorsOpenMP.h:105
viskores::LogicalAnd
Binary Predicate that takes two arguments argument x, and y and returns True if and only if x and y a...
Definition: BinaryPredicates.h:79
viskores::cont::openmp::UniqueHelper::Node::NumCacheLines
static constexpr size_t NumCacheLines
Definition: FunctorsOpenMP.h:656
viskores::cont::openmp::UniqueHelper::Node::DataSize
static constexpr size_t DataSize
Definition: FunctorsOpenMP.h:655
viskores::UInt16
uint16_t UInt16
Base type to use for 16-bit unsigned integer numbers.
Definition: Types.h:185
viskores::cont::openmp::CopyIfHelper::Reduce
viskores::Id Reduce(OutIterT data)
Definition: FunctorsOpenMP.h:244
viskores::cont::openmp::CleanArrayRef
typename CleanArrayRefImpl< T >::type CleanArrayRef
Definition: FunctorsOpenMP.h:115
viskores::cont::openmp::CleanArrayRefImpl
Definition: FunctorsOpenMP.h:103
VISKORES_OPENMP_DIRECTIVE
#define VISKORES_OPENMP_DIRECTIVE(directive)
Definition: FunctorsOpenMP.h:45
viskores::cont::openmp::ReduceHelper::DoParallelReduction
static ReturnType DoParallelReduction(IterType data, const viskores::Id &numVals, const int &tid, const int &numThreads, FunctorType f, std::false_type)
Definition: FunctorsOpenMP.h:398
viskores::cont::openmp::UniqueHelper::Node
Definition: FunctorsOpenMP.h:649
viskores::Multiply
Definition: Types.h:308
viskores::Add
Definition: Types.h:268
viskores::cont::DeviceAdapterTagOpenMP
Tag for a device adapter that uses OpenMP compiler extensions to run algorithms on multiple threads.
Definition: DeviceAdapterTagOpenMP.h:35
viskores::BitwiseXor
Binary Predicate that takes two arguments argument x, and y and returns the bitwise operation x^y
Definition: BinaryOperators.h:199
viskores::cont::openmp::CopyIfHelper::CopyIf
void CopyIf(InIterT inIter, StencilIterT stencilIter, OutIterT outIter, PredicateT pred, viskores::Id chunk)
Definition: FunctorsOpenMP.h:222
viskores::Maximum
Binary Predicate that takes two arguments argument x, and y and returns the x if x > y otherwise retu...
Definition: BinaryOperators.h:93
viskores::Int64
signed long long Int64
Base type to use for 64-bit signed integer numbers.
Definition: Types.h:212
DeviceAdapterTagOpenMP.h
viskores::cont::openmp::UniqueHelper::Node::OutputRange
viskores::Id2 OutputRange
Definition: FunctorsOpenMP.h:652
viskores::cont::openmp::UniqueHelper::LeafSize
viskores::Id LeafSize
Definition: FunctorsOpenMP.h:664
viskores::Id
viskores::Int64 Id
Base type to use to index arrays.
Definition: Types.h:235
viskores::BitwiseOr
Binary Predicate that takes two arguments argument x, and y and returns the bitwise operation x|y
Definition: BinaryOperators.h:176
viskores
Groups connected points that have the same field value.
Definition: Atomic.h:27
viskores::cont::openmp::ReduceHelper::Execute
static ReturnType Execute(PortalT portal, ReturnType init, Functor functorIn, std::false_type)
Definition: FunctorsOpenMP.h:335
viskores::cont::openmp::UniqueHelper::IsLeaf
bool IsLeaf(const viskores::Id2 &range)
Definition: FunctorsOpenMP.h:744
viskores::cont::Token::DetachFromAll
void DetachFromAll()
Detaches this Token from all resources to allow them to be used elsewhere or deleted.
viskores::cont::openmp::CopyIfHelper::NumChunks
viskores::Id NumChunks
Definition: FunctorsOpenMP.h:199
viskores::cont::openmp::UniqueHelper
Definition: FunctorsOpenMP.h:644
viskores::CopyFlag::On
@ On
viskores::Float32
float Float32
Base type to use for 32-bit floating-point numbers.
Definition: Types.h:165
viskores::cont::ArrayPortalToIteratorBegin
viskores::cont::ArrayPortalToIterators< PortalType >::IteratorType ArrayPortalToIteratorBegin(const PortalType &portal)
Convenience function for converting an ArrayPortal to a begin iterator.
Definition: ArrayPortalToIterators.h:189
viskores::cont::openmp::ReduceByKeyHelper
void ReduceByKeyHelper(KeysInArray keysInArray, ValuesInArray valuesInArray, KeysOutArray keysOutArray, ValuesOutArray valuesOutArray, BinaryFunctor functor)
Definition: FunctorsOpenMP.h:522
viskores::cont::openmp::UniqueHelper::Prepare
void Prepare()
Definition: FunctorsOpenMP.h:699
VISKORES_ASSERT
#define VISKORES_ASSERT(condition)
Definition: Assert.h:51
viskores::cont::openmp::UniqueHelper::Nodes
std::vector< Node > Nodes
Definition: FunctorsOpenMP.h:665
viskores::Pair
A viskores::Pair is essentially the same as an STL pair object except that the methods (constructors ...
Definition: Pair.h:37
viskores::cont::openmp::UniqueHelper::Node::InputRange
viskores::Id2 InputRange
Definition: FunctorsOpenMP.h:651
ErrorExecution.h
BinaryOperators.h
viskores::cont::openmp::ReduceHelper
Definition: FunctorsOpenMP.h:314
viskores::Sum
Binary Predicate that takes two arguments argument x, and y and returns sum (addition) of the two val...
Definition: BinaryOperators.h:41
viskores::UInt64
unsigned long long UInt64
Base type to use for 64-bit signed integer numbers.
Definition: Types.h:215
viskores::UInt8
uint8_t UInt8
Base type to use for 8-bit unsigned integer numbers.
Definition: Types.h:177
viskores::Int32
int32_t Int32
Base type to use for 32-bit signed integer numbers.
Definition: Types.h:189
viskores::LogicalOr
Binary Predicate that takes two arguments argument x, and y and returns True if and only if x or y is...
Definition: BinaryPredicates.h:91
viskores::cont::openmp::UniqueHelper::Execute
viskores::Id Execute()
Definition: FunctorsOpenMP.h:677
viskores::cont::openmp::UniqueHelper::NumValues
viskores::Id NumValues
Definition: FunctorsOpenMP.h:662
viskores::cont::openmp::CopyIfHelper::ChunkSize
viskores::Id ChunkSize
Definition: FunctorsOpenMP.h:200
BinaryPredicates.h
viskores::cont::openmp::UniqueHelper::ValueType
typename std::iterator_traits< IterT >::value_type ValueType
Definition: FunctorsOpenMP.h:646
viskores::cont::openmp::CopyIfHelper
Definition: FunctorsOpenMP.h:193
viskores::Minimum
Binary Predicate that takes two arguments argument x, and y and returns the x if x < y otherwise retu...
Definition: BinaryOperators.h:107
viskores::cont::openmp::UniqueHelper::UniqueHelper
UniqueHelper(IterT iter, viskores::Id numValues, RawPredicateT pred)
Definition: FunctorsOpenMP.h:668
RuntimeDeviceInformation.h
viskores::cont::openmp::UniqueHelper::AllocNode
Node * AllocNode()
Definition: FunctorsOpenMP.h:723
viskores::cont::openmp::UniqueHelper::Data
IterT Data
Definition: FunctorsOpenMP.h:661
viskores::cont::openmp::UniqueHelper::ComputeMidpoint
viskores::Id ComputeMidpoint(const viskores::Id2 &range)
Definition: FunctorsOpenMP.h:748
viskores::Float64
double Float64
Base type to use for 64-bit floating-point numbers.
Definition: Types.h:169
viskores::cont::openmp::UniqueHelper::Predicate
PredicateT Predicate
Definition: FunctorsOpenMP.h:663
viskores::cont::openmp::CopyIfHelper::NumValues
viskores::Id NumValues
Definition: FunctorsOpenMP.h:195
viskores::cont::openmp::OpenMPReductionSupported
std::false_type OpenMPReductionSupported
Definition: FunctorsOpenMP.h:311
viskores::cont::openmp::UniqueHelper::Uniquify
void Uniquify(Node *node)
Definition: FunctorsOpenMP.h:756
viskores::Vec
A short fixed-length array.
Definition: Types.h:365
viskores::cont::Token
A token to hold the scope of an ArrayHandle or other object.
Definition: Token.h:43
viskores::cont::openmp::ReduceHelper::DoParallelReduction
static ReturnType DoParallelReduction(IterType data, const viskores::Id &numVals, const int &tid, const int &numThreads, FunctorType f, std::true_type)
Definition: FunctorsOpenMP.h:443
viskores::UInt32
uint32_t UInt32
Base type to use for 32-bit unsigned integer numbers.
Definition: Types.h:193
viskores::cont::openmp::CopyIfHelper::Initialize
void Initialize(viskores::Id numValues, viskores::Id valueSize)
Definition: FunctorsOpenMP.h:205
viskores::cont::openmp::CopyIfHelper::ValueSize
viskores::Id ValueSize
Definition: FunctorsOpenMP.h:197
viskores::cont::openmp::CopyIfHelper::CopyIfHelper
CopyIfHelper()=default
VISKORES_OPENMP_SHARED_CONST
#define VISKORES_OPENMP_SHARED_CONST(...)
Definition: FunctorsOpenMP.h:57
viskores::cont::openmp::CopyIfHelper::EndIds
std::vector< viskores::Id > EndIds
Definition: FunctorsOpenMP.h:201
viskores::cont::RuntimeDeviceInformation::GetRuntimeConfiguration
viskores::cont::internal::RuntimeDeviceConfigurationBase & GetRuntimeConfiguration(DeviceAdapterId id, const viskores::cont::internal::RuntimeDeviceConfigurationOptions &configOptions, int &argc, char *argv[]=nullptr) const
Returns a reference to a RuntimeDeviceConfiguration that will work with the given device.
viskores::cont::openmp::UniqueHelper::PredicateT
internal::WrappedBinaryOperator< bool, RawPredicateT > PredicateT
Definition: FunctorsOpenMP.h:647