Viskores  1.0
ParallelRadixSort.h
Go to the documentation of this file.
1 //============================================================================
2 // The contents of this file are covered by the Viskores license. See
3 // LICENSE.txt for details.
4 //
5 // By contributing to this file, all contributors agree to the Developer
6 // Certificate of Origin Version 1.1 (DCO 1.1) as stated in DCO.txt.
7 //============================================================================
8 
9 //============================================================================
10 // Copyright (c) Kitware, Inc.
11 // All rights reserved.
12 // See LICENSE.txt for details.
13 //
14 // This software is distributed WITHOUT ANY WARRANTY; without even
15 // the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
16 // PURPOSE. See the above copyright notice for more information.
17 //============================================================================
18 
19 // Copyright 2010, Takuya Akiba
20 // All rights reserved.
21 //
22 // Redistribution and use in source and binary forms, with or without
23 // modification, are permitted provided that the following conditions are
24 // met:
25 //
26 // * Redistributions of source code must retain the above copyright
27 // notice, this list of conditions and the following disclaimer.
28 // * Redistributions in binary form must reproduce the above
29 // copyright notice, this list of conditions and the following disclaimer
30 // in the documentation and/or other materials provided with the
31 // distribution.
32 // * Neither the name of Takuya Akiba nor the names of its
33 // contributors may be used to endorse or promote products derived from
34 // this software without specific prior written permission.
35 //
36 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
37 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
38 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
39 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
40 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
43 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
44 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
45 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
46 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
47 
48 // Modifications of Takuya Akiba's original GitHub source code for inclusion
49 // in Viskores:
50 //
51 // - Made parallelization library details generic (see "Threading Interface" below).
52 // - Added minimum threshold for parallel, will instead invoke serial radix sort (kxsort)
53 // - Added std::greater<T> and std::less<T> to interface for descending order sorts
54 // - Added linear scaling of threads used by the algorithm for more stable performance
55 // on machines with lots of available threads (KNL and Haswell)
56 //
57 // This file contains an implementation of Satish parallel radix sort
58 // as documented in the following citation:
59 //
60 // Fast sort on CPUs and GPUs: a case for bandwidth oblivious SIMD sort.
61 // N. Satish, C. Kim, J. Chhugani, A. D. Nguyen, V. W. Lee, D. Kim, and P. Dubey.
62 // In Proc. SIGMOD, pages 351–362, 2010
63 //
64 // Threading Interface:
65 //
66 // To use this implementation, an object containing the following members should
67 // be passed as the 'threader' argument at the entry points:
68 //
69 // struct ThreaderExample
70 // {
71 // // Return the number of cores available:
72 // size_t GetAvailableCores();
73 //
74 // // Run the supplied functor in a new thread. This task is likely to
75 // // generate children (through RunChildTasks), and must block until all
76 // // children complete.
77 // template <typename TaskType>
78 // void RunParentTask(TaskType task);
79 //
80 // // Run an child task in a new thread. The function may be blocking or
81 // // non-blocking, and 'data' is an abitrary object passed to the parent
82 // // task's operator() (See the TBB implementation for details).
83 // template <typename TaskType, typename ParentTaskThreadData>
84 // void RunChildTasks(ParentTaskThreadData data, TaskType left, TaskType right);
85 // };
86 //
87 // See the sample implementations and the RunTask struct below for examples.
88 
89 #ifndef viskores_cont_internal_ParallelRadixSort_h
90 #define viskores_cont_internal_ParallelRadixSort_h
91 
93 
94 #include <algorithm>
95 #include <cassert>
96 #include <climits>
97 #include <cmath>
98 #include <cstring>
99 #include <functional>
100 #include <stdint.h>
101 #include <utility>
102 
103 #include <viskores/Types.h>
104 #include <viskores/cont/Logging.h>
105 
107 
109 
110 namespace viskores
111 {
112 namespace cont
113 {
114 namespace internal
115 {
116 namespace radix
117 {
118 
119 namespace utility
120 {
121 // Return the number of threads that would be executed in parallel regions
122 inline size_t GetMaxThreads(size_t num_bytes, size_t available_cores)
123 {
124  const double CORES_PER_BYTE =
125  double(available_cores - 1) / double(BYTES_FOR_MAX_PARALLELISM - MIN_BYTES_FOR_PARALLEL);
126  const double Y_INTERCEPT = 1.0 - CORES_PER_BYTE * MIN_BYTES_FOR_PARALLEL;
127 
128  const size_t num_cores = (size_t)(CORES_PER_BYTE * double(num_bytes) + Y_INTERCEPT);
129  if (num_cores < 1)
130  {
131  return 1;
132  }
133  if (num_cores > available_cores)
134  {
135  return available_cores;
136  }
137  return num_cores;
138 }
139 } // namespace utility
140 
141 namespace internal
142 {
143 // Size of the software managed buffer
144 const size_t kOutBufferSize = 32;
145 
146 // Ascending order radix sort is a no-op
147 template <typename PlainType,
148  typename UnsignedType,
149  typename CompareType,
150  typename ValueManager,
151  unsigned int Base>
152 struct ParallelRadixCompareInternal
153 {
154  inline static void reverse(UnsignedType& t) { (void)t; }
155 };
156 
157 // Handle descending order radix sort
158 template <typename PlainType, typename UnsignedType, typename ValueManager, unsigned int Base>
159 struct ParallelRadixCompareInternal<PlainType,
160  UnsignedType,
161  std::greater<PlainType>,
162  ValueManager,
163  Base>
164 {
165  inline static void reverse(UnsignedType& t) { t = ((1 << Base) - 1) - t; }
166 };
167 
168 // The algorithm is implemented in this internal class
169 template <typename PlainType,
170  typename CompareType,
171  typename UnsignedType,
172  typename Encoder,
173  typename ValueManager,
174  typename ThreaderType,
175  unsigned int Base>
176 class ParallelRadixSortInternal
177 {
178 public:
179  using CompareInternal =
180  ParallelRadixCompareInternal<PlainType, UnsignedType, CompareType, ValueManager, Base>;
181 
182  ParallelRadixSortInternal();
183  ~ParallelRadixSortInternal();
184 
185  void Init(PlainType* data, size_t num_elems, const ThreaderType& threader);
186 
187  PlainType* Sort(PlainType* data, ValueManager* value_manager);
188 
189  static void InitAndSort(PlainType* data,
190  size_t num_elems,
191  const ThreaderType& threader,
192  ValueManager* value_manager);
193 
194 private:
195  CompareInternal compare_internal_;
196  size_t num_elems_;
197  size_t num_threads_;
198 
199  UnsignedType* tmp_;
200  size_t** histo_;
201  UnsignedType*** out_buf_;
202  size_t** out_buf_n_;
203 
204  size_t *pos_bgn_, *pos_end_;
205  ValueManager* value_manager_;
206  ThreaderType threader_;
207 
208  void DeleteAll();
209 
210  UnsignedType* SortInternal(UnsignedType* data, ValueManager* value_manager);
211 
212  // Compute |pos_bgn_| and |pos_end_| (associated ranges for each threads)
213  void ComputeRanges();
214 
215  // First step of each iteration of sorting
216  // Compute the histogram of |src| using bits in [b, b + Base)
217  void ComputeHistogram(unsigned int b, UnsignedType* src);
218 
219  // Second step of each iteration of sorting
220  // Scatter elements of |src| to |dst| using the histogram
221  void Scatter(unsigned int b, UnsignedType* src, UnsignedType* dst);
222 };
223 
224 template <typename PlainType,
225  typename CompareType,
226  typename UnsignedType,
227  typename Encoder,
228  typename ValueManager,
229  typename ThreaderType,
230  unsigned int Base>
231 ParallelRadixSortInternal<PlainType,
232  CompareType,
233  UnsignedType,
234  Encoder,
235  ValueManager,
236  ThreaderType,
237  Base>::ParallelRadixSortInternal()
238  : num_elems_(0)
239  , num_threads_(0)
240  , tmp_(NULL)
241  , histo_(NULL)
242  , out_buf_(NULL)
243  , out_buf_n_(NULL)
244  , pos_bgn_(NULL)
245  , pos_end_(NULL)
246 {
247  assert(sizeof(PlainType) == sizeof(UnsignedType));
248 }
249 
250 template <typename PlainType,
251  typename CompareType,
252  typename UnsignedType,
253  typename Encoder,
254  typename ValueManager,
255  typename ThreaderType,
256  unsigned int Base>
257 ParallelRadixSortInternal<PlainType,
258  CompareType,
259  UnsignedType,
260  Encoder,
261  ValueManager,
262  ThreaderType,
263  Base>::~ParallelRadixSortInternal()
264 {
265  DeleteAll();
266 }
267 
268 template <typename PlainType,
269  typename CompareType,
270  typename UnsignedType,
271  typename Encoder,
272  typename ValueManager,
273  typename ThreaderType,
274  unsigned int Base>
275 void ParallelRadixSortInternal<PlainType,
276  CompareType,
277  UnsignedType,
278  Encoder,
279  ValueManager,
280  ThreaderType,
281  Base>::DeleteAll()
282 {
283  delete[] tmp_;
284  tmp_ = NULL;
285 
286  for (size_t i = 0; i < num_threads_; ++i)
287  delete[] histo_[i];
288  delete[] histo_;
289  histo_ = NULL;
290 
291  for (size_t i = 0; i < num_threads_; ++i)
292  {
293  for (size_t j = 0; j < 1 << Base; ++j)
294  {
295  delete[] out_buf_[i][j];
296  }
297  delete[] out_buf_n_[i];
298  delete[] out_buf_[i];
299  }
300  delete[] out_buf_;
301  delete[] out_buf_n_;
302  out_buf_ = NULL;
303  out_buf_n_ = NULL;
304 
305  delete[] pos_bgn_;
306  delete[] pos_end_;
307  pos_bgn_ = pos_end_ = NULL;
308 
309  num_elems_ = 0;
310  num_threads_ = 0;
311 }
312 
313 template <typename PlainType,
314  typename CompareType,
315  typename UnsignedType,
316  typename Encoder,
317  typename ValueManager,
318  typename ThreaderType,
319  unsigned int Base>
320 void ParallelRadixSortInternal<PlainType,
321  CompareType,
322  UnsignedType,
323  Encoder,
324  ValueManager,
325  ThreaderType,
326  Base>::Init(PlainType* data,
327  size_t num_elems,
328  const ThreaderType& threader)
329 {
330  (void)data;
331  DeleteAll();
332 
333  threader_ = threader;
334 
335  num_elems_ = num_elems;
336 
337  num_threads_ =
338  utility::GetMaxThreads(num_elems_ * sizeof(PlainType), threader_.GetAvailableCores());
339 
340  tmp_ = new UnsignedType[num_elems_];
341  histo_ = new size_t*[num_threads_];
342  for (size_t i = 0; i < num_threads_; ++i)
343  {
344  histo_[i] = new size_t[1 << Base];
345  }
346 
347  out_buf_ = new UnsignedType**[num_threads_];
348  out_buf_n_ = new size_t*[num_threads_];
349  for (size_t i = 0; i < num_threads_; ++i)
350  {
351  out_buf_[i] = new UnsignedType*[1 << Base];
352  out_buf_n_[i] = new size_t[1 << Base];
353  for (size_t j = 0; j < 1 << Base; ++j)
354  {
355  out_buf_[i][j] = new UnsignedType[kOutBufferSize];
356  }
357  }
358 
359  pos_bgn_ = new size_t[num_threads_];
360  pos_end_ = new size_t[num_threads_];
361 }
362 
363 template <typename PlainType,
364  typename CompareType,
365  typename UnsignedType,
366  typename Encoder,
367  typename ValueManager,
368  typename ThreaderType,
369  unsigned int Base>
370 PlainType* ParallelRadixSortInternal<PlainType,
371  CompareType,
372  UnsignedType,
373  Encoder,
374  ValueManager,
375  ThreaderType,
376  Base>::Sort(PlainType* data, ValueManager* value_manager)
377 {
378  UnsignedType* src = reinterpret_cast<UnsignedType*>(data);
379  UnsignedType* res = SortInternal(src, value_manager);
380  return reinterpret_cast<PlainType*>(res);
381 }
382 
383 template <typename PlainType,
384  typename CompareType,
385  typename UnsignedType,
386  typename Encoder,
387  typename ValueManager,
388  typename ThreaderType,
389  unsigned int Base>
390 void ParallelRadixSortInternal<PlainType,
391  CompareType,
392  UnsignedType,
393  Encoder,
394  ValueManager,
395  ThreaderType,
396  Base>::InitAndSort(PlainType* data,
397  size_t num_elems,
398  const ThreaderType& threader,
399  ValueManager* value_manager)
400 {
401  ParallelRadixSortInternal prs;
402  prs.Init(data, num_elems, threader);
403  const PlainType* res = prs.Sort(data, value_manager);
404  if (res != data)
405  {
406  for (size_t i = 0; i < num_elems; ++i)
407  data[i] = res[i];
408  }
409 }
410 
411 template <typename PlainType,
412  typename CompareType,
413  typename UnsignedType,
414  typename Encoder,
415  typename ValueManager,
416  typename ThreaderType,
417  unsigned int Base>
418 UnsignedType* ParallelRadixSortInternal<PlainType,
419  CompareType,
420  UnsignedType,
421  Encoder,
422  ValueManager,
423  ThreaderType,
424  Base>::SortInternal(UnsignedType* data,
425  ValueManager* value_manager)
426 {
427 
428  value_manager_ = value_manager;
429 
430  // Compute |pos_bgn_| and |pos_end_|
431  ComputeRanges();
432 
433  // Iterate from lower bits to higher bits
434  const size_t bits = CHAR_BIT * sizeof(UnsignedType);
435  UnsignedType *src = data, *dst = tmp_;
436  for (unsigned int b = 0; b < bits; b += Base)
437  {
438  ComputeHistogram(b, src);
439  Scatter(b, src, dst);
440 
441  std::swap(src, dst);
442  value_manager->Next();
443  }
444 
445  return src;
446 }
447 
448 template <typename PlainType,
449  typename CompareType,
450  typename UnsignedType,
451  typename Encoder,
452  typename ValueManager,
453  typename ThreaderType,
454  unsigned int Base>
455 void ParallelRadixSortInternal<PlainType,
456  CompareType,
457  UnsignedType,
458  Encoder,
459  ValueManager,
460  ThreaderType,
461  Base>::ComputeRanges()
462 {
463  pos_bgn_[0] = 0;
464  for (size_t i = 0; i < num_threads_ - 1; ++i)
465  {
466  const size_t t = (num_elems_ - pos_bgn_[i]) / (num_threads_ - i);
467  pos_bgn_[i + 1] = pos_end_[i] = pos_bgn_[i] + t;
468  }
469  pos_end_[num_threads_ - 1] = num_elems_;
470 }
471 
472 template <typename PlainType,
473  typename UnsignedType,
474  typename Encoder,
475  unsigned int Base,
476  typename Function,
477  typename ThreaderType>
478 struct RunTask
479 {
480  RunTask(size_t binary_tree_height,
481  size_t binary_tree_position,
482  Function f,
483  size_t num_elems,
484  size_t num_threads,
485  const ThreaderType& threader)
486  : binary_tree_height_(binary_tree_height)
487  , binary_tree_position_(binary_tree_position)
488  , f_(f)
489  , num_elems_(num_elems)
490  , num_threads_(num_threads)
491  , threader_(threader)
492  {
493  }
494 
495  template <typename ThreaderData = void*>
496  void operator()(ThreaderData tData = nullptr) const
497  {
498  size_t num_nodes_at_current_height = (size_t)pow(2, (double)binary_tree_height_);
499  if (num_threads_ <= num_nodes_at_current_height)
500  {
501  const size_t my_id = binary_tree_position_ - num_nodes_at_current_height;
502  if (my_id < num_threads_)
503  {
504  f_(my_id);
505  }
506  }
507  else
508  {
509  RunTask left(binary_tree_height_ + 1,
510  2 * binary_tree_position_,
511  f_,
512  num_elems_,
513  num_threads_,
514  threader_);
515  RunTask right(binary_tree_height_ + 1,
516  2 * binary_tree_position_ + 1,
517  f_,
518  num_elems_,
519  num_threads_,
520  threader_);
521  threader_.RunChildTasks(tData, left, right);
522  }
523  }
524 
525  size_t binary_tree_height_;
526  size_t binary_tree_position_;
527  Function f_;
528  size_t num_elems_;
529  size_t num_threads_;
530  ThreaderType threader_;
531 };
532 
533 template <typename PlainType,
534  typename CompareType,
535  typename UnsignedType,
536  typename Encoder,
537  typename ValueManager,
538  typename ThreaderType,
539  unsigned int Base>
540 void ParallelRadixSortInternal<PlainType,
541  CompareType,
542  UnsignedType,
543  Encoder,
544  ValueManager,
545  ThreaderType,
546  Base>::ComputeHistogram(unsigned int b, UnsignedType* src)
547 {
548  // Compute local histogram
549 
550  auto lambda = [=](const size_t my_id)
551  {
552  const size_t my_bgn = pos_bgn_[my_id];
553  const size_t my_end = pos_end_[my_id];
554  size_t* my_histo = histo_[my_id];
555 
556  memset(my_histo, 0, sizeof(size_t) * (1 << Base));
557  for (size_t i = my_bgn; i < my_end; ++i)
558  {
559  const UnsignedType s = Encoder::encode(src[i]);
560  UnsignedType t = (s >> b) & ((1 << Base) - 1);
561  compare_internal_.reverse(t);
562  ++my_histo[t];
563  }
564  };
565 
566  using RunTaskType =
567  RunTask<PlainType, UnsignedType, Encoder, Base, std::function<void(size_t)>, ThreaderType>;
568 
569  RunTaskType root(0, 1, lambda, num_elems_, num_threads_, threader_);
570  this->threader_.RunParentTask(root);
571 
572  // Compute global histogram
573  size_t s = 0;
574  for (size_t i = 0; i < 1 << Base; ++i)
575  {
576  for (size_t j = 0; j < num_threads_; ++j)
577  {
578  const size_t t = s + histo_[j][i];
579  histo_[j][i] = s;
580  s = t;
581  }
582  }
583 }
584 
585 template <typename PlainType,
586  typename CompareType,
587  typename UnsignedType,
588  typename Encoder,
589  typename ValueManager,
590  typename ThreaderType,
591  unsigned int Base>
592 void ParallelRadixSortInternal<PlainType,
593  CompareType,
594  UnsignedType,
595  Encoder,
596  ValueManager,
597  ThreaderType,
598  Base>::Scatter(unsigned int b, UnsignedType* src, UnsignedType* dst)
599 {
600 
601  auto lambda = [=](const size_t my_id)
602  {
603  const size_t my_bgn = pos_bgn_[my_id];
604  const size_t my_end = pos_end_[my_id];
605  size_t* my_histo = histo_[my_id];
606  UnsignedType** my_buf = out_buf_[my_id];
607  size_t* my_buf_n = out_buf_n_[my_id];
608 
609  memset(my_buf_n, 0, sizeof(size_t) * (1 << Base));
610  for (size_t i = my_bgn; i < my_end; ++i)
611  {
612  const UnsignedType s = Encoder::encode(src[i]);
613  UnsignedType t = (s >> b) & ((1 << Base) - 1);
614  compare_internal_.reverse(t);
615  my_buf[t][my_buf_n[t]] = src[i];
616  value_manager_->Push(my_id, t, my_buf_n[t], i);
617  ++my_buf_n[t];
618 
619  if (my_buf_n[t] == kOutBufferSize)
620  {
621  size_t p = my_histo[t];
622  for (size_t j = 0; j < kOutBufferSize; ++j)
623  {
624  size_t tp = p++;
625  dst[tp] = my_buf[t][j];
626  }
627  value_manager_->Flush(my_id, t, kOutBufferSize, my_histo[t]);
628 
629  my_histo[t] += kOutBufferSize;
630  my_buf_n[t] = 0;
631  }
632  }
633 
634  // Flush everything
635  for (size_t i = 0; i < 1 << Base; ++i)
636  {
637  size_t p = my_histo[i];
638  for (size_t j = 0; j < my_buf_n[i]; ++j)
639  {
640  size_t tp = p++;
641  dst[tp] = my_buf[i][j];
642  }
643  value_manager_->Flush(my_id, i, my_buf_n[i], my_histo[i]);
644  }
645  };
646 
647  using RunTaskType =
648  RunTask<PlainType, UnsignedType, Encoder, Base, std::function<void(size_t)>, ThreaderType>;
649  RunTaskType root(0, 1, lambda, num_elems_, num_threads_, threader_);
650  this->threader_.RunParentTask(root);
651 }
652 } // namespace internal
653 
654 // Encoders encode signed/unsigned integers and floating point numbers
655 // to correctly ordered unsigned integers
656 namespace encoder
657 {
658 class EncoderDummy
659 {
660 };
661 
662 class EncoderUnsigned
663 {
664 public:
665  template <typename UnsignedType>
666  inline static UnsignedType encode(UnsignedType x)
667  {
668  return x;
669  }
670 };
671 
672 class EncoderSigned
673 {
674 public:
675  template <typename UnsignedType>
676  inline static UnsignedType encode(UnsignedType x)
677  {
678  return x ^ (UnsignedType(1) << (CHAR_BIT * sizeof(UnsignedType) - 1));
679  }
680 };
681 
682 class EncoderDecimal
683 {
684 public:
685  template <typename UnsignedType>
686  inline static UnsignedType encode(UnsignedType x)
687  {
688  static const size_t bits = CHAR_BIT * sizeof(UnsignedType);
689  const UnsignedType a = x >> (bits - 1);
690  const UnsignedType b = (-static_cast<int>(a)) | (UnsignedType(1) << (bits - 1));
691  return x ^ b;
692  }
693 };
694 } // namespace encoder
695 
696 // Value managers are used to generalize the sorting algorithm
697 // to sorting of keys and sorting of pairs
698 namespace value_manager
699 {
700 class DummyValueManager
701 {
702 public:
703  inline void Push(int thread, size_t bucket, size_t num, size_t from_pos)
704  {
705  (void)thread;
706  (void)bucket;
707  (void)num;
708  (void)from_pos;
709  }
710 
711  inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos)
712  {
713  (void)thread;
714  (void)bucket;
715  (void)num;
716  (void)to_pos;
717  }
718 
719  void Next() {}
720 };
721 
722 template <typename PlainType, typename ValueType, int Base>
723 class PairValueManager
724 {
725 public:
726  PairValueManager()
727  : max_elems_(0)
728  , max_threads_(0)
729  , original_(NULL)
730  , tmp_(NULL)
731  , src_(NULL)
732  , dst_(NULL)
733  , out_buf_(NULL)
734  , tmp_size(0)
735  {
736  }
737 
738  ~PairValueManager() { DeleteAll(); }
739 
740  void Init(size_t max_elems, size_t available_threads);
741 
742  void Start(ValueType* original, size_t num_elems)
743  {
744  assert(num_elems <= max_elems_);
745  src_ = original_ = original;
746  dst_ = tmp_;
747  }
748 
749  inline void Push(int thread, size_t bucket, size_t num, size_t from_pos)
750  {
751  out_buf_[thread][bucket][num] = src_[from_pos];
752  }
753 
754  inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos)
755  {
756  for (size_t i = 0; i < num; ++i)
757  {
758  dst_[to_pos++] = out_buf_[thread][bucket][i];
759  }
760  }
761 
762  void Next() { std::swap(src_, dst_); }
763 
764  ValueType* GetResult() { return src_; }
765 
766 private:
767  size_t max_elems_;
768  int max_threads_;
769 
770  static constexpr size_t kOutBufferSize = internal::kOutBufferSize;
771  ValueType *original_, *tmp_;
772  ValueType *src_, *dst_;
773  ValueType*** out_buf_;
774  viskores::UInt64 tmp_size;
775 
776  void DeleteAll();
777 };
778 
779 template <typename PlainType, typename ValueType, int Base>
780 void PairValueManager<PlainType, ValueType, Base>::Init(size_t max_elems, size_t available_cores)
781 {
782  DeleteAll();
783 
784  max_elems_ = max_elems;
785  max_threads_ = utility::GetMaxThreads(max_elems_ * sizeof(PlainType), available_cores);
786 
787  { // This allocation can be quite large, so log it:
788  tmp_size = max_elems * sizeof(ValueType);
790  "Allocating working memory for radix sort-by-key: %s.",
791  viskores::cont::GetSizeString(tmp_size).c_str());
792  tmp_ = new ValueType[max_elems];
793  }
794 
795  out_buf_ = new ValueType**[max_threads_];
796  for (int i = 0; i < max_threads_; ++i)
797  {
798  out_buf_[i] = new ValueType*[1 << Base];
799  for (size_t j = 0; j < 1 << Base; ++j)
800  {
801  out_buf_[i][j] = new ValueType[kOutBufferSize];
802  }
803  }
804 }
805 
806 template <typename PlainType, typename ValueType, int Base>
807 void PairValueManager<PlainType, ValueType, Base>::DeleteAll()
808 {
809  { // This allocation can be quite large, so log it:
811  "Freeing working memory for radix sort-by-key: %s.",
812  viskores::cont::GetSizeString(tmp_size).c_str());
813  delete[] tmp_;
814  tmp_ = NULL;
815  tmp_size = 0;
816  }
817 
818  for (int i = 0; i < max_threads_; ++i)
819  {
820  for (size_t j = 0; j < 1 << Base; ++j)
821  {
822  delete[] out_buf_[i][j];
823  }
824  delete[] out_buf_[i];
825  }
826  delete[] out_buf_;
827  out_buf_ = NULL;
828 
829  max_elems_ = 0;
830  max_threads_ = 0;
831 }
832 } // namespace value_manager
833 
834 // Frontend class for sorting keys
835 template <typename ThreaderType,
836  typename PlainType,
837  typename CompareType,
838  typename UnsignedType = PlainType,
839  typename Encoder = encoder::EncoderDummy,
840  unsigned int Base = 8>
841 class KeySort
842 {
843  using DummyValueManager = value_manager::DummyValueManager;
844  using Internal = internal::ParallelRadixSortInternal<PlainType,
845  CompareType,
846  UnsignedType,
847  Encoder,
848  DummyValueManager,
849  ThreaderType,
850  Base>;
851 
852 public:
853  void InitAndSort(PlainType* data,
854  size_t num_elems,
855  const ThreaderType& threader,
856  const CompareType& comp)
857  {
858  (void)comp;
859  DummyValueManager dvm;
860  Internal::InitAndSort(data, num_elems, threader, &dvm);
861  }
862 };
863 
864 // Frontend class for sorting pairs
865 template <typename ThreaderType,
866  typename PlainType,
867  typename ValueType,
868  typename CompareType,
869  typename UnsignedType = PlainType,
870  typename Encoder = encoder::EncoderDummy,
871  int Base = 8>
872 class PairSort
873 {
874  using ValueManager = value_manager::PairValueManager<PlainType, ValueType, Base>;
875  using Internal = internal::ParallelRadixSortInternal<PlainType,
876  CompareType,
877  UnsignedType,
878  Encoder,
879  ValueManager,
880  ThreaderType,
881  Base>;
882 
883 public:
884  void InitAndSort(PlainType* keys,
885  ValueType* vals,
886  size_t num_elems,
887  const ThreaderType& threader,
888  const CompareType& comp)
889  {
890  (void)comp;
891  ValueManager vm;
892  vm.Init(num_elems, threader.GetAvailableCores());
893  vm.Start(vals, num_elems);
894  Internal::InitAndSort(keys, num_elems, threader, &vm);
895  ValueType* res_vals = vm.GetResult();
896  if (res_vals != vals)
897  {
898  for (size_t i = 0; i < num_elems; ++i)
899  {
900  vals[i] = res_vals[i];
901  }
902  }
903  }
904 
905 private:
906 };
907 
908 #define KEY_SORT_CASE(plain_type, compare_type, unsigned_type, encoder_type) \
909  template <typename ThreaderType> \
910  class KeySort<ThreaderType, plain_type, compare_type> \
911  : public KeySort<ThreaderType, \
912  plain_type, \
913  compare_type, \
914  unsigned_type, \
915  encoder::Encoder##encoder_type> \
916  { \
917  }; \
918  template <typename V, typename ThreaderType> \
919  class PairSort<ThreaderType, plain_type, V, compare_type> \
920  : public PairSort<ThreaderType, \
921  plain_type, \
922  V, \
923  compare_type, \
924  unsigned_type, \
925  encoder::Encoder##encoder_type> \
926  { \
927  };
928 
929 // Unsigned integers
930 KEY_SORT_CASE(unsigned int, std::less<unsigned int>, unsigned int, Unsigned);
931 KEY_SORT_CASE(unsigned int, std::greater<unsigned int>, unsigned int, Unsigned);
932 KEY_SORT_CASE(unsigned short int, std::less<unsigned short int>, unsigned short int, Unsigned);
933 KEY_SORT_CASE(unsigned short int, std::greater<unsigned short int>, unsigned short int, Unsigned);
934 KEY_SORT_CASE(unsigned long int, std::less<unsigned long int>, unsigned long int, Unsigned);
935 KEY_SORT_CASE(unsigned long int, std::greater<unsigned long int>, unsigned long int, Unsigned);
936 KEY_SORT_CASE(unsigned long long int,
937  std::less<unsigned long long int>,
938  unsigned long long int,
939  Unsigned);
940 KEY_SORT_CASE(unsigned long long int,
941  std::greater<unsigned long long int>,
942  unsigned long long int,
943  Unsigned);
944 
945 // Unsigned char
946 KEY_SORT_CASE(unsigned char, std::less<unsigned char>, unsigned char, Unsigned);
947 KEY_SORT_CASE(unsigned char, std::greater<unsigned char>, unsigned char, Unsigned);
948 KEY_SORT_CASE(char16_t, std::less<char16_t>, uint16_t, Unsigned);
949 KEY_SORT_CASE(char16_t, std::greater<char16_t>, uint16_t, Unsigned);
950 KEY_SORT_CASE(char32_t, std::less<char32_t>, uint32_t, Unsigned);
951 KEY_SORT_CASE(char32_t, std::greater<char32_t>, uint32_t, Unsigned);
952 KEY_SORT_CASE(wchar_t, std::less<wchar_t>, uint32_t, Unsigned);
953 KEY_SORT_CASE(wchar_t, std::greater<wchar_t>, uint32_t, Unsigned);
954 
955 // Signed integers
956 KEY_SORT_CASE(char, std::less<char>, unsigned char, Signed);
957 KEY_SORT_CASE(char, std::greater<char>, unsigned char, Signed);
958 KEY_SORT_CASE(short, std::less<short>, unsigned short, Signed);
959 KEY_SORT_CASE(short, std::greater<short>, unsigned short, Signed);
960 KEY_SORT_CASE(int, std::less<int>, unsigned int, Signed);
961 KEY_SORT_CASE(int, std::greater<int>, unsigned int, Signed);
962 KEY_SORT_CASE(long, std::less<long>, unsigned long, Signed);
963 KEY_SORT_CASE(long, std::greater<long>, unsigned long, Signed);
964 KEY_SORT_CASE(long long, std::less<long long>, unsigned long long, Signed);
965 KEY_SORT_CASE(long long, std::greater<long long>, unsigned long long, Signed);
966 
967 // |signed char| and |char| are treated as different types
968 KEY_SORT_CASE(signed char, std::less<signed char>, unsigned char, Signed);
969 KEY_SORT_CASE(signed char, std::greater<signed char>, unsigned char, Signed);
970 
971 // Floating point numbers
972 KEY_SORT_CASE(float, std::less<float>, uint32_t, Decimal);
973 KEY_SORT_CASE(float, std::greater<float>, uint32_t, Decimal);
974 KEY_SORT_CASE(double, std::less<double>, uint64_t, Decimal);
975 KEY_SORT_CASE(double, std::greater<double>, uint64_t, Decimal);
976 
977 #undef KEY_SORT_CASE
978 
979 template <typename T, typename CompareType>
980 struct run_kx_radix_sort_keys
981 {
982  static void run(T* data, size_t num_elems, const CompareType& comp)
983  {
984  std::sort(data, data + num_elems, comp);
985  }
986 };
987 
988 #define KX_SORT_KEYS(key_type) \
989  template <> \
990  struct run_kx_radix_sort_keys<key_type, std::less<key_type>> \
991  { \
992  static void run(key_type* data, size_t num_elems, const std::less<key_type>& comp) \
993  { \
994  (void)comp; \
995  kx::radix_sort(data, data + num_elems); \
996  } \
997  };
998 
999 KX_SORT_KEYS(unsigned short int);
1000 KX_SORT_KEYS(int);
1001 KX_SORT_KEYS(unsigned int);
1002 KX_SORT_KEYS(long int);
1003 KX_SORT_KEYS(unsigned long int);
1004 KX_SORT_KEYS(long long int);
1005 KX_SORT_KEYS(unsigned long long int);
1006 KX_SORT_KEYS(unsigned char);
1007 
1008 #undef KX_SORT_KEYS
1009 
1010 template <typename T, typename CompareType>
1011 bool use_serial_sort_keys(T* data, size_t num_elems, const CompareType& comp)
1012 {
1013  size_t total_bytes = (num_elems) * sizeof(T);
1014  if (total_bytes < MIN_BYTES_FOR_PARALLEL)
1015  {
1016  run_kx_radix_sort_keys<T, CompareType>::run(data, num_elems, comp);
1017  return true;
1018  }
1019  return false;
1020 }
1021 
1022 // Generate radix sort interfaces for key and key value sorts.
1023 #define VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(threader_type, key_type) \
1024  VISKORES_CONT_EXPORT void parallel_radix_sort_key_values( \
1025  key_type* keys, viskores::Id* vals, size_t num_elems, const std::greater<key_type>& comp) \
1026  { \
1027  using namespace viskores::cont::internal::radix; \
1028  PairSort<threader_type, key_type, viskores::Id, std::greater<key_type>> ps; \
1029  ps.InitAndSort(keys, vals, num_elems, threader_type(), comp); \
1030  } \
1031  VISKORES_CONT_EXPORT void parallel_radix_sort_key_values( \
1032  key_type* keys, viskores::Id* vals, size_t num_elems, const std::less<key_type>& comp) \
1033  { \
1034  using namespace viskores::cont::internal::radix; \
1035  PairSort<threader_type, key_type, viskores::Id, std::less<key_type>> ps; \
1036  ps.InitAndSort(keys, vals, num_elems, threader_type(), comp); \
1037  } \
1038  VISKORES_CONT_EXPORT void parallel_radix_sort( \
1039  key_type* data, size_t num_elems, const std::greater<key_type>& comp) \
1040  { \
1041  using namespace viskores::cont::internal::radix; \
1042  if (!use_serial_sort_keys(data, num_elems, comp)) \
1043  { \
1044  KeySort<threader_type, key_type, std::greater<key_type>> ks; \
1045  ks.InitAndSort(data, num_elems, threader_type(), comp); \
1046  } \
1047  } \
1048  VISKORES_CONT_EXPORT void parallel_radix_sort( \
1049  key_type* data, size_t num_elems, const std::less<key_type>& comp) \
1050  { \
1051  using namespace viskores::cont::internal::radix; \
1052  if (!use_serial_sort_keys(data, num_elems, comp)) \
1053  { \
1054  KeySort<threader_type, key_type, std::less<key_type>> ks; \
1055  ks.InitAndSort(data, num_elems, threader_type(), comp); \
1056  } \
1057  }
1058 
1059 #define VISKORES_INSTANTIATE_RADIX_SORT_FOR_THREADER(ThreaderType) \
1060  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, short int) \
1061  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned short int) \
1062  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, int) \
1063  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned int) \
1064  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, long int) \
1065  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned long int) \
1066  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, long long int) \
1067  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned long long int) \
1068  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned char) \
1069  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, signed char) \
1070  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, char) \
1071  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, char16_t) \
1072  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, char32_t) \
1073  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, wchar_t) \
1074  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, float) \
1075  VISKORES_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, double)
1076 
1078 }
1079 }
1080 }
1081 } // end namespace viskores::cont::internal::radix
1082 
1083 #endif // viskores_cont_internal_ParallelRadixSort_h
Types.h
KX_SORT_KEYS
#define KX_SORT_KEYS(key_type)
Definition: ParallelRadixSort.h:988
VISKORES_THIRDPARTY_POST_INCLUDE
#define VISKORES_THIRDPARTY_POST_INCLUDE
Definition: Configure.h:200
VISKORES_LOG_F
#define VISKORES_LOG_F(level,...)
Writes a message using printf syntax to the indicated log level.
Definition: Logging.h:217
KXSort.h
viskores
Groups connected points that have the same field value.
Definition: Atomic.h:27
viskores::cont::LogLevel::MemCont
@ MemCont
Host-side resource allocations/frees (e.g. ArrayHandle control buffers).
viskores::UInt64
unsigned long long UInt64
Base type to use for 64-bit signed integer numbers.
Definition: Types.h:215
Logging.h
Logging utilities.
KEY_SORT_CASE
#define KEY_SORT_CASE(plain_type, compare_type, unsigned_type, encoder_type)
Definition: ParallelRadixSort.h:908
viskores::cont::GetSizeString
std::string GetSizeString(viskores::UInt64 bytes, int prec=2)
Returns "%1 (%2 bytes)" where %1 is the result from GetHumanReadableSize and %2 is the exact number o...
VISKORES_THIRDPARTY_PRE_INCLUDE
#define VISKORES_THIRDPARTY_PRE_INCLUDE
Definition: Configure.h:199
ParallelRadixSortInterface.h