ThrustPatches.h
//============================================================================
// The contents of this file are covered by the Viskores license. See
// LICENSE.txt for details.
//
// By contributing to this file, all contributors agree to the Developer
// Certificate of Origin Version 1.1 (DCO 1.1) as stated in DCO.txt.
//============================================================================

//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
//
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#ifndef viskores_exec_cuda_internal_ThrustPatches_h
#define viskores_exec_cuda_internal_ThrustPatches_h

#include <viskores/Types.h>

#ifdef VISKORES_ENABLE_CUDA

// Needed so we can conditionally include components
#include <thrust/version.h>

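// THRUST_VERSION encodes a release as major * 100000 + minor * 100 + subminor,
// so 100900 below means Thrust 1.9.0 and 100906 means Thrust 1.9.6.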
#if THRUST_VERSION >= 100900 && THRUST_VERSION < 100906
// For Thrust 1.9.0 through 1.9.5 (CUDA 9.X+), aligned_reinterpret_cast has a
// bug where it is not marked as __host__ __device__. To fix this we add new
// overloads for void* (which is what everyone calls with) that carry the
// correct markup.
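// A minimal sketch of the call pattern this patches (names are illustrative,
// not Thrust's exact internals): Thrust casts raw temporary storage back to a
// typed pointer from device code, roughly
//
//   void* raw = /* temporary storage */;
//   T* typed = thrust::detail::aligned_reinterpret_cast<T*>(raw);
//
// Without __host__ __device__ markup on the cast, nvcc warns about (or
// rejects) such calls from __device__ functions; the overloads below carry
// the markup.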
namespace thrust
{
namespace detail
{
// Just in case somebody else also has this fix for primitive types, only
// declare (do not define) the primary template.
template <typename T, typename U>
T aligned_reinterpret_cast(U u);

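// Each ALIGN_RE_* macro below stamps out fully specialized, correctly marked
// overloads for one element type. The trailing `struct SwallowSemicolon`
// (declared, never defined) exists only to consume the semicolon at the
// expansion site, so that
//
//   ALIGN_RE_T(bool);
//
// expands to the specializations followed by the harmless re-declaration
// `struct SwallowSemicolon;` instead of leaving a stray semicolon at
// namespace scope.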
#define ALIGN_RE_T(RT) \
  template <> \
  inline __host__ __device__ RT* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<RT*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ RT* aligned_reinterpret_cast(viskores::UInt8* u) \
  { \
    return reinterpret_cast<RT*>(reinterpret_cast<void*>(u)); \
  } \
  struct SwallowSemicolon

#define ALIGN_RE_VEC(RT) \
  template <> \
  inline __host__ __device__ viskores::Vec<RT, 2>* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<viskores::Vec<RT, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<RT, 3>* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<viskores::Vec<RT, 3>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<RT, 4>* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<viskores::Vec<RT, 4>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<viskores::Vec<RT, 3>, 2>* aligned_reinterpret_cast( \
    void* u) \
  { \
    return reinterpret_cast<viskores::Vec<viskores::Vec<RT, 3>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<viskores::Vec<RT, 9>, 2>* aligned_reinterpret_cast( \
    void* u) \
  { \
    return reinterpret_cast<viskores::Vec<viskores::Vec<RT, 9>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<RT, 2>* aligned_reinterpret_cast(viskores::UInt8* u) \
  { \
    return reinterpret_cast<viskores::Vec<RT, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<RT, 3>* aligned_reinterpret_cast(viskores::UInt8* u) \
  { \
    return reinterpret_cast<viskores::Vec<RT, 3>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<RT, 4>* aligned_reinterpret_cast(viskores::UInt8* u) \
  { \
    return reinterpret_cast<viskores::Vec<RT, 4>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<viskores::Vec<RT, 2>, 2>* aligned_reinterpret_cast( \
    viskores::UInt8* u) \
  { \
    return reinterpret_cast<viskores::Vec<viskores::Vec<RT, 2>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<viskores::Vec<RT, 3>, 2>* aligned_reinterpret_cast( \
    viskores::UInt8* u) \
  { \
    return reinterpret_cast<viskores::Vec<viskores::Vec<RT, 3>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<viskores::Vec<RT, 4>, 2>* aligned_reinterpret_cast( \
    viskores::UInt8* u) \
  { \
    return reinterpret_cast<viskores::Vec<viskores::Vec<RT, 4>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Vec<viskores::Vec<RT, 9>, 2>* aligned_reinterpret_cast( \
    viskores::UInt8* u) \
  { \
    return reinterpret_cast<viskores::Vec<viskores::Vec<RT, 9>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  struct SwallowSemicolon

#define ALIGN_RE_PAIR(T, U) \
  template <> \
  inline __host__ __device__ viskores::Pair<T, U>* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<viskores::Pair<T, U>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ viskores::Pair<T, U>* aligned_reinterpret_cast(viskores::UInt8* u) \
  { \
    return reinterpret_cast<viskores::Pair<T, U>*>(reinterpret_cast<void*>(u)); \
  } \
  struct SwallowSemicolon

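// Assumed intent of the guard below: define VISKORES_DONT_FIX_THRUST to skip
// the primitive-type specializations, e.g. when another library in the same
// build already supplies this fix.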
#ifndef VISKORES_DONT_FIX_THRUST
ALIGN_RE_T(bool);
ALIGN_RE_T(char);
ALIGN_RE_T(viskores::Int8);
ALIGN_RE_T(viskores::UInt8);
ALIGN_RE_T(viskores::Int16);
ALIGN_RE_T(viskores::UInt16);
ALIGN_RE_T(viskores::Int32);
ALIGN_RE_T(viskores::UInt32);
// Needed for VTK. We don't need long long, since those are used for [U]Int64.
ALIGN_RE_T(long);
ALIGN_RE_T(unsigned long);
ALIGN_RE_T(viskores::Int64);
ALIGN_RE_T(viskores::UInt64);
ALIGN_RE_T(viskores::Float32);
ALIGN_RE_T(viskores::Float64);
#endif

ALIGN_RE_VEC(char);
ALIGN_RE_VEC(viskores::Int8);
ALIGN_RE_VEC(viskores::UInt8);
ALIGN_RE_VEC(viskores::Int16);
ALIGN_RE_VEC(viskores::UInt16);
ALIGN_RE_VEC(viskores::Int32);
ALIGN_RE_VEC(viskores::UInt32);
// Needed for VTK. We don't need long long, since those are used for [U]Int64.
ALIGN_RE_VEC(long);
ALIGN_RE_VEC(unsigned long);
ALIGN_RE_VEC(viskores::Int64);
ALIGN_RE_VEC(viskores::UInt64);
ALIGN_RE_VEC(viskores::Float32);
ALIGN_RE_VEC(viskores::Float64);

ALIGN_RE_PAIR(viskores::Int32, viskores::Int32);
ALIGN_RE_PAIR(viskores::Int32, viskores::Int64);
ALIGN_RE_PAIR(viskores::Int32, viskores::Float32);
ALIGN_RE_PAIR(viskores::Int32, viskores::Float64);

ALIGN_RE_PAIR(viskores::Int64, viskores::Int32);
ALIGN_RE_PAIR(viskores::Int64, viskores::Int64);
ALIGN_RE_PAIR(viskores::Int64, viskores::Float32);
ALIGN_RE_PAIR(viskores::Int64, viskores::Float64);

#undef ALIGN_RE_T
#undef ALIGN_RE_VEC
#undef ALIGN_RE_PAIR
} // namespace detail
} // namespace thrust
#endif //THRUST_VERSION >= 100900 && THRUST_VERSION < 100906

#if (THRUST_VERSION >= 100904) && (THRUST_VERSION < 100909)
// For Thrust 1.9.4 through 1.9.8 (CUDA 10.1+), the stateless_resource_allocator
// has a bug where its members are not marked as __host__ __device__ and
// __thrust_exec_check_disable__. To fix this we add a partial specialization on
// cuda::memory_resource (which is the instantiation everyone uses) with the
// correct markup.
// See: https://github.com/thrust/thrust/issues/972
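// A rough sketch of the failure mode this avoids (an illustration based on
// the issue above, not code from Thrust): default-constructing the allocator
// from __host__ __device__ code, e.g.
//
//   thrust::mr::stateless_resource_allocator<int,
//     thrust::system::cuda::memory_resource> alloc; // calls get_global_resource
//
// triggers "calling a __host__ function from a __host__ __device__ function"
// warnings unless the constructors carry the markup added below.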
#include <thrust/mr/allocator.h>
#include <thrust/system/cuda/memory_resource.h>

namespace thrust
{
namespace mr
{

template <typename T>
class stateless_resource_allocator<T, ::thrust::system::cuda::memory_resource>
  : public thrust::mr::allocator<T, ::thrust::system::cuda::memory_resource>
{
  typedef ::thrust::system::cuda::memory_resource Upstream;
  typedef thrust::mr::allocator<T, Upstream> base;

public:
  template <typename U>
  struct rebind
  {
    typedef stateless_resource_allocator<U, Upstream> other;
  };

  __thrust_exec_check_disable__ // modification, required to suppress warnings
  __host__ __device__           // modification, required to suppress warnings
  stateless_resource_allocator()
    : base(get_global_resource<Upstream>())
  {
  }

  __host__ __device__ stateless_resource_allocator(const stateless_resource_allocator& other)
    : base(other)
  {
  }

  template <typename U>
  __host__ __device__
  stateless_resource_allocator(const stateless_resource_allocator<U, Upstream>& other)
    : base(other)
  {
  }

  __host__ __device__ ~stateless_resource_allocator() {}
};
} // namespace mr
} // namespace thrust
#endif //(THRUST_VERSION >= 100904) && (THRUST_VERSION < 100909)


#if THRUST_VERSION < 100900
// For Thrust 1.8.0 - 1.8.2, inclusive_scan has a bug when accumulating values
// with a binary operator that states it is not commutative.
// For more complex value types, we patch thrust/bulk with the fix found in
// issue: https://github.com/thrust/thrust/issues/692
//
// This specialization needs to be included before ANY thrust includes;
// otherwise other device code inside thrust that calls it will not see it.
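// An illustrative case (assumption for exposition): with a non-commutative
// operator such as subtraction, binary_op(a, b) == a - b differs from
// binary_op(b, a) == b - a, so a scan implementation that swaps operands while
// combining produces wrong results. The patched destructive_accumulate_n below
// always keeps the earlier element as the left operand.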
namespace viskores
{
namespace exec
{
namespace cuda
{
namespace internal
{
// Forward declaration of WrappedBinaryOperator
template <typename T, typename F>
class WrappedBinaryOperator;
}
}
}
} //namespace viskores::exec::cuda::internal

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace bulk_
{
namespace detail
{
namespace accumulate_detail
{
template <typename ConcurrentGroup,
          typename RandomAccessIterator,
          typename Size,
          typename T,
          typename F>
__device__ T
destructive_accumulate_n(ConcurrentGroup& g,
                         RandomAccessIterator first,
                         Size n,
                         T init,
                         viskores::exec::cuda::internal::WrappedBinaryOperator<T, F> binary_op)
{
  using size_type = typename ConcurrentGroup::size_type;

  size_type tid = g.this_exec.index();

  // Each thread loads one element; threads past the end keep `init`.
  T x = init;
  if (tid < n)
  {
    x = first[tid];
  }

  g.wait();

  // Hillis-Steele style inclusive scan: each round combines the value
  // `offset` slots to the left with the current value. The earlier element
  // stays on the left of binary_op, which preserves correctness for
  // non-commutative operators.
  for (size_type offset = 1; offset < g.size(); offset += offset)
  {
    if (tid >= offset && tid - offset < n)
    {
      x = binary_op(first[tid - offset], x);
    }

    g.wait();

    if (tid < n)
    {
      first[tid] = x;
    }

    g.wait();
  }

  // The group-wide result is init combined with the last scanned element,
  // again keeping operand order.
  T result = binary_op(init, first[n - 1]);

  g.wait();

  return result;
}
}
}
} //namespace bulk_::detail::accumulate_detail
}
}
}
} //namespace thrust::system::cuda::detail
#endif //THRUST_VERSION < 100900

#endif //CUDA enabled

#endif //viskores_exec_cuda_internal_ThrustPatches_h