gpu_basic.cu

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// workaround issue between gcc >= 4.7 and cuda 5.5
#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
  #undef _GLIBCXX_ATOMIC_BUILTINS
  #undef _GLIBCXX_USE_INT128
#endif

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

#include "main.h"
#include "gpu_common.h"

// Check that dense modules can be properly parsed by nvcc.
#include <Eigen/Dense>
// struct Foo{
//   EIGEN_DEVICE_FUNC
//   void operator()(int i, const float* mats, float* vecs) const {
//     using namespace Eigen;
//     // Matrix3f M(data);
//     // Vector3f x(data+9);
//     // Map<Vector3f>(data+9) = M.inverse() * x;
//     Matrix3f M(mats+i/16);
//     Vector3f x(vecs+i*3);
//     // using std::min;
//     // using std::sqrt;
//     Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2; // / x.dot(x); //(M.inverse() * x) / x.x();
//     // x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
//   }
// };
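
// Coefficient-wise arithmetic on fixed-size vectors and arrays.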
template<typename T>
struct coeff_wise {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    T x1(in+i);
    T x2(in+i+1);
    T x3(in+i+2);
    Map<T> res(out+i*T::MaxSizeAtCompileTime);

    res.array() += (in[0] * x1 + x2).array() * x3.array();
  }
};
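
// Coefficient-wise complex square root; thread 0 additionally exercises
// special inputs (signed zeros, infinities, NaNs).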
template<typename T>
struct complex_sqrt {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef typename T::Scalar ComplexType;
    typedef typename T::Scalar::value_type ValueType;
    const int num_special_inputs = 18;

    if (i == 0) {
      const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
      typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
      SpecialInputs special_in;
      special_in.setZero();

      int idx = 0;
      // Use floating-point literals here: the integer literal -0 is identical
      // to 0 and would not produce a negative zero.
      special_in[idx++] = ComplexType(0.0, 0.0);
      special_in[idx++] = ComplexType(-0.0, 0.0);
      special_in[idx++] = ComplexType(0.0, -0.0);
      special_in[idx++] = ComplexType(-0.0, -0.0);
      // GCC's fallback sqrt implementation fails for inf inputs.
      // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
      // clang includes the GCC header (which temporarily disables
      // _GLIBCXX_USE_C99_COMPLEX).
#if !defined(_GLIBCXX_COMPLEX) || \
    (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
      const ValueType inf = std::numeric_limits<ValueType>::infinity();
      special_in[idx++] = ComplexType(1.0, inf);
      special_in[idx++] = ComplexType(nan, inf);
      special_in[idx++] = ComplexType(1.0, -inf);
      special_in[idx++] = ComplexType(nan, -inf);
      special_in[idx++] = ComplexType(-inf, 1.0);
      special_in[idx++] = ComplexType(inf, 1.0);
      special_in[idx++] = ComplexType(-inf, -1.0);
      special_in[idx++] = ComplexType(inf, -1.0);
      special_in[idx++] = ComplexType(-inf, nan);
      special_in[idx++] = ComplexType(inf, nan);
#endif
      special_in[idx++] = ComplexType(1.0, nan);
      special_in[idx++] = ComplexType(nan, 1.0);
      special_in[idx++] = ComplexType(nan, -1.0);
      special_in[idx++] = ComplexType(nan, nan);

      Map<SpecialInputs> special_out(out);
      special_out = special_in.cwiseSqrt();
    }

    T x1(in + i);
    Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
    res = x1.cwiseSqrt();
  }
};
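
// Scalar and coefficient-wise operators on std::complex, including the
// mixed complex/real variants.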
template<typename T>
struct complex_operators {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef typename T::Scalar ComplexType;
    typedef typename T::Scalar::value_type ValueType;
    const int num_scalar_operators = 24;
    const int num_vector_operators = 23;  // no unary + operator.
    int out_idx = i * (num_scalar_operators + num_vector_operators * T::MaxSizeAtCompileTime);

    // Scalar operators.
    const ComplexType a = in[i];
    const ComplexType b = in[i + 1];

    out[out_idx++] = +a;
    out[out_idx++] = -a;

    out[out_idx++] = a + b;
    out[out_idx++] = a + numext::real(b);
    out[out_idx++] = numext::real(a) + b;
    out[out_idx++] = a - b;
    out[out_idx++] = a - numext::real(b);
    out[out_idx++] = numext::real(a) - b;
    out[out_idx++] = a * b;
    out[out_idx++] = a * numext::real(b);
    out[out_idx++] = numext::real(a) * b;
    out[out_idx++] = a / b;
    out[out_idx++] = a / numext::real(b);
    out[out_idx++] = numext::real(a) / b;

    out[out_idx] = a; out[out_idx++] += b;
    out[out_idx] = a; out[out_idx++] -= b;
    out[out_idx] = a; out[out_idx++] *= b;
    out[out_idx] = a; out[out_idx++] /= b;

    const ComplexType true_value = ComplexType(ValueType(1), ValueType(0));
    const ComplexType false_value = ComplexType(ValueType(0), ValueType(0));
    out[out_idx++] = (a == b ? true_value : false_value);
    out[out_idx++] = (a == numext::real(b) ? true_value : false_value);
    out[out_idx++] = (numext::real(a) == b ? true_value : false_value);
    out[out_idx++] = (a != b ? true_value : false_value);
    out[out_idx++] = (a != numext::real(b) ? true_value : false_value);
    out[out_idx++] = (numext::real(a) != b ? true_value : false_value);

    // Vector versions.
    T x1(in + i);
    T x2(in + i + 1);
    const int res_size = T::MaxSizeAtCompileTime * num_scalar_operators;
    const int size = T::MaxSizeAtCompileTime;
    int block_idx = 0;

    Map<VectorX<ComplexType>> res(out + out_idx, res_size);
    res.segment(block_idx, size) = -x1;
    block_idx += size;
    res.segment(block_idx, size) = x1 + x2;
    block_idx += size;
    res.segment(block_idx, size) = x1 + x2.real();
    block_idx += size;
    res.segment(block_idx, size) = x1.real() + x2;
    block_idx += size;
    res.segment(block_idx, size) = x1 - x2;
    block_idx += size;
    res.segment(block_idx, size) = x1 - x2.real();
    block_idx += size;
    res.segment(block_idx, size) = x1.real() - x2;
    block_idx += size;
    res.segment(block_idx, size) = x1.array() * x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1.array() * x2.real().array();
    block_idx += size;
    res.segment(block_idx, size) = x1.real().array() * x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1.array() / x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1.array() / x2.real().array();
    block_idx += size;
    res.segment(block_idx, size) = x1.real().array() / x2.array();
    block_idx += size;

    res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
    block_idx += size;
    res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
    block_idx += size;
    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() *= x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array();
    block_idx += size;

    // Equality comparisons are currently not functional on device
    // (std::equal_to<T> is host-only).
    // const T true_vector = T::Constant(true_value);
    // const T false_vector = T::Constant(false_value);
    // res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector);
    // block_idx += size;
    // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector);
    // block_idx += size;
    // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector);
    // block_idx += size;
    // res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector);
    // block_idx += size;
    // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector);
    // block_idx += size;
    // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector);
    // block_idx += size;
  }
};
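
// Replication of an array: full (2x2), column-wise, and row-wise.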
template<typename T>
struct replicate {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    T x1(in+i);
    int step   = x1.size() * 4;
    int stride = 3 * step;

    typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
    MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
    MapType(out+i*stride+1*step, x1.rows()*3, x1.cols())   = in[i] * x1.colwise().replicate(3);
    MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3)   = in[i] * x1.rowwise().replicate(3);
  }
};
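
// Dynamic allocation with new/delete in device code.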
template<typename T>
struct alloc_new_delete {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    int offset = 2*i*T::MaxSizeAtCompileTime;
    T* x = new T(in + offset);
    Eigen::Map<T> u(out + offset);
    u = *x;
    delete x;

    offset += T::MaxSizeAtCompileTime;
    T* y = new T[1];
    y[0] = T(in + offset);
    Eigen::Map<T> v(out + offset);
    v = y[0];
    delete[] y;
  }
};
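
// Full and partial reductions: min/max coefficient, sum, product, norms.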
template<typename T>
struct redux {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    int N = 10;  // stride between per-thread output blocks (9 of the 10 slots are written)
    T x1(in+i);
    out[i*N+0] = x1.minCoeff();
    out[i*N+1] = x1.maxCoeff();
    out[i*N+2] = x1.sum();
    out[i*N+3] = x1.prod();
    out[i*N+4] = x1.matrix().squaredNorm();
    out[i*N+5] = x1.matrix().norm();
    out[i*N+6] = x1.colwise().sum().maxCoeff();
    out[i*N+7] = x1.rowwise().maxCoeff().sum();
    out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
  }
};
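
// Matrix-matrix and matrix-vector products.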
template<typename T1, typename T2>
struct prod_test {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
  {
    using namespace Eigen;
    typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
    T1 x1(in+i);
    T2 x2(in+i+1);
    Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
    res += in[i] * x1 * x2;
  }
};
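
// Extraction of the matrix diagonal.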
template<typename T1, typename T2>
struct diagonal {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
  {
    using namespace Eigen;
    T1 x1(in+i);
    Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
    res += x1.diagonal();
  }
};
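
// Eigenvalues of a self-adjoint matrix using the direct (closed-form) solver.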
template<typename T>
struct eigenvalues_direct {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
    T M(in+i);
    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
    T A = M*M.adjoint();
    SelfAdjointEigenSolver<T> eig;
    eig.computeDirect(A);
    res = eig.eigenvalues();
  }
};
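
// Eigenvalues of a self-adjoint matrix using the iterative solver.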
template<typename T>
struct eigenvalues {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
    T M(in+i);
    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
    T A = M*M.adjoint();
    SelfAdjointEigenSolver<T> eig;
    eig.compute(A);
    res = eig.eigenvalues();
  }
};
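
// Inverse of small fixed-size matrices.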
template<typename T>
struct matrix_inverse {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    T M(in+i);
    Map<T> res(out+i*T::MaxSizeAtCompileTime);
    res = M.inverse();
  }
};
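
// numext::numeric_limits must be usable from device code.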
template<typename T>
struct numeric_limits_test {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    EIGEN_UNUSED_VARIABLE(in)
    int out_idx = i * 5;
    out[out_idx++] = numext::numeric_limits<float>::epsilon();
    out[out_idx++] = (numext::numeric_limits<float>::max)();
    out[out_idx++] = (numext::numeric_limits<float>::min)();
    out[out_idx++] = numext::numeric_limits<float>::infinity();
    out[out_idx++] = numext::numeric_limits<float>::quiet_NaN();
  }
};
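
// Coefficient-wise comparison that treats matching NaNs as equal and accepts
// approximately equal values elsewhere.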
template<typename Type1, typename Type2>
bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's types only
{
  if (a.rows() != b.rows()) {
    return false;
  }
  if (a.cols() != b.cols()) {
    return false;
  }
  for (Index r = 0; r < a.rows(); ++r) {
    for (Index c = 0; c < a.cols(); ++c) {
      if (a(r, c) != b(r, c)
          && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c)))
          && !test_isApprox(a(r, c), b(r, c))) {
        return false;
      }
    }
  }
  return true;
}
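
// Run a kernel on both CPU and GPU and compare the results, allowing
// infinities and NaNs in the outputs.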
template<typename Kernel, typename Input, typename Output>
void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
{
  Output out_ref, out_gpu;
#if !defined(EIGEN_GPU_COMPILE_PHASE)
  out_ref = out_gpu = out;
#else
  EIGEN_UNUSED_VARIABLE(in);
  EIGEN_UNUSED_VARIABLE(out);
#endif
  run_on_cpu(ker, n, in, out_ref);
  run_on_gpu(ker, n, in, out_gpu);
#if !defined(EIGEN_GPU_COMPILE_PHASE)
  // Fail the test if the CPU and GPU results do not match.
  VERIFY(verifyIsApproxWithInfsNans(out_ref, out_gpu));
#endif
}
EIGEN_DECLARE_TEST(gpu_basic)
{
  ei_test_init_gpu();

  int nthreads = 100;
  Eigen::VectorXf in, out;
  Eigen::VectorXcf cfin, cfout;

#if !defined(EIGEN_GPU_COMPILE_PHASE)
  int data_size = nthreads * 512;
  in.setRandom(data_size);
  out.setConstant(data_size, -1);
  cfin.setRandom(data_size);
  cfout.setConstant(data_size, -1);
#endif

  CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Array44f>(), nthreads, in, out) );

#if !defined(EIGEN_USE_HIP)
  // FIXME
  // These subtests result in a compile failure on the HIP platform:
  //
  //   eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error:
  //     base class 'internal::dense_xpr_base<Replicate<Array<float, 4, 1, 0, 4, 1>, -1, -1> >::type'
  //     (aka 'ArrayBase<Eigen::Replicate<Eigen::Array<float, 4, 1, 0, 4, 1>, -1, -1> >') has protected default constructor
  CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array4f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array33f>(), nthreads, in, out) );

  // HIP does not support new/delete on device.
  CALL_SUBTEST( run_and_compare_to_gpu(alloc_new_delete<Vector3f>(), nthreads, in, out) );
#endif

  CALL_SUBTEST( run_and_compare_to_gpu(redux<Array4f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(redux<Matrix3f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix2f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix4f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );

  // Test std::complex.
  CALL_SUBTEST( run_and_compare_to_gpu(complex_operators<Vector3cf>(), nthreads, cfin, cfout) );
  CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );

  // numeric_limits
  CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );

#if defined(__NVCC__)
  // FIXME
  // These subtests compile only with nvcc; they fail to compile with HIPCC
  // and clang-cuda.
  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
  typedef Matrix<float,6,6> Matrix6f;
  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
#endif
}