condition_test.cu

/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
// condition_test.cu
//

#define NVBIO_CUDA_DEBUG

#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <algorithm>

#include <nvbio/basic/timer.h>
#include <nvbio/basic/console.h>
#include <nvbio/basic/vector.h>
#include <nvbio/basic/cuda/arch.h>
#include <nvbio/basic/cuda/condition.h>

#include <cub/cub.cuh>

namespace nvbio {
namespace condition {

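// A sequentially-chained prefix sum used as a correctness test: each CTA waits on the
// condition variable of its predecessor, writes its tile's value as the previous tile's
// value plus one, and then signals its own condition so that the next CTA can proceed.
// Block 0 waits on the last block's signal from the previous tile pass, while every other
// block waits on its left neighbour within the same pass. 'initial_state' is the value the
// condition variables hold on entry, so the kernel can be launched repeatedly on the same
// condition set without resetting it.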
__global__
void scan_kernel(const uint32 n_tile_grids, cuda::condition_set_view conditions, uint32* block_values, const uint32 initial_state)
{
    const uint32 prev_block = blockIdx.x ? blockIdx.x-1 : gridDim.x-1;

    uint32 prev_state = blockIdx.x ? initial_state+1 : initial_state;

    for (uint32 i = 0; i < n_tile_grids; ++i, ++prev_state)
    {
        // compute the tile number
        const uint32 tile_idx = blockIdx.x + i * gridDim.x;

        if (threadIdx.x == 0)
        {
            if (tile_idx)
            {
                // wait on the previous block to post its value
                conditions[ prev_block ].wait( prev_state );

                block_values[tile_idx] = block_values[tile_idx-1] + 1;
            }
            else
            {
                // set the value for the first tile
                block_values[0] = 0;
            }

            // release the condition for this block
            conditions[ blockIdx.x ].signal(); // equivalent to set( initial_state+i+1 )
        }
    }
}
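
// Same chaining pattern as scan_kernel, but without touching any per-tile values:
// each CTA only waits on its predecessor's condition and then signals its own.
// This is used by the speed test below to measure the raw cost of CTA-to-CTA
// synchronization.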
__global__
void chained_kernel(const uint32 n_tile_grids, cuda::condition_set_view conditions, const uint32 initial_state)
{
    const uint32 prev_block = blockIdx.x ? blockIdx.x-1 : gridDim.x-1;

    uint32 prev_state = blockIdx.x ? initial_state+1 : initial_state;

    for (uint32 i = 0; i < n_tile_grids; ++i, ++prev_state)
    {
        // compute the tile number
        const uint32 tile_idx = blockIdx.x + i * gridDim.x;

        if (threadIdx.x == 0)
        {
            if (tile_idx)
            {
                // wait on the previous block to be ready
                conditions[ prev_block ].wait( prev_state );
            }

            // release the condition for this block
            conditions[ blockIdx.x ].signal(); // equivalent to set( initial_state+i+1 )
        }
    }
}
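
// Template parameters: BLOCKDIM is the CTA size used for the in-block reduction,
// and DO_WORK selects whether the per-tile partials and prefixes are actually read
// and written. With DO_WORK = false only the condition-variable traffic remains,
// which is what the "fast chaining" timing in condition_test() measures.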
template <uint32 BLOCKDIM, bool DO_WORK>
__global__
void fast_scan_kernel(const uint32 n_tile_grids, cuda::condition_set_view conditions, volatile uint32* partials, volatile uint32* prefixes, const uint32 initial_state)
{
    //
    // This scan skeleton performs longer-range chaining rather than chaining each CTA
    // to its direct predecessor: the idea is that each CTA writes out its partial and
    // then checks (without blocking) whether the prefix of its predecessor is available.
    // If it is not, the CTA waits on the previous BLOCKDIM-1 partials and on the prefix
    // of the CTA BLOCKDIM slots away, and uses all of its threads to reduce them.
    // Hence, CTA[i] depends on the prefix of CTA[i-BLOCKDIM].
    //
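
    // Lookback example, assuming BLOCKDIM = 128: suppose CTA 200 finds that the prefix
    // of CTA 199 is not ready yet. It then inspects tiles 72..199 in parallel, one per
    // thread; if, say, tile 150 already has its prefix ready, thread 0 reads prefixes[150]
    // while threads 1..49 wait for the partials of tiles 151..199, and a block-wide
    // reduction combines these contributions into the prefix for tile 200. If no prefix
    // in the window is ready, the whole window of partials is reduced and the lookback
    // moves another BLOCKDIM tiles back.
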
    typedef cub::BlockReduce<uint32,BLOCKDIM> BlockReduce;

    const uint32 PARTIAL_READY = initial_state + 1;
    const uint32 PREFIX_READY  = initial_state + 2;

    __shared__ typename BlockReduce::TempStorage smem_storage;
    __shared__ uint32                            previous_done;

    const bool is_thread0 = (threadIdx.x == 0);

    for (uint32 i = 0; i < n_tile_grids; ++i)
    {
        //__syncthreads();

        // compute the tile number
        const uint32 tile_idx = blockIdx.x + i * gridDim.x;

        // write out the partial for this tile
        if (DO_WORK)
        {
            if (is_thread0)
                partials[tile_idx] = 1;
        }

        if (is_thread0 && tile_idx)
            previous_done = conditions[tile_idx-1].test( PREFIX_READY );

        __syncthreads();

        if (tile_idx == 0)
        {
            // set the value for the first tile
            if (DO_WORK)
            {
                if (is_thread0)
                    prefixes[0] = 1;
            }
        }
        else if (previous_done)
        {
            // sum to previous prefix
            if (DO_WORK)
            {
                if (is_thread0)
                    prefixes[tile_idx] = prefixes[tile_idx-1] + 1;
            }
        }
        else
        {
            // release the condition variable for the partial
            if (is_thread0)
                conditions[tile_idx].set( PARTIAL_READY );

            int32 prefix      = 0;
            int32 last_tile   = tile_idx;
            int32 prefix_tile = tile_idx;

            // keep looking back until we find a 'ready' prefix
            do
            {
                //
                // look back at up to BLOCKDIM predecessors in parallel, check if any
                // of them is done (i.e. their prefix is ready), and otherwise
                // wait on their partial to arrive.
                //

                previous_done = 0;
                __syncthreads();

                // compute the first tile in this batch
                prefix_tile = nvbio::max( int32( prefix_tile - blockDim.x ), 0 );

                // check whether any of the predecessors in this batch is done
                if (prefix_tile + threadIdx.x < last_tile)
                {
                    if (conditions[ prefix_tile + threadIdx.x ].test( PREFIX_READY ))
                        previous_done = prefix_tile + threadIdx.x;
                }

                __syncthreads();

                // let all threads update the prefix tile
                if (previous_done)
                    prefix_tile = previous_done;

                int32 partial = 0;

                // look back at the predecessors in parallel
                if (prefix_tile + threadIdx.x < last_tile)
                {
                    if (previous_done && threadIdx.x == 0)
                    {
                        // let thread0 read the ready prefix
                        if (DO_WORK)
                            partial = prefixes[ prefix_tile ];
                    }
                    else
                    {
                        // wait on the partials
                        conditions[ prefix_tile + threadIdx.x ].wait( PARTIAL_READY );

                        if (DO_WORK)
                            partial = partials[ prefix_tile + threadIdx.x ];
                    }
                }

                if (DO_WORK)
                {
                    // reduce the prefixes
                    prefix += BlockReduce( smem_storage ).Sum( partial );
                }

                last_tile = prefix_tile;
            }
            while (prefix_tile && !previous_done);

            if (DO_WORK)
            {
                if (is_thread0)
                {
                    // write out the final values
                    prefixes[tile_idx] = prefix + 1;
                }
            }
        }

        // release the condition for the scanned value for this tile
        if (is_thread0)
            conditions[tile_idx].set( PREFIX_READY );
    }
}

} // namespace condition

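// Test driver: first checks scan_kernel for correctness, then times chained_kernel
// to measure the raw CTA-to-CTA signaling cost, and finally checks and times
// fast_scan_kernel, both with DO_WORK enabled and with DO_WORK disabled
// (chaining only).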
int condition_test()
{
    const uint32 n_tile_grids = 100;

    log_info( stderr, "condition test... started\n" );

    const uint32 blockdim = 128;
    const uint32 n_blocks = (uint32)cuda::max_active_blocks( condition::scan_kernel, blockdim, 0u );

    cuda::condition_set_storage condition_st( n_blocks );
    cuda::condition_set_view    condition_set = condition_st.get();

    log_info( stderr, " %u blocks\n", n_blocks );

    thrust::device_vector<uint32> dvalues( n_tile_grids*n_blocks );
    uint32* dvalues_ptr = thrust::raw_pointer_cast( &dvalues.front() );

    thrust::host_vector<uint32> hvalues;

    log_info( stderr, " correctness test... started\n" );
    for (uint32 i = 0; i < 20; ++i)
    {
        // call the testing kernel
        condition::scan_kernel<<<n_blocks,blockdim>>>( n_tile_grids, condition_set, dvalues_ptr, i*n_tile_grids );
        cudaDeviceSynchronize();

        nvbio::cuda::thrust_copy_vector(hvalues, dvalues);

        for (uint32 n = 0; n < n_tile_grids*n_blocks; ++n)
        {
            const uint32 val = hvalues[n];
            if (val != n)
            {
                log_error( stderr, " found %u at position %u, launch %u\n", val, n, i );
                return 1;
            }
        }
    }
    log_info( stderr, " correctness test... done\n" );

    const uint32 n_tests = 20;

    log_info( stderr, " speed test... started\n" );

    condition_st.set(0);
    cudaDeviceSynchronize();

    Timer timer;
    timer.start();

    for (uint32 i = 0; i < n_tests; ++i)
        condition::chained_kernel<<<n_blocks,blockdim>>>( n_tile_grids, condition_set, i*n_tile_grids );

    cudaDeviceSynchronize();
    timer.stop();

    const float time = timer.seconds() / float(n_tests*n_tile_grids);

    log_info( stderr, " speed test... done:\n %.3f ns\n %.3f ns/CTA\n %.1fM CTAs retired/s\n",
        time * 1.0e6f,
        1.0e6f * (time/float(n_blocks)),
        1.0e-6f * (float(n_blocks)/time) );

    {
        const uint32 blockdim = 128;
        const uint32 n_blocks = (uint32)cuda::max_active_blocks( condition::fast_scan_kernel<blockdim,true>, blockdim, 0u );

        cuda::condition_set_storage condition_st( n_blocks*n_tile_grids );
        cuda::condition_set_view    condition_set = condition_st.get();

        cudaDeviceSynchronize();

        log_info( stderr, " fast scan... started (%u CTAs)\n", n_blocks );

        thrust::device_vector<uint32> dpartials( n_tile_grids*n_blocks );
        uint32* dpartials_ptr = thrust::raw_pointer_cast( &dpartials.front() );

        for (uint32 i = 0; i < 20; ++i)
        {
            thrust::fill( dpartials.begin(), dpartials.end(), 0 );
            thrust::fill( dvalues.begin(),   dvalues.end(),   0 );

            condition::fast_scan_kernel<blockdim,true><<<n_blocks,blockdim>>>( n_tile_grids, condition_set, dpartials_ptr, dvalues_ptr, i*2 );
            cudaDeviceSynchronize();

            nvbio::cuda::thrust_copy_vector(hvalues, dvalues);

            for (uint32 n = 0; n < n_tile_grids*n_blocks; ++n)
            {
                const uint32 val = hvalues[n];
                if (val != n+1)
                {
                    log_error( stderr, " found %u at position %u, launch %u\n", val, n, i );
                    return 1;
                }
            }
        }

        condition_st.set(0);
        cudaDeviceSynchronize();

        Timer timer;
        timer.start();

        for (uint32 i = 0; i < n_tests; ++i)
            condition::fast_scan_kernel<blockdim,true><<<n_blocks,blockdim>>>( n_tile_grids, condition_set, dpartials_ptr, dvalues_ptr, i*2 );

        cudaDeviceSynchronize();
        timer.stop();

        {
            const float time = timer.seconds() / float(n_tests*n_tile_grids);

            log_info( stderr, " fast scan test... done:\n %.3f ns\n %.3f ns/CTA\n %.1fM CTAs retired/s\n",
                time * 1.0e6f,
                1.0e6f * (time/float(n_blocks)),
                1.0e-6f * (float(n_blocks)/time) );
        }

        log_info( stderr, " fast chaining... started\n" );

        condition_st.set(0);
        cudaDeviceSynchronize();

        timer.start();

        for (uint32 i = 0; i < n_tests; ++i)
            condition::fast_scan_kernel<blockdim,false><<<n_blocks,blockdim>>>( n_tile_grids, condition_set, NULL, NULL, i*2 );

        cudaDeviceSynchronize();
        timer.stop();

        {
            const float time = timer.seconds() / float(n_tests*n_tile_grids);

            log_info( stderr, " fast chaining test... done:\n %.3f ns\n %.3f ns/CTA\n %.1fM CTAs retired/s\n",
                time * 1.0e6f,
                1.0e6f * (time/float(n_blocks)),
                1.0e-6f * (float(n_blocks)/time) );
        }
    }

    log_info( stderr, "condition test... done\n" );
    return 0;
}

} // namespace nvbio