pipeline.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. /*
  2. * nvbio
  3. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the NVIDIA CORPORATION nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. #pragma once
  28. #include <nvbio/io/output/output_file.h>
  29. #include <nvbio/io/sequence/sequence.h>
  30. #include <nvbio/io/fmindex/fmindex.h>
  31. #include <nvbio/alignment/sink.h>
  32. #include <nvbio/fmindex/mem.h>
  33. #include "mem-search.h"
  34. using namespace nvbio;
  35. /// pipeline stats
  36. ///
  37. struct pipeline_stats
  38. {
  39. pipeline_stats() :
  40. time ( 0.0f ),
  41. io_time ( 0.0f ),
  42. search_time ( 0.0f ),
  43. locate_time ( 0.0f ),
  44. chain_time ( 0.0f ),
  45. n_reads ( 0 ),
  46. n_mems ( 0 ),
  47. n_chains ( 0 )
  48. {}
  49. float time;
  50. float io_time;
  51. float search_time;
  52. float locate_time;
  53. float chain_time;
  54. uint64 n_reads;
  55. uint64 n_mems;
  56. uint64 n_chains;
  57. };
  58. /// the MEM-searching pipeline state
  59. ///
  60. struct mem_state
  61. {
  62. typedef nvbio::io::SequenceDataAccess<DNA>::sequence_stream_type genome_type;
  63. typedef nvbio::io::FMIndexDataDevice::fm_index_type fm_index_type;
  64. typedef nvbio::MEMFilterDevice<fm_index_type> mem_filter_type;
  65. typedef mem_filter_type::mem_type mem_type;
  66. nvbio::io::SequenceData *reference_data_host;
  67. nvbio::io::SequenceDataDevice *reference_data_device;
  68. nvbio::io::FMIndexData *fmindex_data_host;
  69. nvbio::io::FMIndexDataDevice *fmindex_data_device;
  70. fm_index_type f_index; ///< the forward FM-index object
  71. fm_index_type r_index; ///< the reverse FM-index object
  72. mem_filter_type mem_filter; ///< our MEM filter object, used to rank and locate MEMs and keep track of statistics
  73. };
  74. /// the state of the MEM chains relative to a set of reads
  75. ///
  76. template <typename system_tag>
  77. struct chains_state
  78. {
  79. typedef mem_state::mem_type mem_type;
  80. typedef nvbio::vector<system_tag, mem_type> mem_vector_type;
  81. template <typename other_tag>
  82. chains_state& operator=(const chains_state<other_tag>& other);
  83. mem_vector_type mems; ///< the result vector for mem_search
  84. uint32 n_mems; ///< the number of mems
  85. nvbio::vector<system_tag,uint32> mems_index; ///< a sorting index into the mems (initially by reference location, then by chain id)
  86. nvbio::vector<system_tag,uint64> mems_chain; ///< the chain IDs of each mem
  87. // the list of chains
  88. nvbio::vector<system_tag,uint32> chain_offsets; ///< the first seed of each chain
  89. nvbio::vector<system_tag,uint32> chain_lengths; ///< the number of seeds in each chain
  90. nvbio::vector<system_tag,uint32> chain_reads; ///< the read (strand) id of each chain
  91. uint32 n_chains; ///< the number of chains
  92. };
  93. /// a small object acting as a "reference" for a chain
  94. ///
  95. struct chain_reference;
  96. /// a POD structure to view chains as if they were stored in AOS format;
  97. /// unlike chains_state, this class can be passed as a kernel parameter and its members
  98. /// can be accessed from the device.
  99. ///
  100. struct chains_view
  101. {
  102. typedef mem_state::mem_type mem_type;
  103. typedef const uint32* index_vector_type;
  104. typedef const mem_type* mem_vector_type;
  105. /// constructor
  106. ///
  107. template <typename system_tag>
  108. chains_view(const chains_state<system_tag>& state) :
  109. mems( plain_view( state.mems ) ),
  110. mems_index( plain_view( state.mems_index ) ),
  111. chain_offsets( plain_view( state.chain_offsets ) ),
  112. chain_lengths( plain_view( state.chain_lengths ) ),
  113. chain_reads( plain_view( state.chain_reads ) ),
  114. n_chains( state.n_chains )
  115. {}
  116. /// return a "reference" to the i-th chain
  117. ///
  118. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  119. chain_reference operator[] (const uint32 i) const;
  120. mem_vector_type mems; ///< the result vector for mem_search
  121. index_vector_type mems_index; ///< a sorting index into the mems (initially by reference location, then by chain id)
  122. index_vector_type chain_offsets; ///< the first seed of each chain
  123. index_vector_type chain_lengths; ///< the number of seeds in each chain
  124. index_vector_type chain_reads; ///< the read (strand) id of each chain
  125. uint32 n_chains; ///< the number of chains
  126. };
  127. /// a small object acting as a "reference" for a chain, allowing to view it as if was a single object
  128. ///
  129. struct chain_reference
  130. {
  131. typedef mem_state::mem_type mem_type;
  132. /// constructor
  133. ///
  134. NVBIO_HOST_DEVICE
  135. chain_reference(const chains_view& _chains, const uint32 _i) : chains(_chains), idx(_i) {}
  136. /// return the read this chain belongs to
  137. ///
  138. NVBIO_HOST_DEVICE
  139. uint32 read() const { return chains.chain_reads[ idx ]; }
  140. /// return the number of seeds in this chain
  141. ///
  142. NVBIO_HOST_DEVICE
  143. uint32 size() const { return chains.chain_lengths[ idx ]; }
  144. /// return the i-th seed in this chain
  145. ///
  146. NVBIO_HOST_DEVICE
  147. mem_type operator[] (const uint32 i) const
  148. {
  149. // grab the offset to the first seed of this chain
  150. const uint32 offset = chains.chain_offsets[ idx ];
  151. // return the requested seed, remembering they are sorted by chain through an index
  152. return chains.mems[ chains.mems_index[ offset + i ] ];
  153. }
  154. const chains_view& chains;
  155. const uint32 idx;
  156. };
  157. // return a "reference" to the i-th chain
  158. //
  159. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  160. chain_reference chains_view::operator[] (const uint32 i) const
  161. {
  162. return chain_reference( *this, i );
  163. }
  164. /// a contiguous subset of reads from a batch associated with all their MEMs
  165. ///
  166. struct read_chunk
  167. {
  168. read_chunk() :
  169. read_begin(0),
  170. read_end(0),
  171. mem_begin(0),
  172. mem_end(0) {}
  173. uint32 read_begin; ///< ID of the first read in this chunk
  174. uint32 read_end; ///< ID of the ending read in this chunk
  175. uint32 mem_begin; ///< index of the first hit for the first read
  176. uint32 mem_end; ///< index of the last hit for the last read
  177. };
  178. /// the alignment sub-pipeline state
  179. ///
  180. /// during alignment, we essentially keep a queue of "active" reads, corresponding
  181. /// to those reads for which there's more chains to process; at every step, we select
  182. /// one new chain from each read as an alignment candidate, removing it from the set.
  183. /// This is done keeping a set of (begin,end) pointers per read and advancing the
  184. /// begin field - when a range becomes empty, it's removed
  185. ///
  186. template <typename system_tag>
  187. struct alignment_state
  188. {
  189. typedef aln::Best2Sink<int16> sink_type;
  190. template <typename other_tag>
  191. alignment_state& operator=(const alignment_state<other_tag>& other);
  192. uint32 n_active; ///< the number of active reads in the alignment queue
  193. nvbio::vector<system_tag,uint32> begin_chains; ///< the first chain for each read in the processing queue
  194. nvbio::vector<system_tag,uint32> end_chains; ///< the ending chain for each read in the processing queue
  195. nvbio::vector<system_tag,uint2> query_spans; ///< the query chain spans
  196. nvbio::vector<system_tag,uint2> ref_spans; ///< the reference chain spans
  197. nvbio::vector<system_tag,uint32> temp_queue; ///< a temporary queue
  198. nvbio::vector<system_tag,uint8> stencil; ///< a temporary stencil vector
  199. nvbio::vector<system_tag,sink_type> sinks; ///< a temporary stencil vector
  200. };
  201. /// a flag to identify the system in use
  202. ///
  203. enum SystemFlag { DEVICE = 0, HOST = 1 };
  204. /// the state of the pipeline
  205. ///
  206. struct pipeline_state
  207. {
  208. SystemFlag system; ///< specify whether we are using the device or the host
  209. nvbio::io::OutputFile* output; ///< the alignment output
  210. mem_state mem; ///< the mem state
  211. chains_state<device_tag> chn; ///< the device chains state
  212. alignment_state<device_tag> aln; ///< the device alignment state
  213. chains_state<host_tag> h_chn; ///< the host chains state
  214. alignment_state<host_tag> h_aln; ///< the host alignment state
  215. read_chunk chunk; ///< the current read chunk
  216. pipeline_stats stats; ///< the pipeline stats
  217. };
  218. template <typename system_tag>
  219. template <typename other_tag>
  220. chains_state<system_tag>& chains_state<system_tag>::operator=(const chains_state<other_tag>& other)
  221. {
  222. n_mems = other.n_mems;
  223. n_chains = other.n_chains;
  224. mems.resize( n_mems );
  225. mems_index.resize( n_mems );
  226. chain_offsets.resize( n_chains );
  227. chain_lengths.resize( n_chains );
  228. chain_reads.resize( n_chains );
  229. thrust::copy( other.mems.begin(), other.mems.begin() + n_mems, mems.begin() );
  230. thrust::copy( other.mems_index.begin(), other.mems_index.begin() + n_mems, mems_index.begin() );
  231. thrust::copy( other.chain_offsets.begin(), other.chain_offsets.begin() + n_chains, chain_offsets.begin() );
  232. thrust::copy( other.chain_lengths.begin(), other.chain_lengths.begin() + n_chains, chain_lengths.begin() );
  233. thrust::copy( other.chain_reads.begin(), other.chain_reads.begin() + n_chains, chain_reads.begin() );
  234. return *this;
  235. }
  236. template <typename system_tag>
  237. template <typename other_tag>
  238. alignment_state<system_tag>& alignment_state<system_tag>::operator=(const alignment_state<other_tag>& other)
  239. {
  240. n_active = other.n_active;
  241. begin_chains.resize( n_active );
  242. end_chains.resize( n_active );
  243. query_spans.resize( n_active );
  244. ref_spans.resize( n_active );
  245. temp_queue.resize( n_active );
  246. stencil.resize( n_active );
  247. thrust::copy( other.begin_chains.begin(), other.begin_chains.begin() + n_active, begin_chains.begin() );
  248. thrust::copy( other.end_chains.begin(), other.end_chains.begin() + n_active, end_chains.begin() );
  249. thrust::copy( other.query_spans.begin(), other.query_spans.begin() + n_active, query_spans.begin() );
  250. thrust::copy( other.ref_spans.begin(), other.ref_spans.begin() + n_active, ref_spans.begin() );
  251. return *this;
  252. }