qmap.cu 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. /*
  2. * nvbio
  3. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the NVIDIA CORPORATION nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. // qmap.cu
  28. //
  29. #include <stdio.h>
  30. #include <stdlib.h>
  31. #include <nvbio/basic/timer.h>
  32. #include <nvbio/basic/console.h>
  33. #include <nvbio/basic/vector.h>
  34. #include <nvbio/basic/shared_pointer.h>
  35. #include <nvbio/basic/dna.h>
  36. #include <nvbio/strings/string_set.h>
  37. #include <nvbio/strings/infix.h>
  38. #include <nvbio/strings/seeds.h>
  39. #include <nvbio/qgram/qgram.h>
  40. #include <nvbio/qgram/filter.h>
  41. #include <nvbio/io/sequence/sequence.h>
  42. #include "alignment.h"
  43. #include "util.h"
  44. using namespace nvbio;
  45. // query stats
  46. //
  47. struct Stats
  48. {
  49. Stats() :
  50. time(0),
  51. build_time(0),
  52. extract_time(0),
  53. rank_time(0),
  54. locate_time(0),
  55. align_time(0),
  56. reads(0),
  57. aligned(0),
  58. queries(0),
  59. matches(0),
  60. occurrences(0),
  61. merged(0) {}
  62. float time;
  63. float build_time;
  64. float extract_time;
  65. float rank_time;
  66. float locate_time;
  67. float align_time;
  68. uint64 reads;
  69. uint64 aligned;
  70. uint64 queries;
  71. uint64 matches;
  72. uint64 occurrences;
  73. uint64 merged;
  74. };
  75. // build a set of q-grams from a given string, together with their sorted counterpart
  76. //
  77. template <typename genome_string, typename qgram_vector_type, typename index_vector_type>
  78. void build_qgrams(
  79. const uint32 Q,
  80. const uint32 genome_len,
  81. const uint32 genome_offset,
  82. const genome_string genome,
  83. const uint32 n_queries,
  84. qgram_vector_type& qgrams,
  85. qgram_vector_type& sorted_qgrams,
  86. index_vector_type& sorted_indices)
  87. {
  88. // build the q-grams
  89. qgrams.resize( n_queries );
  90. generate_qgrams( Q, 2u, genome_len, genome, n_queries, thrust::make_counting_iterator<uint32>(genome_offset), qgrams.begin() );
  91. // sort the q-grams
  92. sorted_qgrams = qgrams;
  93. sorted_indices.resize( n_queries );
  94. thrust::copy(
  95. thrust::make_counting_iterator<uint32>(genome_offset),
  96. thrust::make_counting_iterator<uint32>(genome_offset) + n_queries,
  97. sorted_indices.begin() );
  98. thrust::sort_by_key( sorted_qgrams.begin(), sorted_qgrams.end(), sorted_indices.begin() );
  99. }
  100. // build a q-gram set-index from a string-set
  101. //
  102. template <typename string_set_type>
  103. void qgram_set_index_build(
  104. const uint32 Q,
  105. const uint32 seed_interval,
  106. const string_set_type string_set,
  107. QGramSetIndexDevice& qgram_index)
  108. {
  109. log_verbose(stderr, " building q-gram set-index... started\n");
  110. Timer timer;
  111. timer.start();
  112. // build the q-gram set index
  113. qgram_index.build(
  114. Q, // q-gram size
  115. 2u, // implicitly convert N to A
  116. string_set,
  117. uniform_seeds_functor<>( Q, seed_interval ),
  118. 12u );
  119. cudaDeviceSynchronize();
  120. timer.stop();
  121. const float time = timer.seconds();
  122. log_verbose(stderr, " building q-gram set-index... done\n");
  123. log_verbose(stderr, " indexed q-grams : %6.2f M q-grams\n", 1.0e-6f * float( qgram_index.n_qgrams ));
  124. log_verbose(stderr, " unique q-grams : %6.2f M q-grams\n", 1.0e-6f * float( qgram_index.n_unique_qgrams ));
  125. log_verbose(stderr, " throughput : %5.1f M q-grams/s\n", 1.0e-6f * float( qgram_index.n_qgrams ) / time);
  126. log_verbose(stderr, " memory usage : %5.1f MB\n", float( qgram_index.used_device_memory() ) / float(1024*1024) );
  127. }
  128. // perform q-gram index mapping
  129. //
  130. template <typename qgram_index_type, typename qgram_filter_type, typename genome_string>
  131. void map(
  132. qgram_index_type& qgram_index,
  133. qgram_filter_type& qgram_filter,
  134. const uint32 merge_intv,
  135. const io::SequenceDataDevice& reads,
  136. const uint32 n_queries,
  137. const uint32 genome_len,
  138. const uint32 genome_offset,
  139. const genome_string genome,
  140. nvbio::vector<device_tag,int16>& best_scores,
  141. Stats& stats)
  142. {
  143. typedef typename qgram_index_type::system_tag system_tag;
  144. // prepare some vectors to store the query qgrams
  145. nvbio::vector<system_tag,uint64> qgrams( n_queries );
  146. nvbio::vector<system_tag,uint64> sorted_qgrams( n_queries );
  147. nvbio::vector<system_tag,uint32> sorted_indices( n_queries );
  148. const uint32 Q = qgram_index.Q;
  149. Timer timer;
  150. timer.start();
  151. build_qgrams(
  152. Q,
  153. genome_len,
  154. genome_offset,
  155. genome,
  156. n_queries,
  157. qgrams,
  158. sorted_qgrams,
  159. sorted_indices );
  160. cudaDeviceSynchronize();
  161. timer.stop();
  162. const float extract_time = timer.seconds();
  163. stats.queries += n_queries;
  164. stats.extract_time += extract_time;
  165. //
  166. // search the sorted query q-grams with a q-gram filter
  167. //
  168. const uint32 batch_size = 32*1024*1024;
  169. typedef typename qgram_filter_type::hit_type hit_type;
  170. typedef typename qgram_filter_type::diagonal_type diagonal_type;
  171. // prepare storage for the output hits
  172. nvbio::vector<system_tag,hit_type> hits( batch_size );
  173. nvbio::vector<system_tag,diagonal_type> merged_hits( batch_size );
  174. nvbio::vector<system_tag,uint16> merged_counts( batch_size );
  175. nvbio::vector<system_tag,int16> scores( batch_size );
  176. nvbio::vector<system_tag,uint32> out_reads( batch_size );
  177. nvbio::vector<system_tag,int16> out_scores( batch_size );
  178. nvbio::vector<system_tag,uint8> temp_storage;
  179. timer.start();
  180. // first step: rank the query q-grams
  181. const uint64 n_hits = qgram_filter.rank(
  182. qgram_index,
  183. n_queries,
  184. nvbio::raw_pointer( sorted_qgrams ),
  185. nvbio::raw_pointer( sorted_indices ) );
  186. cudaDeviceSynchronize();
  187. timer.stop();
  188. stats.rank_time += timer.seconds();
  189. stats.occurrences += n_hits;
  190. nvbio::vector<device_tag, aln::BestSink<int16> > sinks( batch_size );
  191. nvbio::vector<device_tag,string_infix_coord_type> genome_infix_coords( batch_size );
  192. nvbio::vector<device_tag,string_infix_coord_type> read_infix_coords( batch_size );
  193. const static uint32 BAND_LEN = 31;
  194. // loop through large batches of hits and locate & merge them
  195. for (uint64 hits_begin = 0; hits_begin < n_hits; hits_begin += batch_size)
  196. {
  197. typedef io::SequenceDataAccess<DNA_N> read_access_type;
  198. typedef read_access_type::sequence_string_set_type read_string_set_type;
  199. typedef read_access_type::sequence_stream_type read_stream;
  200. // build an access pointer to the sequence data
  201. const read_access_type reads_access( reads );
  202. const uint64 hits_end = nvbio::min( hits_begin + batch_size, n_hits );
  203. timer.start();
  204. qgram_filter.locate(
  205. hits_begin,
  206. hits_end,
  207. hits.begin() );
  208. const uint32 n_merged = qgram_filter.merge(
  209. merge_intv,
  210. hits_end - hits_begin,
  211. hits.begin(),
  212. merged_hits.begin(),
  213. merged_counts.begin() );
  214. cudaDeviceSynchronize();
  215. timer.stop();
  216. stats.locate_time += timer.seconds();
  217. stats.merged += n_merged;
  218. timer.start();
  219. // build the set of read infixes
  220. thrust::transform(
  221. merged_hits.begin(),
  222. merged_hits.begin() + hits_end - hits_begin,
  223. read_infix_coords.begin(),
  224. read_infixes( nvbio::plain_view( reads ) ) );
  225. // build the set of genome infixes
  226. thrust::transform(
  227. merged_hits.begin(),
  228. merged_hits.begin() + hits_end - hits_begin,
  229. genome_infix_coords.begin(),
  230. genome_infixes<BAND_LEN>( genome_len, nvbio::plain_view( reads ) ) );
  231. typedef nvbio::vector<device_tag,string_infix_coord_type>::const_iterator infix_iterator;
  232. // build a view of the reads
  233. const SparseStringSet<read_stream,infix_iterator> read_infix_set(
  234. hits_end - hits_begin,
  235. reads_access.sequence_stream(),
  236. read_infix_coords.begin() );
  237. const SparseStringSet<genome_string,infix_iterator> genome_infix_set(
  238. hits_end - hits_begin,
  239. genome,
  240. genome_infix_coords.begin() );
  241. typedef aln::MyersTag<5u> myers_dna5_tag;
  242. aln::batch_banded_alignment_score<BAND_LEN>(
  243. aln::make_edit_distance_aligner<aln::SEMI_GLOBAL, myers_dna5_tag>(),
  244. read_infix_set,
  245. genome_infix_set,
  246. sinks.begin(),
  247. aln::DeviceThreadScheduler(),
  248. reads.max_sequence_len(),
  249. reads.max_sequence_len() + BAND_LEN );
  250. cudaDeviceSynchronize();
  251. timer.stop();
  252. stats.align_time += timer.seconds();
  253. // compute the best score for each read in this batch;
  254. // note that we divide the string-id by 2 to merge results coming from the forward
  255. // and reverse-complemented strands
  256. cuda::reduce_by_key(
  257. n_merged,
  258. thrust::make_transform_iterator(
  259. merged_hits.begin(),
  260. make_composition_functor( divide_by_two(), component_functor<diagonal_type>( 1u ) ) ), // take the second component divided by 2
  261. thrust::make_transform_iterator( sinks.begin(), sink_score() ),
  262. out_reads.begin(),
  263. out_scores.begin(),
  264. thrust::maximum<int16>(),
  265. temp_storage );
  266. // and keep track of the global best
  267. update_scores(
  268. n_merged,
  269. nvbio::plain_view( out_reads ),
  270. nvbio::plain_view( out_scores ),
  271. nvbio::plain_view( best_scores ) );
  272. }
  273. }
  274. // main test entry point
  275. //
  276. int main(int argc, char* argv[])
  277. {
  278. //
  279. // perform some basic option parsing
  280. //
  281. const uint32 batch_reads = 1*1024*1024;
  282. const uint32 batch_bps = 100*1024*1024;
  283. const uint32 queries_batch = 16*1024*1024;
  284. const char* reads = argv[argc-1];
  285. const char* index = argv[argc-2];
  286. uint32 Q = 20;
  287. uint32 Q_intv = 10;
  288. uint32 merge_intv = 16;
  289. uint32 max_reads = uint32(-1);
  290. int16 score_threshold = -20;
  291. for (int i = 0; i < argc; ++i)
  292. {
  293. if (strcmp( argv[i], "-q" ) == 0)
  294. {
  295. Q = uint32( atoi( argv[++i] ) );
  296. Q_intv = uint32( atoi( argv[++i] ) );
  297. }
  298. if (strcmp( argv[i], "-m" ) == 0)
  299. merge_intv = uint32( atoi( argv[++i] ) );
  300. else if (strcmp( argv[i], "-max-reads" ) == 0)
  301. max_reads = uint32( atoi( argv[++i] ) );
  302. else if (strcmp( argv[i], "-t" ) == 0)
  303. score_threshold = int16( atoi( argv[++i] ) );
  304. }
  305. log_info(stderr, "qmap... started\n");
  306. // load a genome archive...
  307. log_visible(stderr, " loading reference index ... started\n");
  308. log_info(stderr, " file: \"%s\"\n", index);
  309. io::SequenceDataHost h_genome_data;
  310. if (io::load_sequence_file( DNA, &h_genome_data, index ) == false)
  311. {
  312. log_error(stderr, " failed loading index \"%s\"\n", index);
  313. return 1u;
  314. }
  315. log_visible(stderr, " loading reference index ... done\n");
  316. log_verbose(stderr, " sequences : %u\n", h_genome_data.size() );
  317. log_verbose(stderr, " bps : %u\n", h_genome_data.bps() );
  318. log_verbose(stderr, " avg bps : %u (min: %u, max: %u)\n",
  319. h_genome_data.avg_sequence_len(),
  320. h_genome_data.min_sequence_len(),
  321. h_genome_data.max_sequence_len() );
  322. // build its device version
  323. const io::SequenceDataDevice d_genome_data( h_genome_data );
  324. const io::SequenceDataAccess<DNA> d_genome_access( d_genome_data );
  325. typedef io::SequenceDataAccess<DNA>::sequence_stream_type genome_type;
  326. // fetch the genome string
  327. const uint32 genome_len = d_genome_data.bps();
  328. const genome_type d_genome( d_genome_access.sequence_stream() );
  329. // open a read file
  330. log_info(stderr, " opening reads file... started\n");
  331. SharedPointer<io::SequenceDataStream> read_data_file(
  332. io::open_sequence_file(
  333. reads,
  334. io::Phred33,
  335. 2*max_reads,
  336. uint32(-1),
  337. io::SequenceEncoding( io::FORWARD | io::REVERSE_COMPLEMENT ) ) );
  338. // check whether the file opened correctly
  339. if (read_data_file == NULL || read_data_file->is_ok() == false)
  340. {
  341. log_error(stderr, " failed opening file \"%s\"\n", reads);
  342. return 1u;
  343. }
  344. log_info(stderr, " opening reads file... done\n");
  345. // keep stats
  346. Stats stats;
  347. io::SequenceDataHost h_read_data;
  348. while (1)
  349. {
  350. // load a batch of reads
  351. if (io::next( DNA_N, &h_read_data, read_data_file.get(), batch_reads, batch_bps ) == 0)
  352. break;
  353. log_info(stderr, " loading reads... started\n");
  354. // copy it to the device
  355. const io::SequenceDataDevice d_read_data( h_read_data );
  356. const io::SequenceDataAccess<DNA_N> d_read_access( d_read_data );
  357. const uint32 n_reads = d_read_data.size() / 2;
  358. log_info(stderr, " loading reads... done\n");
  359. log_info(stderr, " %u reads\n", n_reads);
  360. // prepare some typedefs for the involved string-sets and infixes
  361. typedef io::SequenceDataAccess<DNA_N> read_access_type; // the read view type
  362. typedef read_access_type::sequence_string_set_type string_set_type; // the read string-set
  363. typedef string_set_infix_coord_type infix_coord_type; // the infix coordinate type, for string-sets
  364. typedef nvbio::vector<device_tag,infix_coord_type> infix_vector_type; // the device vector type for infix coordinates
  365. typedef InfixSet<string_set_type, const string_set_infix_coord_type*> seed_set_type; // the infix-set type for representing seeds
  366. // fetch the actual read string-set
  367. const string_set_type d_read_string_set = d_read_access.sequence_string_set();
  368. // build the q-gram index
  369. QGramSetIndexDevice qgram_index;
  370. qgram_set_index_build(
  371. Q,
  372. Q_intv,
  373. d_read_string_set,
  374. qgram_index );
  375. typedef QGramFilterDevice<QGramSetIndexDevice,const uint64*,const uint32*> qgram_filter_type;
  376. qgram_filter_type qgram_filter;
  377. float time = 0.0f;
  378. const int16 worst_score = Field_traits<int16>::min();
  379. nvbio::vector<device_tag,int16> best_scores( n_reads, worst_score );
  380. nvbio::vector<device_tag,uint8> temp_storage;
  381. // stream through the genome
  382. for (uint32 genome_begin = 0; genome_begin < genome_len; genome_begin += queries_batch)
  383. {
  384. const uint32 genome_end = nvbio::min( genome_begin + queries_batch, genome_len );
  385. Timer timer;
  386. timer.start();
  387. map(
  388. qgram_index,
  389. qgram_filter,
  390. merge_intv,
  391. d_read_data,
  392. genome_end - genome_begin,
  393. genome_len,
  394. genome_begin,
  395. d_genome,
  396. best_scores,
  397. stats );
  398. cudaDeviceSynchronize();
  399. timer.stop();
  400. time += timer.seconds();
  401. const float genome_ratio = float( genome_end ) / float( genome_len );
  402. log_verbose(stderr, "\r aligned %5.2f%% of genome (%6.2f K reads/s)", 100.0f * genome_ratio, (1.0e-3f * n_reads) * genome_ratio / time );
  403. }
  404. log_verbose_cont(stderr, "\n");
  405. // accumulate the number of aligned reads
  406. stats.reads += n_reads;
  407. stats.time += time;
  408. // count how many reads have a score >= score_threshold
  409. const uint32 n_aligned = cuda::reduce(
  410. n_reads,
  411. thrust::make_transform_iterator( best_scores.begin(), above_threshold( score_threshold ) ),
  412. thrust::plus<uint32>(),
  413. temp_storage );
  414. stats.aligned += n_aligned;
  415. log_info(stderr, " aligned %6.2f %% reads (%6.2f K reads/s)\n", 100.0f * float( stats.aligned ) / float( stats.reads ), (1.0e-3f * float( stats.reads )) / stats.time);
  416. log_verbose(stderr, " breakdown:\n");
  417. log_verbose(stderr, " extract throughput : %.2f B q-grams/s\n", (1.0e-9f * float( stats.queries )) / stats.extract_time);
  418. log_verbose(stderr, " rank throughput : %6.2f K reads/s\n", (1.0e-3f * float( stats.reads )) / stats.rank_time);
  419. log_verbose(stderr, " : %6.2f B seeds/s\n", (1.0e-9f * float( stats.queries )) / stats.rank_time);
  420. log_verbose(stderr, " locate throughput : %6.2f K reads/s\n", (1.0e-3f * float( stats.reads )) / stats.locate_time);
  421. log_verbose(stderr, " align throughput : %6.2f K reads/s\n", (1.0e-3f * float( stats.reads )) / stats.align_time);
  422. log_verbose(stderr, " : %6.2f M hits/s\n", (1.0e-6f * float( stats.merged )) / stats.align_time);
  423. log_verbose(stderr, " occurrences : %.3f B\n", 1.0e-9f * float( stats.occurrences ) );
  424. log_verbose(stderr, " merged occurrences : %.3f B (%.1f %%)\n", 1.0e-9f * float( stats.merged ), 100.0f * float(stats.merged)/float(stats.occurrences));
  425. }
  426. log_info(stderr, "qmap... done\n");
  427. return 0;
  428. }