nvExtractReads.cu 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. /*
  2. * nvbio
  3. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the NVIDIA CORPORATION nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. // nvExtractReads.cu
  28. //
  29. #include <nvbio/basic/timer.h>
  30. #include <nvbio/basic/shared_pointer.h>
  31. #include <nvbio/io/sequence/sequence.h>
  32. #include <nvbio/basic/dna.h>
  33. #include <thrust/host_vector.h>
  34. #include <thrust/device_vector.h>
  35. #include <zlib/zlib.h>
  36. #include <stdio.h>
  37. #include <stdlib.h>
  38. #include <vector>
  39. #include <algorithm>
  40. using namespace nvbio;
  41. bool to_ascii(const char* reads_name, void* output_file, void* output_index, const io::QualityEncoding qencoding, const io::SequenceEncoding flags)
  42. {
  43. log_visible(stderr, "opening read file \"%s\"\n", reads_name);
  44. SharedPointer<nvbio::io::SequenceDataStream> read_data_file(
  45. nvbio::io::open_sequence_file(reads_name,
  46. qencoding,
  47. uint32(-1),
  48. uint32(-1),
  49. flags )
  50. );
  51. if (read_data_file == NULL || read_data_file->is_ok() == false)
  52. {
  53. log_error(stderr, " failed opening file \"%s\"\n", reads_name);
  54. return false;
  55. }
  56. const uint32 batch_size = 512*1024;
  57. std::vector<char> char_read( 1024*1024 );
  58. std::vector<uint64> index( 512*1024 + 1u );
  59. uint64 offset = 0u;
  60. uint32 n_reads = 0;
  61. io::SequenceDataHost h_read_data;
  62. // loop through all read batches
  63. while (1)
  64. {
  65. // load a new batch of reads
  66. if (io::next( DNA_N, &h_read_data, read_data_file.get(), batch_size ) == 0)
  67. break;
  68. const io::SequenceDataAccess<DNA_N> h_read_access( h_read_data );
  69. // loop through all reads
  70. for (uint32 i = 0; i < h_read_data.size(); ++i)
  71. {
  72. const io::SequenceDataAccess<DNA_N>::sequence_string read = h_read_access.get_read(i);
  73. dna_to_string( read, read.length(), &char_read[0] );
  74. char_read[ read.length() ] = '\n';
  75. const uint32 n_written = (uint32)gzwrite( output_file, &char_read[0], sizeof(char) * (read.length()+1) );
  76. if (n_written < read.length()+1)
  77. {
  78. log_error( stderr, "unable to write to output\n");
  79. return false;
  80. }
  81. }
  82. if (output_index)
  83. {
  84. // collect the sequence offsets
  85. for (uint32 i = 0; i < h_read_data.size(); ++i)
  86. index[i] = offset + h_read_data.sequence_index()[i+1];
  87. // write the sequence offsets
  88. gzwrite( output_file, &index[0], sizeof(uint64) * h_read_data.size() );
  89. }
  90. // update the global sequence offset
  91. offset += h_read_data.bps();
  92. // update the global number of output reads
  93. n_reads += h_read_data.size();
  94. const uint64 n_bytes = gzoffset( output_file );
  95. log_verbose(stderr,"\r %u reads (%.2fGB - %.2fB/read - %.2fB/bp) ", n_reads, float( n_bytes ) / float(1024*1024*1024), float(n_bytes)/float(n_reads), float(n_bytes)/float(offset));
  96. }
  97. log_verbose_cont(stderr,"\n");
  98. return true;
  99. }
  100. template <uint32 SYMBOL_SIZE>
  101. bool to_packed(const char* reads_name, void* output_file, void* output_index, const io::QualityEncoding qencoding, const io::SequenceEncoding flags)
  102. {
  103. log_visible(stderr, "opening read file \"%s\"\n", reads_name);
  104. SharedPointer<nvbio::io::SequenceDataStream> read_data_file(
  105. nvbio::io::open_sequence_file(reads_name,
  106. qencoding,
  107. uint32(-1),
  108. uint32(-1),
  109. flags )
  110. );
  111. if (read_data_file == NULL || read_data_file->is_ok() == false)
  112. {
  113. log_error(stderr, " failed opening file \"%s\"\n", reads_name);
  114. return false;
  115. }
  116. static const uint32 SYMBOLS_PER_WORD = 32u / SYMBOL_SIZE;
  117. const uint32 batch_size = 512*1024;
  118. uint32 n_reads = 0;
  119. io::SequenceDataHost h_read_data;
  120. typedef PackedStream<uint32*,uint8,SYMBOL_SIZE,true> packed_stream_type;
  121. std::vector<uint32> words( 1024*1024 );
  122. std::vector<uint64> index( 512*1024 + 1u );
  123. uint32 rem = 0u;
  124. uint64 offset = 0u;
  125. // loop through all read batches
  126. while (1)
  127. {
  128. // load a new batch of reads
  129. if (io::next( DNA_N, &h_read_data, read_data_file.get(), batch_size ) == 0)
  130. break;
  131. // reserve enough storage
  132. words.resize( h_read_data.words() + 1u );
  133. packed_stream_type packed_reads( &words[0] );
  134. const io::SequenceDataAccess<DNA_N> h_read_access( h_read_data );
  135. nvbio::assign( h_read_access.bps(), h_read_access.sequence_stream(), packed_reads + rem );
  136. // write all whole words
  137. const uint32 n_bps = h_read_access.bps() + rem;
  138. const uint32 whole_words = n_bps / SYMBOLS_PER_WORD;
  139. gzwrite( output_file, &words[0], sizeof(uint32) * whole_words );
  140. // save the last non-whole word
  141. words[0] = words[ whole_words ];
  142. // save the number of unwritten symbols left
  143. rem = n_bps & (SYMBOLS_PER_WORD-1);
  144. if (output_index)
  145. {
  146. // collect the sequence offsets
  147. for (uint32 i = 0; i < h_read_data.size(); ++i)
  148. index[i] = offset + h_read_data.sequence_index()[i+1];
  149. // write the sequence offsets
  150. gzwrite( output_file, &index[0], sizeof(uint64) * h_read_data.size() );
  151. }
  152. // update the global sequence offset
  153. offset += h_read_data.bps();
  154. // update the global number of output reads
  155. n_reads += h_read_data.size();
  156. const uint64 n_bytes = gzoffset( output_file );
  157. log_verbose(stderr,"\r %u reads (%.2fGB - %.2fB/read - %.2fB/bp) ", n_reads, float( n_bytes ) / float(1024*1024*1024), float(n_bytes)/float(n_reads), float(n_bytes)/float(offset));
  158. }
  159. log_verbose_cont(stderr,"\n");
  160. return true;
  161. }
  // Output formats selectable from the command line.
  enum Format
  {
      ASCII_FORMAT   = 0,   // text output, one read per line (-a | --ascii)
      PACKED2_FORMAT = 1,   // 2-bits packed output (-p2 | --packed-2)
      PACKED4_FORMAT = 2    // 4-bits packed output (-p4 | --packed-4)
  };
  168. int main(int argc, char* argv[])
  169. {
  170. if (argc < 2)
  171. {
  172. log_info(stderr, "nvExtractReads [options] input output\n");
  173. log_info(stderr, " extract a set of reads to a plain ASCII_FORMAT or packed file with one read per line (.txt)\n\n");
  174. log_info(stderr, "options:\n");
  175. log_info(stderr, " --verbosity\n");
  176. log_info(stderr, " -F | --skip-forward skip forward strand\n");
  177. log_info(stderr, " -R | --skip-reverse skip forward strand\n");
  178. log_info(stderr, " -a | --ascii ASCII_FORMAT output\n");
  179. log_info(stderr, " -p2 | --packed-2 2-bits packed output\n");
  180. log_info(stderr, " -p4 | --packed-4 4-bits packed output\n");
  181. log_info(stderr, " -i | --idx string save an index file\n");
  182. exit(0);
  183. }
  184. const char* reads_name = argv[argc-2];
  185. const char* out_name = argv[argc-1];
  186. const char* idx_name = NULL;
  187. bool forward = true;
  188. bool reverse = true;
  189. Format format = ASCII_FORMAT;
  190. io::QualityEncoding qencoding = io::Phred33;
  191. for (int i = 0; i < argc - 2; ++i)
  192. {
  193. if (strcmp( argv[i], "-verbosity" ) == 0 ||
  194. strcmp( argv[i], "--verbosity" ) == 0)
  195. {
  196. set_verbosity( Verbosity( atoi( argv[++i] ) ) );
  197. }
  198. else if (strcmp( argv[i], "-F" ) == 0 ||
  199. strcmp( argv[i], "--skip-forward" ) == 0) // skip forward strand
  200. {
  201. forward = false;
  202. }
  203. else if (strcmp( argv[i], "-R" ) == 0 ||
  204. strcmp( argv[i], "--skip-reverse" ) == 0) // skip reverse strand
  205. {
  206. reverse = false;
  207. }
  208. else if (strcmp( argv[i], "-a" ) == 0 ||
  209. strcmp( argv[i], "--ascii" ) == 0) // ascii format
  210. {
  211. format = ASCII_FORMAT;
  212. }
  213. else if (strcmp( argv[i], "-p2" ) == 0 ||
  214. strcmp( argv[i], "--packed-2" ) == 0) // 2-bits packed
  215. {
  216. format = PACKED2_FORMAT;
  217. }
  218. else if (strcmp( argv[i], "-p4" ) == 0 ||
  219. strcmp( argv[i], "--packed-4" ) == 0) // 4-bits packed
  220. {
  221. format = PACKED4_FORMAT;
  222. }
  223. else if (strcmp( argv[i], "-i" ) == 0 ||
  224. strcmp( argv[i], "--idx" ) == 0) // index file
  225. {
  226. idx_name = argv[++i];
  227. }
  228. }
  229. std::string out_string = out_name;
  230. // parse out file extension; look for .fastq.gz, .fastq suffixes
  231. uint32 len = uint32( strlen(out_name) );
  232. bool is_gzipped = false;
  233. // do we have a .gz suffix?
  234. if (len >= strlen(".gz"))
  235. {
  236. if (strcmp(&out_name[len - strlen(".gz")], ".gz") == 0)
  237. {
  238. is_gzipped = true;
  239. len = uint32(len - strlen(".gz"));
  240. }
  241. }
  242. void* output_file = NULL;
  243. void* output_index = NULL;
  244. if (format == ASCII_FORMAT)
  245. {
  246. // open a plain ASCII_FORMAT file
  247. output_file = gzopen( out_name, is_gzipped ? "w1R" : "w" );
  248. }
  249. else
  250. {
  251. // open a binary file
  252. output_file = gzopen( out_name, is_gzipped ? "wb1R" : "wbT" );
  253. }
  254. if (output_file == NULL)
  255. {
  256. log_error(stderr, " failed opening file \"%s\"\n", out_name);
  257. return 1;
  258. }
  259. if (idx_name)
  260. {
  261. output_index = fopen( idx_name, "wb" );
  262. if (output_index == NULL)
  263. {
  264. log_error(stderr, " failed opening file \"%s\"\n", idx_name);
  265. return 1;
  266. }
  267. }
  268. log_visible(stderr,"nvExtractReads... started\n");
  269. uint32 encoding_flags = 0u;
  270. if (forward) encoding_flags |= io::FORWARD;
  271. if (reverse) encoding_flags |= io::REVERSE_COMPLEMENT;
  272. bool success;
  273. switch (format)
  274. {
  275. case ASCII_FORMAT:
  276. success = to_ascii( reads_name, output_file, output_index, qencoding, io::SequenceEncoding(encoding_flags) );
  277. break;
  278. case PACKED2_FORMAT:
  279. success = to_packed<2u>( reads_name, output_file, output_index, qencoding, io::SequenceEncoding(encoding_flags) );
  280. break;
  281. case PACKED4_FORMAT:
  282. success = to_packed<4u>( reads_name, output_file, output_index, qencoding, io::SequenceEncoding(encoding_flags) );
  283. break;
  284. }
  285. if (output_file) gzclose( output_file );
  286. if (output_index) gzclose( output_index );
  287. log_visible(stderr,"nvExtractReads... done\n");
  288. return success ? 0u : 1u;
  289. }