seeding.cu 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. /*
  2. * nvbio
  3. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the NVIDIA CORPORATION nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. // seeding.cu
  28. //
  29. #include <stdio.h>
  30. #include <stdlib.h>
  31. #include <nvbio/basic/console.h>
  32. #include <nvbio/basic/vector.h>
  33. #include <nvbio/basic/shared_pointer.h>
  34. #include <nvbio/strings/string_set.h>
  35. #include <nvbio/strings/infix.h>
  36. #include <nvbio/strings/seeds.h>
  37. #include <nvbio/io/sequence/sequence.h>
  38. using namespace nvbio;
  39. // extract a set of uniformly spaced seeds from a string-set and return it as an InfixSet
  40. //
  41. template <typename system_tag, typename string_set_type>
  42. InfixSet<string_set_type, const string_set_infix_coord_type*>
  43. extract_seeds(
  44. const string_set_type string_set, // the input string-set
  45. const uint32 seed_len, // the seeds length
  46. const uint32 seed_interval, // the spacing between seeds
  47. nvbio::vector<system_tag,string_set_infix_coord_type>& seed_coords) // the output vector of seed coordinates
  48. {
  49. // enumerate all seeds
  50. const uint32 n_seeds = enumerate_string_set_seeds(
  51. string_set,
  52. uniform_seeds_functor<>( seed_len, seed_interval ),
  53. seed_coords );
  54. // and build the output infix-set
  55. return InfixSet<string_set_type, const string_set_infix_coord_type*>(
  56. n_seeds,
  57. string_set,
  58. nvbio::plain_view( seed_coords ) );
  59. }
  60. // main test entry point
  61. //
  62. int main(int argc, char* argv[])
  63. {
  64. //
  65. // perform some basic option parsing
  66. //
  67. uint32 n_bps = 10000000;
  68. const char* reads = "./data/SRR493095_1.fastq.gz";
  69. for (int i = 0; i < argc; ++i)
  70. {
  71. if (strcmp( argv[i], "-bps" ) == 0)
  72. n_bps = uint32( atoi( argv[++i] ) )*1000u;
  73. else if (strcmp( argv[i], "-reads" ) == 0)
  74. reads = argv[++i];
  75. }
  76. // start our program
  77. log_info(stderr, "seeding... started\n");
  78. // open a read file
  79. log_info(stderr, " loading reads... started\n");
  80. SharedPointer<io::SequenceDataStream> read_data_file(
  81. io::open_sequence_file(
  82. reads,
  83. io::Phred33,
  84. uint32(-1),
  85. uint32(-1) ) );
  86. // check whether the file opened correctly
  87. if (read_data_file == NULL || read_data_file->is_ok() == false)
  88. {
  89. log_error(stderr, " failed opening file \"%s\"\n", reads);
  90. return 1u;
  91. }
  92. const uint32 batch_size = uint32(-1);
  93. const uint32 batch_bps = n_bps;
  94. // load a batch of reads
  95. io::SequenceDataHost h_read_data;
  96. io::next( DNA_N, &h_read_data, read_data_file.get(), batch_size, batch_bps );
  97. // copy it to the device
  98. const io::SequenceDataDevice d_read_data( h_read_data );
  99. log_info(stderr, " loading reads... done\n");
  100. log_info(stderr, " %u reads\n", d_read_data.size());
  101. // prepare some typedefs for the involved string-sets and infixes
  102. typedef io::SequenceDataAccess<DNA_N> read_access_type;
  103. typedef read_access_type::sequence_string_set_type string_set_type; // the read string-set
  104. typedef string_set_infix_coord_type infix_coord_type; // the infix coordinate type, for string-sets
  105. typedef nvbio::vector<device_tag,infix_coord_type> infix_vector_type; // the device vector type for infix coordinates
  106. typedef InfixSet<string_set_type, const string_set_infix_coord_type*> seed_set_type; // the infix-set type for representing seeds
  107. // build a read accessor
  108. const read_access_type d_read_access( d_read_data );
  109. // fetch the actual read string-set
  110. const string_set_type d_read_string_set = d_read_access.sequence_string_set();
  111. // prepare enough storage for the seed coordinates
  112. infix_vector_type d_seed_coords;
  113. // extract the seeds and get the corresponding string-set representation
  114. const seed_set_type d_seed_set = extract_seeds(
  115. d_read_string_set,
  116. 20u,
  117. 10u,
  118. d_seed_coords );
  119. // output some stats
  120. log_info(stderr, "seeding... done\n");
  121. log_info(stderr, " %u seeds\n", d_seed_set.size());
  122. return 0;
  123. }