waveletfm.cu 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. /*
  2. * nvbio
  3. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the NVIDIA CORPORATION nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. // waveletfm.cu
  28. //
  29. // Protein Search using a Wavelet Tree and an FM-index
  30. //
  31. #include <stdio.h>
  32. #include <stdlib.h>
  33. #include <nvbio/basic/console.h>
  34. #include <nvbio/basic/vector.h>
  35. #include <nvbio/basic/packed_vector.h>
  36. #include <nvbio/strings/alphabet.h>
  37. #include <nvbio/strings/wavelet_tree.h>
  38. #include <nvbio/sufsort/sufsort.h>
  39. #include <nvbio/fmindex/fmindex.h>
  40. using namespace nvbio;
  41. // main test entry point
  42. //
  43. int main(int argc, char* argv[])
  44. {
  45. log_info(stderr, "waveletfm... started\n");
  46. const char* proteins = "ACDEFGHIKLMNOPQRSTVWYBZX";
  47. const uint32 text_len = 24;
  48. const uint32 alphabet_bits = AlphabetTraits<PROTEIN>::SYMBOL_SIZE;
  49. const uint32 alphabet_size = 1u << alphabet_bits;
  50. // print the text
  51. log_info(stderr, " text: %s\n", proteins);
  52. // allocate a host packed vector
  53. PackedVector<host_tag,alphabet_bits,true> h_text( text_len );
  54. // pack the string
  55. from_string<PROTEIN>( proteins, proteins + text_len, h_text.begin() );
  56. // copy it to the device
  57. PackedVector<device_tag,alphabet_bits,true> d_text( h_text );
  58. // allocate a vector for the BWT
  59. PackedVector<device_tag,alphabet_bits,true> d_bwt( text_len + 1 );
  60. BWTParams bwt_params;
  61. // build the BWT
  62. const uint32 primary = cuda::bwt( text_len, d_text.begin(), d_bwt.begin(), &bwt_params );
  63. // print the BWT
  64. {
  65. char proteins_bwt[ 25 ];
  66. to_string<PROTEIN>( d_bwt.begin(), d_bwt.begin() + text_len, proteins_bwt );
  67. log_info(stderr, " bwt: %s (primary %u)\n", proteins_bwt, primary);
  68. }
  69. // define our wavelet tree storage type and its plain view
  70. typedef WaveletTreeStorage<device_tag> wavelet_tree_type;
  71. typedef WaveletTreeStorage<device_tag>::const_plain_view_type wavelet_tree_view_type;
  72. // build a wavelet tree
  73. wavelet_tree_type wavelet_bwt;
  74. // setup the wavelet tree
  75. setup( text_len, d_bwt.begin(), wavelet_bwt );
  76. typedef nvbio::vector<device_tag,uint32>::const_iterator l2_iterator;
  77. typedef fm_index<wavelet_tree_view_type, null_type, l2_iterator> fm_index_type;
  78. // take the const view of the wavelet tree
  79. const wavelet_tree_view_type wavelet_bwt_view = plain_view( (const wavelet_tree_type&)wavelet_bwt );
  80. // build the L2 vector
  81. nvbio::vector<device_tag,uint32> L2(alphabet_size+1);
  82. // note we perform this on the host, even though the wavelet tree is on the device -
  83. // gonna be slow, but it's not much stuff, and this is just an example anyway...
  84. // and we just want to show that NVBIO is designed to make everything work!
  85. L2[0] = 0;
  86. for (uint32 i = 1; i <= alphabet_size; ++i)
  87. L2[i] = L2[i-1] + rank( wavelet_bwt_view, text_len, i-1u );
  88. // build the FM-index
  89. const fm_index_type fmi(
  90. text_len,
  91. primary,
  92. L2.begin(),
  93. wavelet_bwt_view,
  94. null_type() );
  95. // do some string matching using our newly built FM-index - once again
  96. // we are doing it on the host, though all data is on the device: the entire
  97. // loop would be better moved to the device in a real app.
  98. log_info(stderr, " string matching:\n");
  99. for (uint32 i = 0; i < text_len; ++i)
  100. {
  101. // match the i-th suffix of the text
  102. const uint32 pattern_len = text_len - i;
  103. // compute the SA range containing the occurrences of the pattern we are after
  104. const uint2 range = match( fmi, d_text.begin() + i, pattern_len );
  105. // print the number of occurrences of our pattern, equal to the SA range size
  106. log_info(stderr, " rank(%s): %u\n", proteins + i, 1u + range.y - range.x);
  107. }
  108. log_info(stderr, "waveletfm... done\n");
  109. return 0;
  110. }