kmers.h 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. /*
  2. * nvbio
  3. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the NVIDIA CORPORATION nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. #pragma once
  28. #include "assembly_types.h"
  29. using namespace nvbio;
  30. #define MAX_TEMP_SPACE_BATCHES 5
  31. // set of kmers from a set of sequences
  32. template <typename string_set_type>
  33. struct D_KmerSet
  34. {
  35. const string_set_type string_set; // string set from which the kmers are extracted
  36. D_VectorU32 active_region_ids; // active region id of each sequence in the set
  37. uint32 kmer_size; // kmer size used to extract the kmers
  38. D_VectorSetKmerCoord coords; // coordinates in the sequence set
  39. D_VectorU64 kmers_64b; // kmer keys: 64-bit compacted kmer sequence
  40. D_VectorU64 kmers_64b_distinct; // distinct kmer keys
  41. D_VectorU32 kmers_64b_unique_idxs; // indices to unique kmers in the sorted coordinate vector
  42. D_VectorU32 kmers_64b_repeat_idxs; // indices to unique kmers in the sorted coordinate vector
  43. D_VectorU32 global_to_sorted_id_map;// map[i] = sorted array index of coord with global id i
  44. D_VectorU32 global_to_UID_map; // map from global id to the kmer UID
  45. D_VectorU8 global_unique_flag_map; // map[i] = 1 if kmer with global id i is unique; 0, otherwise
  46. D_VectorU32 kmers_counts; // number of occurrences of each kmer in the set
  47. uint32 n_kmers; // total number of kmers in the set
  48. uint32 n_distinct; // number of distinct kmer keys in the set
  49. uint32 n_unique; // number of distinct unique kmers (unique := kmer does not occur more than once in a sequence)
  50. uint32 n_repeat; // number of non-unique kmers that are extracted as graph nodes
  51. // super kmers (to handle non-unique kmers)
  52. D_VectorSetKmerCoord super_coords;
  53. D_VectorU32 super_prefix_uids;
  54. D_VectorU32 super_suffix_uids;
  55. uint32 n_super_coords;
  56. // scratch space
  57. D_VectorU32 scratch_u32;
  58. uint32 n_alloc;
  59. uint32 selector;
  60. D_KmerSet(): kmer_size(0), n_kmers(0), n_distinct(0), n_unique(0), n_repeat(0), n_super_coords(0),
  61. n_alloc(0), selector(0) {}
  62. D_KmerSet(const string_set_type _string_set, const uint32 _kmer_size):
  63. string_set(_string_set), kmer_size(_kmer_size),
  64. n_kmers(0), n_distinct(0), n_unique(0), n_repeat(0), n_super_coords(0),
  65. n_alloc(0), selector(0) { }
  66. D_KmerSet(const string_set_type _string_set, const D_VectorU32& _active_region_ids):
  67. string_set(_string_set), active_region_ids(_active_region_ids),
  68. kmer_size(0), n_kmers(0), n_distinct(0), n_unique(0), n_repeat(0), n_super_coords(0),
  69. n_alloc(0), selector(0) { }
  70. void gen_kmer_coords();
  71. void gen_kmer_64b_keys();
  72. void sort_kmers_by_64b_keys();
  73. void segmented_sort_kmers_by_64b_keys();
  74. template <typename meta_iterator_type>
  75. void sort_kmers_by_64b_keys_meta(const meta_iterator_type meta_data);
  76. void sort_kmers_by_64b_keys_seqid();
  77. template <typename meta_iterator_type>
  78. void sort_kmers_by_64b_keys_seqid_meta(const meta_iterator_type meta_data);
  79. void count_kmers_rle();
  80. void count_kmers();
  81. void count_distinct_by_prefix(D_VectorU32& prefix_unique_id_map);
  82. void partition_kmers_by_uniqueness();
  83. void gen_prefix_map();
  84. void gen_global_unique_map();
  85. void gen_global_UID_map();
  86. void gen_global_to_sorted_id_map();
  87. void mark_unique_kmers();
  88. void filter_coords_by_prefix_uniqueness(const D_VectorU8& unique_map);
  89. void extract_super_kmers();
  90. void collapse_and_extract_non_overlaps(D_VectorSetKmerCoord& kmers_out, D_VectorU32& prefix_ids_out,
  91. D_VectorU32& suffix_ids_out, D_VectorU32& counts_out);
  92. void count_distinct_by_prefix();
  93. void init_alloc_temp_space()
  94. {
  95. n_alloc = MAX_TEMP_SPACE_BATCHES;
  96. scratch_u32.resize(n_alloc*n_kmers);
  97. }
  98. D_VectorU32::iterator get_scratch_space(uint32 size)
  99. {
  100. if((size > n_kmers) || ((selector + 1) > MAX_TEMP_SPACE_BATCHES)) {
  101. printf("Requested more memory than batch size \n");
  102. exit(-1);
  103. }
  104. selector++;
  105. return scratch_u32.begin() + (selector-1)*n_kmers;
  106. }
  107. void reset_scratch_space()
  108. {
  109. selector = 0;
  110. }
  111. };
  112. #include "kmers_inl.h"