kmers_inl.h 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630
  1. /*
  2. * nvbio
  3. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the NVIDIA CORPORATION nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. #include <cstdlib>
  28. #include <nvbio/basic/thrust_view.h>
  29. #include <nvbio/basic/primitives.h>
  30. #include <nvbio/basic/atomics.h>
  31. #include <nvbio/basic/cuda/sort.h>
  32. #include <nvbio/basic/cuda/primitives.h>
  33. #include <nvbio/strings/seeds.h>
  34. #include "assembly_types.h"
  35. /* K-mer Extraction/Counting/Uniqueness Functionality */
  36. // extract the string set sequence id from the kmer coordinates
  37. struct get_kmer_seq_id : public thrust::unary_function<SequenceSetKmerCoord,uint32>
  38. {
  39. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  40. uint32 operator() (const SequenceSetKmerCoord kmer_coord) const
  41. {
  42. return kmer_coord.x;
  43. }
  44. };
  45. struct get_kmer_reg_id
  46. {
  47. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  48. uint32 operator() (const SequenceSetKmerCoord kmer_coord) const
  49. {
  50. return kmer_coord.w;
  51. }
  52. };
  53. // extract the kmer size from the kmer coordinates
  54. struct get_kmer_size : public thrust::unary_function<SequenceSetKmerCoord,uint32>
  55. {
  56. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  57. uint32 operator() (const SequenceSetKmerCoord kmer_coord) const
  58. {
  59. return kmer_coord.z - kmer_coord.y;
  60. }
  61. };
  62. struct get_global_id : public thrust::unary_function<SequenceSetKmerCoord,uint32>
  63. {
  64. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  65. uint32 operator() (const SequenceSetKmerCoord coord) const
  66. {
  67. return coord.z;
  68. }
  69. };
  70. // store the global kmer id in its coordinate
  71. struct set_global_id
  72. {
  73. SequenceSetKmerCoord* coords;
  74. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  75. set_global_id(SequenceSetKmerCoord* coords) : coords(coords) { }
  76. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  77. void operator() (const uint32 idx) const
  78. {
  79. coords[idx].z = idx;
  80. }
  81. };
  82. // store the global kmer id and the active region id in its coordinate
  83. struct set_global_id_region_id
  84. {
  85. SequenceSetKmerCoord* coords;
  86. const uint32* active_region_ids;
  87. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  88. set_global_id_region_id(SequenceSetKmerCoord* coords, const uint32* _active_region_ids) :
  89. coords(coords), active_region_ids(_active_region_ids) { }
  90. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  91. void operator() (const uint32 idx) const
  92. {
  93. SequenceSetKmerCoord& coord = coords[idx];
  94. coord.z = idx; // since the kmer size is fixed, can reuse this field
  95. coord.w = active_region_ids[coord.x];
  96. }
  97. };
  98. // maps from the coordinate of a kmer
  99. // to the global id of its prefix
  100. struct get_prefix_global_id : public thrust::unary_function<uint32,uint32>
  101. {
  102. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  103. uint32 operator() (const SequenceSetKmerCoord coord) const
  104. {
  105. return coord.z + coord.x;
  106. }
  107. };
  108. // maps from the id of a kmer
  109. // to the global id of its prefix
  110. struct get_prefix_global_id_by_idx : public thrust::unary_function<uint32,uint32>
  111. {
  112. const SequenceSetKmerCoord* coords;
  113. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  114. get_prefix_global_id_by_idx(const SequenceSetKmerCoord* _coords):
  115. coords(_coords) { }
  116. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  117. uint32 operator() (const uint32 idx) const
  118. {
  119. const SequenceSetKmerCoord coord = coords[idx];
  120. return coord.z + coord.x;
  121. }
  122. };
  123. // maps from the id of a kmer
  124. // to the global id of its suffix
  125. struct get_suffix_global_id_by_idx : public thrust::unary_function<uint32,uint32>
  126. {
  127. const SequenceSetKmerCoord* coords;
  128. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  129. get_suffix_global_id_by_idx(const SequenceSetKmerCoord* _coords):
  130. coords(_coords) { }
  131. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  132. uint32 operator() (const uint32 idx) const
  133. {
  134. const SequenceSetKmerCoord coord = coords[idx];
  135. return coord.z + coord.x + 1;
  136. }
  137. };
  138. // creates a map from the id of a kmer
  139. // to the global id of its prefix
  140. struct compute_prefix_global_id
  141. {
  142. const SequenceSetKmerCoord* coords;
  143. uint32* prefix_ids;
  144. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  145. compute_prefix_global_id(const SequenceSetKmerCoord* _coords, uint32* _prefix_ids):
  146. coords(_coords), prefix_ids(_prefix_ids) { }
  147. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  148. void operator() (const uint32 id) const
  149. {
  150. const SequenceSetKmerCoord coord = coords[id];
  151. prefix_ids[id] = coord.z + coord.x;
  152. }
  153. };
  154. // creates a map from the id of a kmer
  155. // to the global id of its suffix
  156. struct compute_suffix_global_id
  157. {
  158. const SequenceSetKmerCoord* coords;
  159. uint32* suffix_ids;
  160. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  161. compute_suffix_global_id(const SequenceSetKmerCoord* _coords, uint32* _suffix_ids):
  162. coords(_coords), suffix_ids(_suffix_ids) { }
  163. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  164. void operator() (const uint32 id) const
  165. {
  166. const SequenceSetKmerCoord coord = coords[id];
  167. suffix_ids[id] = coord.z + coord.x + 1;
  168. }
  169. };
  170. // maps kmer global id to its UID
  171. struct global_to_uid : public thrust::unary_function<uint64,uint64>
  172. {
  173. const uint32* uid_map;
  174. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  175. global_to_uid(const uint32* _uid_map) : uid_map(_uid_map) { }
  176. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  177. uint32 operator() (const uint32 global_id) const
  178. {
  179. return uid_map[global_id];
  180. }
  181. };
  182. // creates a map from the global id to the id in the sorted array
  183. struct global_to_sorted_id
  184. {
  185. uint32* global_to_sorted_id_map;
  186. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  187. global_to_sorted_id(uint32* _map) : global_to_sorted_id_map(_map) { }
  188. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  189. void operator() (const thrust::tuple<uint32, SequenceSetKmerCoord> idx_coord) const
  190. {
  191. const uint32 idx = thrust::get<0>(idx_coord);
  192. const SequenceSetKmerCoord coord = thrust::get<1>(idx_coord);
  193. global_to_sorted_id_map[coord.z] = idx; // TODO: not coalesced
  194. }
  195. };
  196. // compute the kmer keys: 64-bit compacted kmer sequence
  197. template <typename string_set_type>
  198. struct kmers_to_64b_functor
  199. {
  200. typedef typename string_set_type::string_type sequence;
  201. const string_set_type string_set;
  202. const uint32 kmer_size;
  203. const uint32 dna_symbol_size;
  204. const uint32 symbol_mask;
  205. NVBIO_HOST_DEVICE
  206. kmers_to_64b_functor(const uint32 _kmer_size, const uint32 _dna_symbol_size, const string_set_type _string_set) :
  207. kmer_size(_kmer_size), dna_symbol_size(_dna_symbol_size), symbol_mask((1u << dna_symbol_size) - 1u),
  208. string_set(_string_set) {}
  209. NVBIO_HOST_DEVICE
  210. uint64 operator() (const SequenceSetKmerCoord kmer_coord) const
  211. {
  212. const sequence seq = string_set[kmer_coord.x];
  213. const uint32 seq_pos = kmer_coord.y;
  214. const uint32 seq_len = seq.length();
  215. uint64 kmer_key = 0u;
  216. for (uint32 i = 0; i < kmer_size; i++) {
  217. kmer_key |= uint64(seq_pos + i < seq_len ? (seq[seq_pos + i] & symbol_mask) : 0u) << ((kmer_size-1-i)*dna_symbol_size); //(i*dna_symbol_size);
  218. }
  219. return kmer_key;
  220. }
  221. };
  222. // equality based on kmer key and sequence id coordinate
  223. struct kmer_key_sid_eq
  224. {
  225. const uint64* keys;
  226. const SequenceSetKmerCoord* coords;
  227. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  228. kmer_key_sid_eq(const uint64* _keys, const SequenceSetKmerCoord* _coords):
  229. keys(_keys), coords(_coords) { }
  230. NVBIO_HOST_DEVICE bool operator()(const uint32 i, const uint32 j)
  231. {
  232. return keys[i] == keys[j] && coords[i].x == coords[j].x;
  233. }
  234. };
  235. // equality based on kmer key and region id coordinate
  236. struct kmer_key_rid_eq
  237. {
  238. const uint64* keys;
  239. const SequenceSetKmerCoord* coords;
  240. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  241. kmer_key_rid_eq(const uint64* _keys, const SequenceSetKmerCoord* _coords):
  242. keys(_keys), coords(_coords) { }
  243. NVBIO_HOST_DEVICE bool operator()(const uint32 i, const uint32 j)
  244. {
  245. return keys[i] == keys[j] && coords[i].w == coords[j].w;
  246. }
  247. };
  248. // equality based on sequence id and position
  249. struct kmer_pos_sid_eq
  250. {
  251. const SequenceSetKmerCoord* coords;
  252. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  253. kmer_pos_sid_eq(const SequenceSetKmerCoord* _coords): coords(_coords) { }
  254. NVBIO_HOST_DEVICE bool operator()(const uint32 i, const uint32 j)
  255. {
  256. const SequenceSetKmerCoord ci = coords[i];
  257. const SequenceSetKmerCoord cj = coords[j];
  258. return ci.y == cj.y && ci.x == cj.x;
  259. }
  260. };
  261. // equality based on kmer key only
  262. struct kmer_uid_eq
  263. {
  264. const uint32* keys;
  265. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  266. kmer_uid_eq(const uint32* _keys) : keys(_keys) { }
  267. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  268. bool operator()(const uint32 i, const uint32 j)
  269. {
  270. return keys[i] == keys[j];
  271. }
  272. };
  273. // extract i-th word from the kmer in a string set
  274. template <typename string_set_type>
  275. struct kmer_word_extractor_functor
  276. {
  277. typedef typename string_set_type::string_type sequence;
  278. const string_set_type string_set;
  279. const uint32 dna_symbol_size;
  280. const uint32 symbol_offset;
  281. const uint32 symbols_per_word;
  282. const SequenceSetKmerCoord* coords;
  283. const uint32 i;
  284. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  285. kmer_word_extractor_functor(const string_set_type _string_set, const uint32 _dna_symbol_size,
  286. const SequenceSetKmerCoord* _coords, const uint32 _i) :
  287. dna_symbol_size(_dna_symbol_size),
  288. symbol_offset(uint32(8u*sizeof(uint32)) - dna_symbol_size),
  289. symbols_per_word(uint32(8u * sizeof(uint32))/dna_symbol_size),
  290. coords(_coords), string_set(_string_set), i(_i) {}
  291. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  292. uint32 operator() (const uint32 kmer_idx) const
  293. {
  294. SequenceSetKmerCoord kmer_coord = coords[kmer_idx];
  295. const uint32 kmer_pos = kmer_coord.y;
  296. const uint32 kmer_size = kmer_coord.z - kmer_coord.y; // TODO: z-field is currently used for the global id, use kmer_size if fixed
  297. const sequence seq = string_set[kmer_coord.x];
  298. const uint32 seq_len = seq.length();
  299. uint32 word = 0u;
  300. for (uint32 j = 0; j < kmer_size; j++) {
  301. const uint32 jj = kmer_pos + i*symbols_per_word + j;
  302. const uint32 c = jj < seq_len ? char_to_iupac16(dna_to_char(seq[jj])) : 0u;
  303. word |= (c << (symbol_offset - j*dna_symbol_size));
  304. }
  305. return word;
  306. }
  307. };
  308. // add tuples of kmer and sequence counts
  309. struct kmer_count_tuple_sum
  310. {
  311. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  312. thrust::tuple<uint32, uint32> operator() (const thrust::tuple<uint32, uint32>& x, const thrust::tuple<uint32, uint32>& y) const
  313. {
  314. return thrust::tuple<uint32, uint32>(thrust::get<0>(x) + thrust::get<0>(y), thrust::get<1>(x) + thrust::get<1>(y));
  315. }
  316. };
  317. // check if a kmer occurs more than once in at least one sequence
  318. // given a tuple of the number of sequences that contain it and
  319. // the number of times it occurs in all the sequences
  320. struct is_unique_kmer
  321. {
  322. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  323. bool operator() (const thrust::tuple<uint32, uint32>& nSeq_nOcc) const
  324. {
  325. return thrust::get<0>(nSeq_nOcc) >= thrust::get<1>(nSeq_nOcc);
  326. }
  327. };
  328. // given the last id of a kmer in the group of consecutive equal kmers
  329. // mark all identical consecutive kmers as unique in the map
// given the last id of a kmer in the group of consecutive equal kmers
// mark all identical consecutive kmers as unique in the map
struct mark_kmer_uniqueness
{
    uint8* is_unique_map;                      // output: one flag per sorted kmer position
    const uint64* sorted_kmer_keys;            // packed kmer keys, sorted
    const SequenceSetKmerCoord* sorted_coords; // coordinates sorted in the same order as the keys
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    mark_kmer_uniqueness(uint8* _is_unique_map, const uint64* _sorted_kmer_keys, const SequenceSetKmerCoord* _sorted_coords) :
    is_unique_map(_is_unique_map), sorted_kmer_keys(_sorted_kmer_keys), sorted_coords(_sorted_coords) { }
    // kmer_id is the LAST index of a run of entries sharing the same
    // (key, region) pair; walk backwards and flag every member of the run
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    void operator() (const uint32 kmer_id) const
    {
        const uint64 kmer_key = sorted_kmer_keys[kmer_id];
        const uint32 rid = sorted_coords[kmer_id].w;
        int32 idx = kmer_id; // signed so the loop can step past index 0
        while(idx >= 0) {
            // stop as soon as we leave the run of identical (key, region) entries
            if(sorted_coords[idx].w != rid || sorted_kmer_keys[idx] != kmer_key) break;
            is_unique_map[idx] = 1; //TODO: thrust documentation states first, but last in experiments
            idx--;
        }
    }
};
// given a (unique id, kmer id) pair, where the kmer id is the LAST index
// of a run of identical (key, region) entries in the sorted order,
// store the unique id for every member of that run
struct store_kmer_unique_ids
{
    uint32* unique_id_map;                     // output: unique id per sorted kmer position
    const uint64* sorted_kmer_keys;            // packed kmer keys, sorted
    const SequenceSetKmerCoord* sorted_coords; // coordinates sorted in the same order as the keys
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    store_kmer_unique_ids(uint32* _unique_id_map, const uint64* _sorted_kmer_keys, const SequenceSetKmerCoord* _sorted_coords) :
    unique_id_map(_unique_id_map), sorted_kmer_keys(_sorted_kmer_keys), sorted_coords(_sorted_coords) { }
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    void operator() (const thrust::tuple<uint32, uint32>& uid_kid) const
    {
        const uint32 uid = thrust::get<0>(uid_kid); // unique id to assign
        const uint32 kid = thrust::get<1>(uid_kid); // last index of the run
        const uint64 kmer_key = sorted_kmer_keys[kid];
        const uint32 rid = sorted_coords[kid].w;
        int32 idx = kid; // signed so the loop can step past index 0
        while(idx >= 0) {
            // stop as soon as we leave the run of identical (key, region) entries
            if(sorted_coords[idx].w != rid || sorted_kmer_keys[idx] != kmer_key) break;
            unique_id_map[idx] = uid;
            idx--; //TODO: thrust documentation states first, but last in experiments
        }
    }
};
  374. //struct populate_unique_kmer_data
  375. //{
  376. // const uint64* sorted_kmer_keys;
  377. // const SequenceSetKmerCoord* sorted_coords;
  378. // const uint32* unique_kmer_idxs;
  379. //
  380. // // fill in unique kmer annotations
  381. // uint8* unique_flag_map;
  382. // uint32* unique_UID_map;
  383. //
  384. // NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  385. // populate_unique_kmer_data(const uint64* _sorted_kmer_keys, const SequenceSetKmerCoord* _coords,
  386. // const uint32* _unique_kmer_idxs, uint8* _unique_flag_map, uint32* _unique_UID_map) :
  387. // sorted_kmer_keys(_sorted_kmer_keys), sorted_coords(_coords), unique_kmer_idxs(_unique_kmer_idxs),
  388. // unique_flag_map(_unique_flag_map), unique_UID_map (_unique_UID_map) { }
  389. //
  390. // NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  391. // void operator() (const uint32 uid) const
  392. // {
  393. // const uint32 unique_kmer_idx = unique_kmer_idxs[uid];
  394. // const uint64 key = sorted_kmer_keys[unique_kmer_idx]; // find the group of coordinates with this key and region
  395. // const uint32 region = sorted_coords[unique_kmer_idx].w;
  396. // uint32 global_kmer_id = sorted_coords[unique_kmer_idx].z;
  397. // unique_flag_map[global_kmer_id] = 1;
  398. // unique_UID_map[global_kmer_id] = uid;
  399. //
  400. // int32 idx = unique_kmer_idx - 1;
  401. // while(idx >= 0) {
  402. // if(sorted_kmer_keys[idx] != key || sorted_coords[idx].w != region) break;
  403. // global_kmer_id = sorted_coords[idx].z;
  404. // unique_flag_map[global_kmer_id] = 1; //TODO: thrust documentation states first, but last in experiments
  405. // unique_UID_map[global_kmer_id] = uid;
  406. // idx--;
  407. // }
  408. // }
  409. //};
// for each unique kmer (identified by its UID), flag every occurrence of
// that kmer as unique and record its UID, indexed by GLOBAL kmer id
struct populate_unique_kmer_data
{
    const uint32* kmer_counts;                 // number of occurrences per unique kmer (indexed by UID)
    const SequenceSetKmerCoord* sorted_coords; // kmer coordinates in sorted order
    const uint32* unique_kmer_idxs;            // sorted-array index of each unique kmer (indexed by UID)
    // fill in unique kmer annotations (both indexed by global kmer id)
    uint8* unique_flag_map;
    uint32* unique_UID_map;
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    populate_unique_kmer_data(const uint32* _kmer_counts, const SequenceSetKmerCoord* _coords,
    const uint32* _unique_kmer_idxs, uint8* _unique_flag_map, uint32* _unique_UID_map) :
    kmer_counts(_kmer_counts), sorted_coords(_coords), unique_kmer_idxs(_unique_kmer_idxs),
    unique_flag_map(_unique_flag_map), unique_UID_map (_unique_UID_map) { }
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    void operator() (const uint32 uid) const
    {
        // unique_kmer_idx points at the LAST member of the run of `count`
        // equal kmers in the sorted order; walk the run backwards
        const uint32 unique_kmer_idx = unique_kmer_idxs[uid];
        const uint32 count = kmer_counts[uid];
        uint32 global_kmer_id = sorted_coords[unique_kmer_idx].z; // z holds the global id
        unique_flag_map[global_kmer_id] = 1;
        unique_UID_map[global_kmer_id] = uid;
        for(uint32 i = 1; i < count; i++) {
            global_kmer_id = sorted_coords[unique_kmer_idx - i].z; //TODO: thrust bug: pointer to last instead of first
            unique_flag_map[global_kmer_id] = 1; // TODO: not coalesced
            unique_UID_map[global_kmer_id] = uid;
        }
    }
};
  438. // marks kmers that have a prefix satisfying the following:
  439. // the prefix is unique and has a matching kmer
  440. // (i.e. it is not the last (k-1)mer in the sequence)
// marks kmers that have a prefix satisfying the following:
// the prefix is unique and has a matching kmer
// (i.e. it is not the last (k-1)mer in the sequence)
struct mark_unique_prefix_kmer
{
    uint8* unique_prefix_map; // output: flag per kmer id
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    mark_unique_prefix_kmer(uint8* _unique_prefix_map) : unique_prefix_map(_unique_prefix_map) { }
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    void operator() (const thrust::tuple<uint8, uint64>& uflag_prefix) const
    {
        const uint8 is_unique_prefix = thrust::get<0>(uflag_prefix);
        // NOTE(review): the tuple element is uint64 but is narrowed to uint32
        // here -- presumably kmer ids fit in 32 bits; confirm against callers
        const uint32 kmer_id = thrust::get<1>(uflag_prefix);
        // (uint32)-1 encodes "no matching kmer" (prefix is the sequence's last (k-1)mer)
        if((is_unique_prefix == 1) && (kmer_id != (uint32) -1)) {
            unique_prefix_map[kmer_id] = 1; // TODO: writes not coalesced
        }
    }
};
  456. // ---------- Super-kmers: chains of consecutive kmers
  457. // given a non-unique kmer N, finds the closest unique predecessor and successor kmers in the sequence, P and S
  458. // returns the new coordinate for the chain starting with the P and ending with S [P...N...S]
  459. // returns a dummy coordinate (N.x, -1, -1, 0) if no such previous unique kmer exists
  460. // TODO: could use shared memory here
// ---------- Super-kmers: chains of consecutive kmers
// given a non-unique kmer N, finds the closest unique predecessor and successor kmers in the sequence, P and S
// returns the new coordinate for the chain starting with the P and ending with S [P...N...S]
// returns a dummy coordinate (N.x, -1, -1, 0) if no such previous unique kmer exists
// TODO: could use shared memory here
template <typename string_set_type>
struct extract_super_kmers_functor
{
    typedef typename string_set_type::string_type sequence;
    const string_set_type string_set;
    const uint32* repeat_global_ids;     // global ids of the repeat kmers
    const uint8* global_unique_flag_map; // unsorted: 1 iff the kmer at that global id is unique
    const uint32* global_UID_map;        // UID per global kmer id
    const uint32* global_sorted_idx_map; // global id -> index in the sorted coordinate array
    const SequenceSetKmerCoord* coords;  // sorted
    const uint32 kmer_size;
    // structures to populate (indexed by the repeat kmer's position in repeat_global_ids)
    SequenceSetKmerCoord* super_coords;
    uint32* prefix_uids;
    uint32* suffix_uids;
    NVBIO_HOST_DEVICE
    extract_super_kmers_functor(const uint32* _repeat_global_ids,
    const uint8* _is_unique_map, const uint32* _global_UID_map, const uint32* _sorted_idx_map,
    const SequenceSetKmerCoord* _coords, const string_set_type _string_set, const uint32 _kmer_size,
    SequenceSetKmerCoord* _super_coords, uint32* _prefix_uids, uint32* _suffix_uids):
    repeat_global_ids(_repeat_global_ids), global_unique_flag_map(_is_unique_map), global_UID_map(_global_UID_map),
    global_sorted_idx_map(_sorted_idx_map),
    coords(_coords), string_set(_string_set), kmer_size(_kmer_size),
    super_coords(_super_coords), prefix_uids(_prefix_uids), suffix_uids(_suffix_uids){ }
    NVBIO_HOST_DEVICE
    void operator()(const uint32 idx) const {
        const uint32 global_id = repeat_global_ids[idx];
        // look up this repeat kmer's coordinate via the sorted-index map
        const SequenceSetKmerCoord coord = coords[global_sorted_idx_map[global_id]];
        const sequence seq = string_set[coord.x];
        const uint32 seq_len = seq.length();
        // start from the dummy coordinate; y/z stay -1 if no unique predecessor is found
        SequenceSetKmerCoord super_coord = make_uint4(coord.x, (uint32)-1, (uint32)-1, coord.w);
        uint32 prefix_id = (uint32) -1;
        uint32 suffix_id = (uint32) -1;
        // find the unique predecessor
        // NOTE(review): global_id - i is used to step back one kmer position at a
        // time -- assumes global ids are consecutive within a sequence; confirm
        // against how the ids are assigned (set_global_id*)
        uint32 i = 1u;
        while(i <= coord.y) { // search only this sequence
            if(global_unique_flag_map[global_id-i] == 1) {
                // chain starts at the unique predecessor and (tentatively)
                // ends at the end of this kmer
                super_coord.y = coord.y - i;
                super_coord.z = coord.y + kmer_size;
                prefix_id = global_UID_map[global_id-i];
                break;
            }
            i++;
        }
        if(super_coord.y != (uint32) -1) { // otherwise this chain will be ignored
            // find the unique successor
            i = 1u;
            while(i < seq_len - coord.y - kmer_size + 1) { // search only until the end of the sequence
                if(global_unique_flag_map[global_id+i] == 1) {
                    // extend the chain end to cover the unique successor
                    super_coord.z = coord.y + i + kmer_size;
                    suffix_id = global_UID_map[global_id+i];
                    break;
                }
                i++;
            }
        }
        // publish the chain coordinate and its bounding unique kmer UIDs
        super_coords[idx] = super_coord;
        prefix_uids[idx] = prefix_id;
        suffix_uids[idx] = suffix_id;
    }
};
  522. template <typename string_set_type>
  523. struct print_super_kmer_functor
  524. {
  525. typedef typename string_set_type::string_type sequence;
  526. const string_set_type string_set;
  527. const SequenceSetKmerCoord* coords;
  528. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  529. print_super_kmer_functor(const string_set_type _string_set, const SequenceSetKmerCoord* _coords) :
  530. coords(_coords), string_set(_string_set) {}
  531. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  532. void operator() (const uint32 kmer_idx) const
  533. {
  534. SequenceSetKmerCoord kmer_coord = coords[kmer_idx];
  535. const uint32 kmer_pos = kmer_coord.y;
  536. const uint32 kmer_size = kmer_coord.z - kmer_coord.y;
  537. const sequence seq = string_set[kmer_coord.x];
  538. const uint32 seq_len = seq.length();
  539. uint8 kmer_seq[14];
  540. for (uint32 k = 0; k < kmer_size; k++) {
  541. uint8 c = iupac16_to_char(char_to_iupac16(dna_to_char(seq[kmer_pos + k])));
  542. kmer_seq[k] = c;
  543. }
  544. kmer_seq[kmer_size] = '\0';
  545. printf("id: %llu, reg %u, seq %s \n", kmer_idx, kmer_coord.w, kmer_seq);
  546. }
  547. };
  548. // returns true if the super kmer does not represent a valid chain of kmers
  549. // based on the value of its unique predecessor id
  550. struct is_invalid_super_kmer
  551. {
  552. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  553. bool operator()(const uint32 chain_prefix_id)
  554. {
  555. return chain_prefix_id == (uint32) -1;
  556. }
  557. };
  558. struct is_unique_suffix_id : public thrust::unary_function<uint32,bool>
  559. {
  560. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  561. bool operator()(const uint32 suffix_id)
  562. {
  563. return suffix_id != (uint32) -1;
  564. }
  565. };
  566. struct is_unique_uid : public thrust::unary_function<uint32,uint8>
  567. {
  568. const uint32 max_unique_uid;
  569. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  570. is_unique_uid(const uint32 _max_unique_uid) : max_unique_uid(_max_unique_uid) { }
  571. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  572. uint8 operator()(const uint32 uid)
  573. {
  574. return uid < max_unique_uid;
  575. }
  576. };
  577. struct is_repeat_kmer
  578. {
  579. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  580. bool operator()(const uint8 flag)
  581. {
  582. return (flag != 1);
  583. }
  584. };
  585. // collapse kmers that start at the same position and belong to the same sequence
  586. // by just returning the one with max length
  587. struct collapse_same_start_kmers
  588. {
  589. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  590. thrust::tuple<SequenceSetKmerCoord, uint32, uint32> operator() (
  591. const thrust::tuple<SequenceSetKmerCoord, uint32, uint32>& x,
  592. const thrust::tuple<SequenceSetKmerCoord, uint32, uint32>& y) const
  593. {
  594. const SequenceSetKmerCoord i = thrust::get<0>(x);
  595. const SequenceSetKmerCoord j = thrust::get<0>(y);
  596. if(i.z > j.z) {
  597. return x;
  598. }
  599. return y;
  600. }
  601. };
  602. // computes the max prefix overlap with the closest lexicographically smaller super-kmer
  603. // assumes that the function is not called for idx == 0
  604. // returns the number of kmers by which the chains overlap
  605. // no overlap if the previous kmer is from a different region
// computes the max prefix overlap with the closest lexicographically smaller super-kmer
// assumes that the function is not called for idx == 0
// returns the number of kmers by which the chains overlap
// no overlap if the previous kmer is from a different region
template <typename string_set_type>
struct find_max_kmer_overlaps
{
    typedef typename string_set_type::string_type sequence;
    const string_set_type string_set;
    const SequenceSetKmerCoord* coords; // sorted in lexicographic order
    const uint32 kmer_size;
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    find_max_kmer_overlaps(const string_set_type _string_set, const SequenceSetKmerCoord* _coords,
    const uint32 _kmer_size): string_set(_string_set), coords(_coords), kmer_size(_kmer_size) { }
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    uint32 operator() (const uint32 idx) const
    {
        // compare against the lexicographic predecessor (idx-1 is valid per
        // the precondition above)
        const SequenceSetKmerCoord coord_curr = coords[idx];
        const SequenceSetKmerCoord coord_prev = coords[idx-1];
        // chains from different regions never overlap
        if(coord_curr.w != coord_prev.w) return 0;
        const sequence seq_curr = string_set[coord_curr.x];
        const sequence seq_prev = string_set[coord_prev.x];
        // the overlap can be at most the length of the shorter chain
        const uint32 max_overlap =
        (coord_curr.z - coord_curr.y) > (coord_prev.z - coord_prev.y) ?
        (coord_prev.z - coord_prev.y) : (coord_curr.z - coord_curr.y);
        // count matching leading symbols
        uint32 overlap = 0u;
        for (uint32 j = 0; j < max_overlap; j++)
        {
            if(seq_curr[coord_curr.y + j] == seq_prev[coord_prev.y +j]) {
                overlap++;
            } else {
                break;
            }
        }
        // convert from symbol overlap to kmer overlap: a prefix of `overlap`
        // symbols shares (overlap - kmer_size + 1) kmers, or none if shorter
        // than one kmer
        uint32 n_ovp = (overlap >= kmer_size) ? overlap - kmer_size + 1 : 0;
        return n_ovp;
    }
};
// returns the number of kmers that did not overlap with the lexicographic
// super-kmer predecessor and are not unique
// i.e. total number of kmers in the chain - number of overlapping kmers
// does not count the unique first and last kmers
struct num_non_overlapped_repeats : public thrust::unary_function<uint32,uint32>
{
    const SequenceSetKmerCoord* coords;     // super-kmer chain coords, lexicographically sorted
    const uint32* suffix_kmer_ids;          // trailing unique kmer id per chain, (uint32)-1 if none
    const uint32* overlaps;                 // per-chain kmer overlap with the predecessor
    const uint32 kmer_size;
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    num_non_overlapped_repeats(const SequenceSetKmerCoord* _coords,
        const uint32* _suffix_kmer_ids,
        const uint32* _overlaps,
        const uint32 _kmer_size):
    coords(_coords), suffix_kmer_ids(_suffix_kmer_ids), overlaps(_overlaps), kmer_size(_kmer_size) { }
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    uint32 operator() (const uint32 idx) const
    {
        const SequenceSetKmerCoord coord = coords[idx];
        // chain length in symbols; a chain of len symbols spans (len - k + 1) kmers
        uint32 len = coord.z - coord.y;
        const uint32 n_kmers = (len >= kmer_size) ? len - kmer_size + 1 : 0;
        uint32 n_unique = 0;
        if(overlaps[idx] == 0) {
            n_unique++; // skip the header unique kmer if this is the first chain starting with this kmer
            // otherwise, it will be included in the overlap
        }
        if(overlaps[idx] != n_kmers && suffix_kmer_ids[idx] != (uint32) -1) { // if not all the kmers overlapped
            n_unique++; // skip the last unique kmer
        }
        return n_kmers - overlaps[idx] - n_unique;
    }
};
  673. struct num_adj_repeats : public thrust::unary_function<uint32,uint32>
  674. {
  675. const SequenceSetKmerCoord* coords;
  676. const uint32* suffix_kmer_ids;
  677. const uint32* overlaps;
  678. const uint32 kmer_size;
  679. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  680. num_adj_repeats(const SequenceSetKmerCoord* _coords,
  681. const uint32* _suffix_kmer_ids,
  682. const uint32* _overlaps,
  683. const uint32 _kmer_size):
  684. coords(_coords), suffix_kmer_ids(_suffix_kmer_ids), overlaps(_overlaps), kmer_size(_kmer_size) { }
  685. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  686. uint32 operator() (const uint32 idx) const
  687. {
  688. const SequenceSetKmerCoord coord = coords[idx];
  689. uint32 len = coord.z - coord.y;
  690. const uint32 n_kmers = (len >= kmer_size) ? len - kmer_size + 1 : 0; // TODO: remove len check
  691. uint32 n_repeat_kmers_to_extract = n_kmers - overlaps[idx];
  692. if(overlaps[idx] == 0) {
  693. n_repeat_kmers_to_extract--; // skip the first unique kmer
  694. }
  695. if(overlaps[idx] != n_kmers && suffix_kmer_ids[idx] != (uint32) -1) { // if not all the kmers overlapped
  696. n_repeat_kmers_to_extract--; // skip the last unique kmer
  697. }
  698. // number of kmers to extract will be the same as number of edges, plus RU edge if exists
  699. uint32 n_adj = n_repeat_kmers_to_extract;
  700. if(overlaps[idx] != n_kmers && suffix_kmer_ids[idx] != (uint32) -1) { // if not all the kmers overlapped
  701. n_adj++; // add the last R-U edge
  702. }
  703. return n_adj;
  704. }
  705. };
// extracts, for each super-kmer chain, the repeat kmers that were not already
// stored by an overlapping lexicographic predecessor, and records the
// (prefix UID -> suffix UID) adjacency pairs used to build the graph edges
template <typename string_set_type>
struct extract_non_overlapped_repeats
{
    typedef typename string_set_type::string_type sequence;
    const string_set_type string_set;
    const SequenceSetKmerCoord* super_coords;       // chain coords, lexicographically sorted
    const uint32* super_prefix_global_ids;          // UID of each chain's leading unique kmer
    const uint32* super_suffix_global_ids;          // UID of each chain's trailing unique kmer, (uint32)-1 if none
    const uint32* chain_offsets;                    // prefix-summed output slots for extracted kmers
    const uint32* adj_extraction_offsets;           // prefix-summed output slots for adjacency pairs
    const uint32* chain_overlaps;                   // per-chain kmer overlap with the predecessor
    const uint32 kmer_size;
    const uint32 uid_offset;                        // UIDs for extracted repeats start here
    const uniform_seeds_functor<> seeder;
    SequenceSetKmerCoord* kmers_out;
    uint32* prefix_uids;
    uint32* suffix_uids;
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    extract_non_overlapped_repeats(const string_set_type _string_set,
        const SequenceSetKmerCoord* _chain_coords,
        const uint32* _chain_pref_gids, const uint32* _chain_suffix_gids,
        const uint32* _chain_offsets,
        const uint32* _adj_extraction_offsets,
        const uint32* _chain_overlaps,
        const uint32 _kmer_size, const uint32 _uid_offset,
        SequenceSetKmerCoord* _kmers_out, uint32* _prefix_uids, uint32* _suffix_uids):
    string_set(_string_set),
    super_coords(_chain_coords),
    super_prefix_global_ids(_chain_pref_gids),
    super_suffix_global_ids(_chain_suffix_gids),
    chain_offsets(_chain_offsets),
    adj_extraction_offsets(_adj_extraction_offsets),
    chain_overlaps(_chain_overlaps),
    kmer_size(_kmer_size), uid_offset(_uid_offset),
    kmers_out(_kmers_out),
    prefix_uids(_prefix_uids), suffix_uids(_suffix_uids),
    seeder(_kmer_size, 1u) { }
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    void operator() (const uint32 idx) const
    {
        const SequenceSetKmerCoord chain_coord = super_coords[idx];
        const sequence seq = string_set[chain_coord.x];
        const uint32 seq_len = seq.length();
        // global UID of the first kmer this chain will extract
        const uint32 offset = chain_offsets[idx] + uid_offset;
        const uint32 adj_offset = adj_extraction_offsets[idx];
        // 1 iff the chain is terminated by a unique suffix kmer
        const uint8 unique_suffix = (super_suffix_global_ids[idx] != (uint32) -1);
        const uint32 n_kmers_to_extract = chain_offsets[idx+1] - chain_offsets[idx];
        uint32 len = chain_coord.z - chain_coord.y;
        const uint32 n_kmers = (len >= kmer_size) ? len - kmer_size + 1 : 0; // TODO: remove len check
        // if everything overlapped or nothing to extract and no possible R-U edge
        if(chain_overlaps[idx] == n_kmers || (n_kmers_to_extract + unique_suffix == 0)) return;
        // position of the first extracted kmer: the extracted kmers are the
        // last n_kmers_to_extract of the chain, excluding the unique suffix
        const uint32 kmer_id = (chain_coord.z - unique_suffix) - n_kmers_to_extract - kmer_size + 1;
        for (uint32 j = 0; j < n_kmers_to_extract; j++) {
            const uint2 kmer = seeder.seed(seq_len, kmer_id + j);
            // emit the kmer coord; assign it UID (offset + j)
            kmers_out[offset + j] = make_uint4(chain_coord.x, kmer.x, kmer.y, chain_coord.w);
            // each extracted kmer is the suffix (destination) of one adjacency...
            suffix_uids[adj_offset + j] = offset + j;
            // ...and the prefix (source) of the next adjacency
            if(j < n_kmers_to_extract - 1) {
                prefix_uids[adj_offset + j + 1] = offset + j;
            } else {
                if(unique_suffix == 1) {
                    prefix_uids[adj_offset + j + 1] = offset + j; // the last kmer is a prefix only to the RU (k+1)mer
                }
            }
        }
        // the trailing R-U edge points at the chain's unique suffix kmer
        if(unique_suffix == 1) {
            suffix_uids[adj_offset + n_kmers_to_extract] = super_suffix_global_ids[idx];
        }
        // find the first prefix
        if(chain_overlaps[idx] == 0 || chain_overlaps[idx] == 1) {
            // overlap is just the unique header kmer or none
            // use the unique id of the kmer
            prefix_uids[adj_offset] = super_prefix_global_ids[idx];
        } else {
            // the first extracted kmer's predecessor was stored by an earlier chain:
            // walk back to the closest chain with a smaller overlap, which is the
            // one that actually extracted the shared kmers
            uint32 pred_chain_idx = idx - 1;
            while(1) {
                if(chain_overlaps[pred_chain_idx] >= chain_overlaps[idx]) { // not stored by the previous chain // >=2
                    pred_chain_idx--;
                } else {
                    // ignores the unique predecessor that overlapped and was not extracted
                    prefix_uids[adj_offset] = uid_offset + chain_offsets[pred_chain_idx] + (chain_overlaps[idx] - chain_overlaps[pred_chain_idx]) - 2; // last overlapped kmer
                    break;
                }
            }
        }
    }
};
// for each chain, credits the occurrence counts of its overlapped kmers to the
// predecessor chains that actually stored (extracted) those kmers; walks
// backwards over the lexicographic predecessors, skipping chains whose own
// kmers are fully shared with an even earlier chain
struct count_overlapped_adjacencies
{
    const uint32* adj_extraction_offsets; // per-chain offset into the extracted-adjacency arrays
    const uint32* overlaps;               // per-chain kmer overlap with the predecessor
    uint32* counts;                       // per-extracted-kmer counters, updated atomically
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    count_overlapped_adjacencies(const uint32* _adj_extraction_offsets, const uint32* _overlaps, uint32* _counts):
    adj_extraction_offsets(_adj_extraction_offsets),
    overlaps(_overlaps),
    counts(_counts) { }
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    void operator() (const uint32 idx) const
    {
        // number of overlapped kmers excluding the leading unique kmer
        int32 n_to_update = overlaps[idx] - 1;
        if(n_to_update <= 0) return;
        uint32 pred_chain_idx = idx - 1;
        while(n_to_update > 0) {
            const int32 pred_overlap = overlaps[pred_chain_idx] - 1;
            if(pred_overlap >= n_to_update) { // everything is shared with the pred
                // the predecessor did not store these kmers either -- keep walking back
                pred_chain_idx--;
            } else {
                // update the counts in this pred chain
                // (only the kmers the predecessor extracted itself, i.e. beyond
                //  its own overlap with its predecessor)
                const uint32 n_update_pred = n_to_update - (pred_overlap > 0 ? pred_overlap : 0);
                const uint32 pred_offset = adj_extraction_offsets[pred_chain_idx];
                for(uint32 i = 0; i < n_update_pred; i++) {
                    nvbio::atomic_add(&counts[pred_offset + i], 1);
                }
                n_to_update -= n_update_pred;
            }
        }
    }
};
// scatters repeat adjacencies into the per-node adjacency map: for each entry of
// the (sorted) prefix UID array, copies the paired suffix UIDs and edge counts
// into the slot range reserved for that prefix node
struct extract_repeat_adjacencies
{
    const uint32* node_offsets;       // per-node slot ranges into the adjacency map
    const uint32* prefix_uids;        // edge sources; equal UIDs assumed consecutive
    const uint32* suffix_uids;        // edge destinations, parallel to prefix_uids
    const uint32* edge_counts;        // edge multiplicities, parallel to prefix_uids
    uint32* node_adj_map;
    uint32* edge_counts_shuffled;
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    extract_repeat_adjacencies(const uint32* _node_offsets, const uint32* _prefix_uids,
        const uint32* _suffix_uids, const uint32* _edge_counts,
        uint32* _node_adj_map, uint32* _edge_counts_shuffled):
    node_offsets(_node_offsets),
    prefix_uids(_prefix_uids),
    suffix_uids(_suffix_uids),
    edge_counts(_edge_counts),
    node_adj_map(_node_adj_map),
    edge_counts_shuffled(_edge_counts_shuffled){ }
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    void operator() (const uint32 prefix_idx) const
    {
        const uint32 from_uid = prefix_uids[prefix_idx];
        const uint32 offset = node_offsets[from_uid];
        const uint32 num_edges = node_offsets[from_uid+1] - node_offsets[from_uid];
        // walk backwards over the run of entries sharing this UID and copy each
        // (suffix, count) pair into the node's slots
        // NOTE(review): invocations at different positions inside the same run
        // write overlapping slots with shifted values -- presumably the caller
        // relies on the run being fully (re)written; confirm against the call site
        uint32 i = 0;
        while(i < num_edges) { // at most this many prefixes to handle
            if(prefix_idx < i) break; // guard against walking off the front of the array
            const uint32 uid = prefix_uids[prefix_idx - i];
            if(uid != from_uid) break; // handled all the prefixes with the given UID
            node_adj_map[offset + i] = suffix_uids[prefix_idx - i];
            edge_counts_shuffled[offset + i] = edge_counts[prefix_idx - i];
            i++;
        }
    }
};
// scatters unique adjacencies into the per-node adjacency map, filling each
// node's slot range from the end; entries whose suffix is (uint32)-1 mark
// repeat edges and are skipped here (they are handled by extract_repeat_adjacencies)
struct extract_unique_adjacencies
{
    const uint32* node_offsets;       // per-node slot ranges into the adjacency map
    const uint32* prefix_uids;        // edge sources; equal UIDs assumed consecutive
    const uint32* suffix_uids;        // edge destinations; (uint32)-1 marks a repeat edge
    const uint32* edge_counts;        // edge multiplicities, parallel to prefix_uids
    uint32* node_adj_map;
    uint32* edge_counts_shuffled;
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    extract_unique_adjacencies(const uint32* _node_offsets, const uint32* _prefix_uids,
        const uint32* _suffix_uids, const uint32* _edge_counts,
        uint32* _node_adj_map, uint32* _edge_counts_shuffled):
    node_offsets(_node_offsets),
    prefix_uids(_prefix_uids),
    suffix_uids(_suffix_uids),
    edge_counts(_edge_counts),
    node_adj_map(_node_adj_map),
    edge_counts_shuffled(_edge_counts_shuffled){ }
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    void operator() (const uint32 prefix_idx) const
    {
        const uint32 from_uid = prefix_uids[prefix_idx];
        // last slot of this node's range: unique edges are appended back-to-front
        const uint32 offset_last = node_offsets[from_uid+1] - 1;
        const uint32 num_edges = node_offsets[from_uid+1] - node_offsets[from_uid];
        uint32 i = 0;
        uint32 n_unique = 0;
        while(i < num_edges) { // at most this many prefixes to handle
            if(prefix_idx < i) break; // guard against walking off the front of the array
            const uint32 uid = prefix_uids[prefix_idx - i];
            if(uid != from_uid) break; // handled all the prefixes with the given UID
            if(suffix_uids[prefix_idx - i] == (uint32) -1) { // repeat (TODO: fill with -1)
                i++;
                continue;
            }
            node_adj_map[offset_last - n_unique] = suffix_uids[prefix_idx - i]; // fill out from the end
            edge_counts_shuffled[offset_last - n_unique] = edge_counts[prefix_idx - i];
            i++;
            n_unique++;
        }
    }
};
  900. // converts a coordinate to the corresponding sequence -- used for debugging
  901. template <typename string_set_type>
  902. struct super_coord_to_seq
  903. {
  904. typedef typename string_set_type::string_type sequence;
  905. const string_set_type string_set;
  906. const SequenceSetKmerCoord* super_coords;
  907. const uint32* offsets;
  908. uint8* sequences;
  909. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  910. super_coord_to_seq(const string_set_type _string_set,
  911. const SequenceSetKmerCoord* _super_coords, const uint32* _offsets,
  912. uint8* _sequences) :
  913. string_set(_string_set), super_coords(_super_coords), offsets(_offsets),
  914. sequences(_sequences) {}
  915. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  916. void operator() (const uint64 idx) const
  917. {
  918. const SequenceSetKmerCoord coord = super_coords[idx];
  919. const sequence seq = string_set[coord.x];
  920. const uint32 seq_pos = coord.y;
  921. const uint32 offset = offsets[idx];
  922. for (uint32 i = 0; i < coord.z - coord.y; i++) {
  923. uint8 c = dna_to_char(seq[seq_pos + i]);
  924. sequences[offset + i] = c;
  925. }
  926. }
  927. };
// generate the coordinates of each kmer in the given set
template <typename string_set_type>
void D_KmerSet<string_set_type>::gen_kmer_coords()
{
    // enumerate one seed of length kmer_size at every position (step 1);
    // fills `coords` and returns the total kmer count
    n_kmers = enumerate_string_set_seeds(
        string_set,
        uniform_seeds_functor<>(kmer_size, 1u),
        coords);
    // record the index into the kmer array in the coordinates vector
    thrust::for_each(
        thrust::make_counting_iterator<uint32>(0u),
        thrust::make_counting_iterator<uint32>(0u) + n_kmers,
        set_global_id_region_id(plain_view(coords), plain_view(active_region_ids)));
}
// generate the 64-bit compacted kmer sequence keys
// assumes the coordinates have already been generated
template <typename string_set_type>
void D_KmerSet<string_set_type>::gen_kmer_64b_keys()
{
    kmers_64b.resize(n_kmers);
    // pack each kmer's symbols into one 64-bit key (2 bits per symbol)
    thrust::transform(
        coords.begin(),
        coords.begin() + n_kmers,
        kmers_64b.begin(),
        kmers_to_64b_functor<string_set_type>(kmer_size, 2, string_set));
}
// sort the kmers based on the 64-bit kmer sequence key only
// assumes the keys have already been generated
template <typename string_set_type>
void D_KmerSet<string_set_type>::sort_kmers_by_64b_keys()
{
    // identical keys from the same sequence will be consecutive
    // identical keys from the same region will also be consecutive
    // (the stable sort preserves the existing relative order of equal keys)
    thrust::stable_sort_by_key(
        kmers_64b.begin(),
        kmers_64b.begin() + n_kmers,
        coords.begin());
}
// stable sort the kmers based on the 64-bit kmer sequence key
// assumes the kmers are sorted by seq id only already
// assumes the keys have already been generated
template <typename string_set_type>
template <typename meta_iterator_type>
void D_KmerSet<string_set_type>::sort_kmers_by_64b_keys_meta(const meta_iterator_type meta_data)
{
    // the zip iterator permutes the caller's meta-data alongside the coords
    thrust::stable_sort_by_key(
        kmers_64b.begin(),
        kmers_64b.begin() + n_kmers,
        thrust::make_zip_iterator(thrust::make_tuple(coords.begin(), meta_data)));
}
// sort the kmers based on the 64-bit kmer sequence key by region
// assumes the keys have already been generated
template <typename string_set_type>
void D_KmerSet<string_set_type>::segmented_sort_kmers_by_64b_keys()
{
    // identical keys from the same sequence will be consecutive
    // identical keys from the same region will also be consecutive
    thrust::stable_sort_by_key(
        kmers_64b.begin(),
        kmers_64b.begin() + n_kmers,
        coords.begin());
    // second pass: stable sort by region id, so that keys end up sorted
    // within each region segment
    D_VectorU32::iterator region_ids = get_scratch_space(n_kmers);
    thrust::transform(
        coords.begin(),
        coords.begin() + n_kmers,
        region_ids,
        get_kmer_reg_id());
    thrust::stable_sort_by_key(
        region_ids,
        region_ids + n_kmers,
        thrust::make_zip_iterator(thrust::make_tuple(coords.begin(), kmers_64b.begin())));
    reset_scratch_space();
}
// sort kmers by key and set sequence id
// doing 2-pass sort to use the much faster thrust radix-sort
// instead of merge-sort on custom keys
template <typename string_set_type>
void D_KmerSet<string_set_type>::sort_kmers_by_64b_keys_seqid()
{
    // pass 1: sort by the 64-bit key
    thrust::stable_sort_by_key(
        kmers_64b.begin(),
        kmers_64b.begin() + n_kmers,
        coords.begin());
    // pass 2: stable sort by sequence id; equal seq-ids keep their key order,
    // yielding (seq id, key) lexicographic order overall
    D_VectorU32 seq_ids(n_kmers);
    thrust::transform(
        coords.begin(),
        coords.begin() + n_kmers,
        seq_ids.begin(),
        get_kmer_seq_id());
    thrust::stable_sort_by_key(
        seq_ids.begin(),
        seq_ids.begin() + n_kmers,
        thrust::make_zip_iterator(thrust::make_tuple(kmers_64b.begin(), coords.begin())));
}
// sort kmers by key and set sequence id
// sort the additional meta-data along with the kmers
// doing 2-pass sort to use the much faster thrust radix-sort
// instead of merge-sort on custom keys
template <typename string_set_type>
template <typename meta_iterator_type>
void D_KmerSet<string_set_type>::sort_kmers_by_64b_keys_seqid_meta(const meta_iterator_type meta_data)
{
    // pass 1: sort by the 64-bit key, carrying the meta-data along
    thrust::stable_sort_by_key(
        kmers_64b.begin(),
        kmers_64b.begin() + n_kmers,
        thrust::make_zip_iterator(thrust::make_tuple(coords.begin(), meta_data)));
    // pass 2: stable sort by sequence id => (seq id, key) order overall
    D_VectorU32 seq_ids(n_kmers);
    thrust::transform(
        coords.begin(),
        coords.begin() + n_kmers,
        seq_ids.begin(),
        get_kmer_seq_id());
    thrust::stable_sort_by_key(
        seq_ids.begin(),
        seq_ids.begin() + n_kmers,
        thrust::make_zip_iterator(thrust::make_tuple(kmers_64b.begin(), coords.begin(), meta_data)));
}
// sort kmers in lexicographic order - sorts the indices and returns the iterator
// kmer length can vary
// if kmer_size is set, all kmers are assumed to have this size
template <typename string_set_type>
D_VectorU32::iterator sort_kmers_lexicographic(const string_set_type string_set,
    D_VectorSetKmerCoord& coords,
    const uint32 n_kmers, const uint32 fixed_kmer_size,
    D_VectorU32& indices)
{
    const uint32 WORD_BITS = uint32(8u * sizeof(uint32));
    const uint32 SYMBOL_SIZE = 4u; // bits per symbol in the radix words
    const uint32 SYMBOLS_PER_WORD = WORD_BITS/SYMBOL_SIZE;
    // maximum size of a kmer in the set
    uint32 max_length = fixed_kmer_size;
    if(fixed_kmer_size == 0u) {
        // variable-length kmers: scan the coords for the longest one
        max_length = thrust::reduce(
            thrust::make_transform_iterator(coords.begin(), get_kmer_size()),
            thrust::make_transform_iterator(coords.begin() + n_kmers, get_kmer_size()),
            0u,
            thrust::maximum<uint32>());
    }
    // maximum number of words needed to represent a kmer
    const uint32 max_words = (max_length + SYMBOLS_PER_WORD-1)/SYMBOLS_PER_WORD;
    // ----- LSD radix-sort (word by word) -----
    //uint32 n_kmers = coords.size();
    // both buffers are double-sized: halves [0, n_kmers) and [n_kmers, 2*n_kmers)
    // are the ping-pong halves of the sort
    D_VectorU32 radices(2*n_kmers); // kmer words (sort keys)
    indices.resize(2*n_kmers); // kmer ids (values to be sorted)
    thrust::copy(
        thrust::make_counting_iterator<uint32>(0u),
        thrust::make_counting_iterator<uint32>(n_kmers),
        indices.begin());
    // ping-pong buffers
    cuda::SortBuffers<uint32*,uint32*> sort_buffers;
    cuda::SortEnactor sort_enactor;
    sort_buffers.selector = 0;
    sort_buffers.keys[0] = nvbio::device_view(radices);
    sort_buffers.keys[1] = nvbio::device_view(radices) + n_kmers;
    sort_buffers.values[0] = nvbio::device_view(indices);
    sort_buffers.values[1] = nvbio::device_view(indices) + n_kmers;
    // sort in LSD order
    for (int32 word_idx = max_words-1; word_idx >= 0; --word_idx) {
        // extract the given radix word from each kmer
        // (read/write through the currently selected half of the buffers)
        thrust::transform(
            indices.begin() + sort_buffers.selector * n_kmers,
            indices.begin() + sort_buffers.selector * n_kmers + n_kmers,
            radices.begin() + sort_buffers.selector * n_kmers,
            kmer_word_extractor_functor<string_set_type>(
                string_set,
                SYMBOL_SIZE,
                plain_view(coords),
                word_idx));
        // sort the words
        sort_enactor.sort(n_kmers, sort_buffers);
    }
    // the selector tracks which half holds the final sorted ids
    return indices.begin() + sort_buffers.selector * n_kmers;
}
// sorts the super-kmer coords (and their prefix/suffix UID arrays) in
// lexicographic order, segmented by region id
template <typename string_set_type>
void segmented_sort_super_kmers_lexicographic(const string_set_type string_set, uint32 n_coords,
    D_VectorSetKmerCoord& super_coords, D_VectorU32& super_prefix_ids, D_VectorU32& super_suffix_ids)
{
    D_VectorU32 ids(n_coords);
    // keep pristine copies: the gather below permutes into the originals
    D_VectorSetKmerCoord temp(super_coords);
    D_VectorU32 temp_pref(super_prefix_ids);
    D_VectorU32 temp_suf(super_suffix_ids);
    // first pass: sort by coord key
    D_VectorU32::iterator sorted_ids = sort_kmers_lexicographic(string_set, super_coords, n_coords, 0u, ids);
    // apply the sorted permutation to the coords and the two UID arrays
    thrust::gather(
        sorted_ids,
        sorted_ids + n_coords,
        temp.begin(),
        super_coords.begin());
    thrust::gather(
        sorted_ids,
        sorted_ids + n_coords,
        thrust::make_zip_iterator(thrust::make_tuple(temp_pref.begin(), temp_suf.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(super_prefix_ids.begin(), super_suffix_ids.begin())));
    // second pass: stable sort by region id => as a result coord keys will be sorted by region
    //D_VectorU32 reg_ids(n_coords);
    thrust::transform(
        super_coords.begin(),
        super_coords.begin() + n_coords,
        ids.begin(),
        get_kmer_reg_id());
    thrust::stable_sort_by_key(
        ids.begin(),
        ids.begin() + n_coords,
        thrust::make_zip_iterator(thrust::make_tuple(super_coords.begin(), super_prefix_ids.begin(), super_suffix_ids.begin())));
}
// count the kmer occurrences across all the sequences in the set
// assumes that the keys have been sorted
template <typename string_set_type>
void D_KmerSet<string_set_type>::count_kmers_rle()
{
    // count the number of distinct keys and extract them
    // (run-length encodes the sorted key stream: one key + count per run)
    kmers_64b_distinct.resize(n_kmers);
    kmers_counts.resize(n_kmers);
    D_VectorU8 d_temp_storage;
    n_distinct = cuda::runlength_encode(
        n_kmers,
        kmers_64b.begin(),
        kmers_64b_distinct.begin(),
        kmers_counts.begin(),
        d_temp_storage);
}
// count the kmer occurrences across all the sequences in the set
// assumes that the keys have been sorted
template <typename string_set_type>
void D_KmerSet<string_set_type>::count_kmers()
{
    // count the number of distinct keys and extract them
    // reduces runs of equal (key, region id) pairs: stores each run's start
    // index and its length (= occurrence count)
    // NOTE(review): n_unique here receives the number of distinct runs, not the
    // number of kmers occurring exactly once -- confirm this is the intent
    kmers_64b_unique_idxs.resize(n_kmers);
    kmers_counts.resize(n_kmers);
    n_unique = thrust::reduce_by_key(
        thrust::counting_iterator<uint32>(0),
        thrust::counting_iterator<uint32>(0) + n_kmers,
        thrust::constant_iterator<uint32>(1),
        kmers_64b_unique_idxs.begin(),
        kmers_counts.begin(),
        kmer_key_rid_eq(plain_view(kmers_64b), plain_view(coords))).first - kmers_64b_unique_idxs.begin();
}
// partition kmers by index into the sorted kmer key set
// based on whether they are unique or not
// a kmer is not unique if it occurs more than once in at least one sequence
template <typename string_set_type>
void D_KmerSet<string_set_type>::partition_kmers_by_uniqueness()
{
    // count the number of distinct kmers per sequence
    // (runs of equal (key, seq id) pairs in the sorted order)
    D_VectorU32::iterator distinct_idxs_per_seq = get_scratch_space(n_kmers);
    D_VectorU32::iterator count_per_seq = get_scratch_space(n_kmers);
    thrust::counting_iterator<uint64> ids(0);
    uint64 n_distinct_per_seq = thrust::reduce_by_key(
        ids,
        ids + n_kmers,
        thrust::constant_iterator<uint32>(1),
        distinct_idxs_per_seq,
        count_per_seq,
        kmer_key_sid_eq(plain_view(kmers_64b), plain_view(coords))).first - distinct_idxs_per_seq;
    // reduce again over (key, region id): for each distinct kmer, count the number
    // of sequences containing it and its total occurrence count
    D_VectorU32::iterator distinct_idxs = get_scratch_space(n_distinct_per_seq);
    D_VectorU32::iterator seq_count = get_scratch_space(n_distinct_per_seq); // number of sequences containing the same kmer
    D_VectorU32::iterator kmer_counts = get_scratch_space(n_distinct_per_seq); // number of kmer occurrences across all sequences
    n_distinct = thrust::reduce_by_key(
        distinct_idxs_per_seq,
        distinct_idxs_per_seq + n_distinct_per_seq,
        thrust::make_zip_iterator(thrust::make_tuple(thrust::constant_iterator<uint32>(1), count_per_seq)),
        distinct_idxs,
        thrust::make_zip_iterator(thrust::make_tuple(seq_count, kmer_counts)),
        kmer_key_rid_eq(plain_view(kmers_64b), plain_view(coords)),
        kmer_count_tuple_sum()).first - distinct_idxs;
    // partition the distinct indices into unique and non-unique kmers
    // (is_unique_kmer inspects the (seq_count, kmer_count) stencil pair)
    kmers_64b_unique_idxs.resize(n_distinct);
    kmers_counts.resize(n_distinct);
    n_unique = thrust::copy_if(
        thrust::make_zip_iterator(thrust::make_tuple(distinct_idxs, kmer_counts)),
        thrust::make_zip_iterator(thrust::make_tuple(distinct_idxs + n_distinct, kmer_counts + n_distinct)),
        thrust::make_zip_iterator(thrust::make_tuple(seq_count, kmer_counts)),
        thrust::make_zip_iterator(thrust::make_tuple(kmers_64b_unique_idxs.begin(), kmers_counts.begin())),
        is_unique_kmer()) - thrust::make_zip_iterator(thrust::make_tuple(kmers_64b_unique_idxs.begin(), kmers_counts.begin()));
    reset_scratch_space();
}
// builds global_unique_flag_map: for each kmer's global id, a flag telling
// whether the kmer is unique
template <typename string_set_type>
void D_KmerSet<string_set_type>::gen_global_unique_map()
{
    // mark the uniqueness flags in sorted-key order first...
    D_VectorU8 unique_map_sorted(n_kmers);
    thrust::for_each(
        kmers_64b_unique_idxs.begin(),
        kmers_64b_unique_idxs.begin() + n_unique,
        mark_kmer_uniqueness(plain_view(unique_map_sorted), plain_view(kmers_64b), plain_view(coords)));
    // ...then permute the flags into global-id order
    global_unique_flag_map.resize(n_kmers);
    thrust::gather(
        global_to_sorted_id_map.begin(),
        global_to_sorted_id_map.begin() + n_kmers,
        unique_map_sorted.begin(),
        global_unique_flag_map.begin());
}
// builds global_to_UID_map: for each kmer's global id, the unique kmer id (UID)
// of its key, or (uint32)-1 when the kmer is not unique
template <typename string_set_type>
void D_KmerSet<string_set_type>::gen_global_UID_map()
{
    // start with -1 everywhere (non-unique), then store the UIDs in sorted order
    D_VectorU32 id_map_sorted(n_kmers);
    thrust::fill(id_map_sorted.begin(), id_map_sorted.begin() + n_kmers, (uint32) -1);
    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator<uint32>(0u), kmers_64b_unique_idxs.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator<uint32>(0u) + n_unique, kmers_64b_unique_idxs.begin() + n_unique)),
        store_kmer_unique_ids(plain_view(id_map_sorted), plain_view(kmers_64b), plain_view(coords)));
    // permute the UIDs into global-id order
    global_to_UID_map.resize(n_kmers);
    thrust::gather(
        global_to_sorted_id_map.begin(),
        global_to_sorted_id_map.begin() + n_kmers,
        id_map_sorted.begin(),
        global_to_UID_map.begin());
}
// builds global_to_sorted_id_map: for each kmer's global id, its position in
// the current (sorted) coords order
template <typename string_set_type>
void D_KmerSet<string_set_type>::gen_global_to_sorted_id_map()
{
    global_to_sorted_id_map.resize(n_kmers);
    // scatter each sorted position keyed by the coord's stored global id
    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator<uint32>(0u), coords.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator<uint32>(0u) + n_kmers, coords.begin() + n_kmers)),
        global_to_sorted_id(plain_view(global_to_sorted_id_map)));
}
// populates the global uniqueness-flag map and the global UID map directly from
// the unique kmer indices (non-unique kmers keep the (uint32)-1 UID default)
template <typename string_set_type>
void D_KmerSet<string_set_type>::mark_unique_kmers()
{
    global_unique_flag_map.resize(n_kmers);
    global_to_UID_map.resize(n_kmers);
    // (uint32)-1 marks a non-unique kmer in the UID map
    thrust::fill(global_to_UID_map.begin(), global_to_UID_map.begin() + n_kmers, (uint32) -1);
    // one pass over the unique kmers fills both maps
    thrust::for_each(
        thrust::make_counting_iterator<uint32>(0u),
        thrust::make_counting_iterator<uint32>(0u) + n_unique,
        populate_unique_kmer_data(plain_view(kmers_counts), plain_view(coords), plain_view(kmers_64b_unique_idxs),
            plain_view(global_unique_flag_map), plain_view(global_to_UID_map)));
}
  1256. struct test_funct : public thrust::unary_function<uint32,uint32>
  1257. {
  1258. NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
  1259. uint32 operator()(const uint32 uid)
  1260. {
  1261. return uid;
  1262. }
  1263. };
// only keep the coordinates that have a unique prefix
template <typename string_set_type>
void D_KmerSet<string_set_type>::filter_coords_by_prefix_uniqueness(const D_VectorU8& unique_map)
{
    D_VectorSetKmerCoord unique_pref_coords(n_kmers);
    // stencil: the uniqueness flag of each coord's prefix kmer,
    // looked up by the prefix's global id
    uint32 n_unique_coords = thrust::copy_if( // remove_if
        coords.begin(),
        coords.begin() + n_kmers,
        thrust::make_permutation_iterator(
            unique_map.begin(),
            thrust::make_transform_iterator(coords.begin(), get_prefix_global_id())),
        unique_pref_coords.begin(),
        thrust::identity<uint8>()) - unique_pref_coords.begin();
    // compact the survivors back into `coords` and shrink the logical size
    thrust::copy(unique_pref_coords.begin(), unique_pref_coords.begin() + n_unique_coords, coords.begin());
    n_kmers = n_unique_coords;
}
  1280. // for every kmer that is not unique N
  1281. // find its closest unique predecessor P and successor S kmer in the same sequence
  1282. // and output a kmer chain (P ->... N ... -> S)
  1283. // as a new 'super' kmer coordinate
  1284. template <typename string_set_type>
  1285. void D_KmerSet<string_set_type>::extract_super_kmers()
  1286. {
  1287. // collect the ids of the repeat kmers
  1288. kmers_64b_repeat_idxs.resize(n_kmers);
  1289. uint32 n_chains = thrust::copy_if(
  1290. thrust::make_counting_iterator<uint32>(0u),
  1291. thrust::make_counting_iterator<uint32>(0u) + n_kmers,
  1292. global_unique_flag_map.begin(),
  1293. kmers_64b_repeat_idxs.begin(),
  1294. thrust::logical_not<uint8>()) - kmers_64b_repeat_idxs.begin();
  1295. D_VectorSetKmerCoord super_kmers(n_chains);
  1296. D_VectorU32 prefix_uids(n_chains);
  1297. D_VectorU32 suffix_uids(n_chains);
  1298. thrust::for_each(
  1299. thrust::make_counting_iterator(0),
  1300. thrust::make_counting_iterator(0) + n_chains,
  1301. extract_super_kmers_functor<string_set_type>(
  1302. plain_view(kmers_64b_repeat_idxs),
  1303. plain_view(global_unique_flag_map),
  1304. plain_view(global_to_UID_map),
  1305. plain_view(global_to_sorted_id_map),
  1306. plain_view(coords),
  1307. string_set,
  1308. kmer_size,
  1309. plain_view(super_kmers),
  1310. plain_view(prefix_uids),
  1311. plain_view(suffix_uids)));
  1312. // collapse chains that belong to the same sequence and same unique kmer predecessor
  1313. // (these will be consecutive by construction)
  1314. // keep the max chain length coord per same kmer and seq id
  1315. // -- will keep one copy of the chains that is terminated with a unique kmer
  1316. // and the longest chain amongst the ones that are not
  1317. super_coords.resize(n_chains);
  1318. super_prefix_uids.resize(n_chains);
  1319. super_suffix_uids.resize(n_chains);
  1320. uint32 n_collapsed_chains = thrust::reduce_by_key(
  1321. thrust::make_counting_iterator(0u),
  1322. thrust::make_counting_iterator(0u) + n_chains,
  1323. thrust::make_zip_iterator(thrust::make_tuple(super_kmers.begin(), prefix_uids.begin(), suffix_uids.begin())),
  1324. thrust::make_discard_iterator(),
  1325. thrust::make_zip_iterator(thrust::make_tuple(super_coords.begin(), super_prefix_uids.begin(), super_suffix_uids.begin())),
  1326. kmer_pos_sid_eq(plain_view(super_kmers)),
  1327. collapse_same_start_kmers()).second - thrust::make_zip_iterator(thrust::make_tuple(super_coords.begin(), super_prefix_uids.begin(), super_suffix_uids.begin()));
  1328. // eliminate non-unique kmers that are not preceded by a unique kmer
  1329. D_VectorSetKmerCoord t1(n_chains);
  1330. D_VectorU32 t2(n_chains);
  1331. D_VectorU32 t3(n_chains);
  1332. n_super_coords = thrust::remove_copy_if(
  1333. thrust::make_zip_iterator(thrust::make_tuple(
  1334. super_coords.begin(),
  1335. super_prefix_uids.begin(),
  1336. super_suffix_uids.begin())),
  1337. thrust::make_zip_iterator(thrust::make_tuple(
  1338. super_coords.begin() + n_collapsed_chains,
  1339. super_prefix_uids.begin() + n_collapsed_chains,
  1340. super_suffix_uids.begin() + n_collapsed_chains)),
  1341. super_prefix_uids.begin(),
  1342. thrust::make_zip_iterator(thrust::make_tuple(
  1343. t1.begin(),
  1344. t2.begin(),
  1345. t3.begin())),
  1346. is_invalid_super_kmer()) - thrust::make_zip_iterator(thrust::make_tuple(
  1347. t1.begin(),
  1348. t2.begin(),
  1349. t3.begin()));
  1350. thrust::copy(t1.begin(), t1.begin() + n_chains, super_coords.begin());
  1351. thrust::copy(t2.begin(), t2.begin() + n_chains, super_prefix_uids.begin());
  1352. thrust::copy(t3.begin(), t3.begin() + n_chains, super_suffix_uids.begin());
  1353. //super_coords.erase(super_coords.begin() + n_super_coords, super_coords.end());
  1354. //super_prefix_uids.erase(super_prefix_uids.begin() + n_super_coords, super_prefix_uids.end());
  1355. //super_suffix_uids.erase(super_suffix_uids.begin() + n_super_coords, super_suffix_uids.end());
  1356. // printf("Collapsed Filtered Super Kmer Coords: \n");
  1357. // for(uint64 i = 0; i < n_collapsed_chains_filtered; i++) {
  1358. // SequenceSetKmerCoord c = (SequenceSetKmerCoord) super_coords[i];
  1359. // printf("coord [%u %u %u %u] pref %llu suf %llu\n", c.x, c.y, c.z, c.w, (uint64) super_prefix_global_ids[i],
  1360. // (uint64) super_suffix_global_ids[i]);
  1361. // }
  1362. reset_scratch_space();
  1363. }
// collapses lexicographically sorted chains of kmers by computing their prefix overlaps
// and extracting kmers not included in the overlapping region
// along with kmer adjacencies and adjacency count information
//
// outputs (all device vectors, resized here):
//   kmers_out      - the n_unique unique kmers followed by the extracted repeat kmers
//   prefix_ids_out - per extracted (k+1)mer adjacency: prefix kmer id
//   suffix_ids_out - per extracted (k+1)mer adjacency: suffix kmer id
//   adj_counts     - multiplicity of each extracted adjacency
template <typename string_set_type>
void D_KmerSet<string_set_type>::collapse_and_extract_non_overlaps(D_VectorSetKmerCoord& kmers_out,
    D_VectorU32& prefix_ids_out, D_VectorU32& suffix_ids_out, D_VectorU32& adj_counts)
{
    uint32 n_super_kmers = n_super_coords;
    // find overlap with the previous chain (in sorted order)
    // overlap is measured in # of shared prefix kmers
    // slot 0 is deliberately left untouched: the first chain has no predecessor
    // NOTE(review): if n_super_kmers == 0, begin() + 1 on the empty vector below
    // is invalid -- presumably callers guarantee at least one super coord; confirm
    D_VectorU32 max_predecessor_overlaps(n_super_kmers);
    thrust::transform(
        thrust::make_counting_iterator<uint32>(1u), // skip the first chain
        thrust::make_counting_iterator<uint32>(0u) + n_super_kmers,
        max_predecessor_overlaps.begin() + 1,
        find_max_kmer_overlaps<string_set_type>(string_set, plain_view(super_coords), kmer_size));
    // find the number of kmer nodes that need to be created
    // the first kmer in a chain is a unique kmer and should be ignored
    // the last kmer in the chain might be unique, in which case it is also ignored
    thrust::device_vector<uint8> temp_storage;
    // chain_extraction_offsets[i+1] = total repeat kmers to extract from chains 0..i
    // (offset by one so that offsets[i] is the output start of chain i; slot 0 stays 0)
    D_VectorU32 chain_extraction_offsets(n_super_kmers + 1);
    cuda::inclusive_scan(
        n_super_kmers,
        thrust::make_transform_iterator(
            thrust::make_counting_iterator<uint32>(0u),
            num_non_overlapped_repeats(
                plain_view(super_coords),
                plain_view(super_suffix_uids),
                plain_view(max_predecessor_overlaps), kmer_size)),
        chain_extraction_offsets.begin() + 1,
        thrust::plus<uint32>(),
        temp_storage);
    // same scheme for the (k+1)mer adjacency output positions
    D_VectorU32 adj_extraction_offsets(n_super_kmers + 1);
    cuda::inclusive_scan(
        n_super_kmers,
        thrust::make_transform_iterator(
            thrust::make_counting_iterator<uint32>(0u),
            num_adj_repeats(
                plain_view(super_coords),
                plain_view(super_suffix_uids),
                plain_view(max_predecessor_overlaps), kmer_size)),
        adj_extraction_offsets.begin() + 1,
        thrust::plus<uint32>(),
        temp_storage);
    // extract non-unique kmers and the (k+1)mer prefix and suffix kmer ids from the collapsed chains
    // the last scan entry is the grand total across all chains
    n_repeat = chain_extraction_offsets[n_super_kmers];
    uint32 n_prefsuf_to_extract = adj_extraction_offsets[n_super_kmers];
    //printf("n_super_coords %u \n", n_super_coords);
    //printf("n_repeat %u \n", n_repeat);
    //printf("Num X-R-X edges %u \n", n_prefsuf_to_extract);
    // allocate the necessary storage and populate
    // repeat kmers are appended after the n_unique unique kmers in kmers_out
    kmers_out.resize(n_unique + n_repeat);
    prefix_ids_out.resize(n_prefsuf_to_extract);
    suffix_ids_out.resize(n_prefsuf_to_extract);
    thrust::for_each(
        thrust::make_counting_iterator<uint32>(0u),
        thrust::make_counting_iterator<uint32>(0u) + n_super_kmers,
        extract_non_overlapped_repeats<string_set_type>(
            string_set,
            plain_view(super_coords),
            plain_view(super_prefix_uids),
            plain_view(super_suffix_uids),
            plain_view(chain_extraction_offsets),
            plain_view(adj_extraction_offsets),
            plain_view(max_predecessor_overlaps),
            kmer_size, n_unique,
            plain_view(kmers_out),
            plain_view(prefix_ids_out),
            plain_view(suffix_ids_out)));
    // count the number of repeated kmer adjacencies
    // every adjacency starts with count 1; overlapped chains then add their
    // multiplicities on top (chain 0 has no predecessor, hence the 1u start)
    adj_counts.resize(n_prefsuf_to_extract);
    thrust::fill(adj_counts.begin(), adj_counts.begin() + n_prefsuf_to_extract, 1);
    thrust::for_each(
        thrust::make_counting_iterator<uint32>(1u),
        thrust::make_counting_iterator<uint32>(0u) + n_super_kmers,
        count_overlapped_adjacencies(
            plain_view(adj_extraction_offsets),
            plain_view(max_predecessor_overlaps),
            plain_view(adj_counts)));
}