123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264 |
- /*
- * nvbio
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of the NVIDIA CORPORATION nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
- #include "mem-search.h"
- #include "options.h"
- #include "pipeline.h"
- #include "util.h"
- #include <nvbio/basic/numbers.h>
- #include <nvbio/basic/algorithms.h>
- #include <nvbio/basic/timer.h>
- #include <nvbio/io/sequence/sequence_mmap.h>
- #include <thrust/iterator/counting_iterator.h>
- #include <thrust/iterator/transform_iterator.h>
- #include <thrust/sort.h>
- using namespace nvbio;
- static io::SequenceData *load_genome(const char *name)
- {
- if (command_line_options.genome_use_mmap)
- {
- // try loading via mmap first
- io::SequenceDataMMAP *mmap_loader = io::map_sequence_file( name );
- if (mmap_loader)
- return mmap_loader;
- }
- // fall back to file name
- io::SequenceDataHost *file_loader = io::load_sequence_file( DNA, name );
- if (file_loader == NULL)
- {
- log_error(stderr, "could not load genome %s\n");
- exit(1);
- }
- return file_loader;
- }
- static io::FMIndexData *load_index(const char *name, uint32 flags)
- {
- if (command_line_options.genome_use_mmap)
- {
- // try loading via mmap first
- io::FMIndexDataMMAP *mmap_loader = new io::FMIndexDataMMAP();
- if (mmap_loader->load(name))
- return mmap_loader;
- delete mmap_loader;
- }
- // fall back to file name
- io::FMIndexDataHost *file_loader = new io::FMIndexDataHost();
- if (!file_loader->load(name, flags))
- {
- log_error(stderr, "could not load index %s\n");
- exit(1);
- }
- return file_loader;
- }
- void mem_init(struct pipeline_state *pipeline)
- {
- // load the genome on the host
- pipeline->mem.reference_data_host = load_genome(command_line_options.genome_file_name);
- // copy genome data to device
- pipeline->mem.reference_data_device = new io::SequenceDataDevice(*pipeline->mem.reference_data_host);
- // this specifies which portions of the FM index data to load
- const uint32 fm_flags = io::FMIndexData::FORWARD |
- io::FMIndexData::REVERSE |
- io::FMIndexData::SA;
- // load the genome on the host
- pipeline->mem.fmindex_data_host = load_index(command_line_options.genome_file_name, fm_flags);
- // copy genome data to device
- pipeline->mem.fmindex_data_device = new io::FMIndexDataDevice(*pipeline->mem.fmindex_data_host, fm_flags);
- pipeline->mem.f_index = pipeline->mem.fmindex_data_device->index();
- pipeline->mem.r_index = pipeline->mem.fmindex_data_device->rindex();
- }
- // search MEMs for all reads in batch
- void mem_search(struct pipeline_state *pipeline, const io::SequenceDataDevice *reads)
- {
- ScopedTimer<float> timer( &pipeline->stats.search_time ); // keep track of the time spent here
- struct mem_state *mem = &pipeline->mem;
- const uint32 n_reads = reads->size();
- // reset the filter
- mem->mem_filter = mem_state::mem_filter_type();
- const io::SequenceDataAccess<DNA_N> read_access( *reads );
- // search for MEMs
- mem->mem_filter.rank(
- mem->f_index,
- mem->r_index,
- read_access.sequence_string_set(),
- command_line_options.min_intv,
- command_line_options.max_intv,
- command_line_options.min_seed_len,
- command_line_options.split_len,
- command_line_options.split_width);
- log_verbose(stderr, "%.1f average ranges\n", float(mem->mem_filter.n_ranges()) / float(n_reads));
- log_verbose(stderr, "%.1f average MEMs\n", float(mem->mem_filter.n_mems()) / float(n_reads));
- optional_device_synchronize();
- nvbio::cuda::check_error("mem-search kernel");
- }
- // given the first read in a chunk, determine a suitably sized chunk of reads
- // (for which we can locate all MEMs in one go), updating pipeline::chunk
- void fit_read_chunk(
- struct pipeline_state *pipeline,
- const io::SequenceDataDevice *reads,
- const uint32 read_begin) // first read in the chunk
- {
- const ScopedTimer<float> timer( &pipeline->stats.search_time ); // keep track of the time spent here
- struct mem_state *mem = &pipeline->mem;
- const uint32 max_hits = command_line_options.mems_batch;
- //
- // use a binary search to locate the ending read forming a chunk with up to max_hits
- //
- pipeline->chunk.read_begin = read_begin;
- // skip pathological cases
- if (mem->mem_filter.n_ranges() == 0)
- {
- pipeline->chunk.read_end = reads->size();
- pipeline->chunk.mem_begin = 0u;
- pipeline->chunk.mem_end = 0u;
- return;
- }
- // determine the index of the first hit in the chunk
- pipeline->chunk.mem_begin = mem->mem_filter.first_hit( read_begin );
- // perform a binary search to find the end of our batch
- pipeline->chunk.read_end = nvbio::string_batch_bound( mem->mem_filter, pipeline->chunk.mem_begin + max_hits );
- // determine the index of the ending hit in the chunk
- pipeline->chunk.mem_end = mem->mem_filter.first_hit( pipeline->chunk.read_end );
- // check whether we couldn't produce a non-empty batch
- if (pipeline->chunk.mem_begin == pipeline->chunk.mem_end &&
- pipeline->chunk.mem_end < mem->mem_filter.n_mems())
- {
- const uint32 mem_count = mem->mem_filter.first_hit( read_begin+1 ) - pipeline->chunk.mem_begin;
- log_error(stderr,"read %u/%u has too many MEMs (%u), exceeding maximum batch size\n", read_begin, reads->size(), mem_count );
- exit(1);
- }
- }
- // a functor to extract the reference location from a mem
- struct mem_loc_functor
- {
- typedef mem_state::mem_type argument_type;
- typedef uint64 result_type;
- NVBIO_HOST_DEVICE
- uint64 operator() (const argument_type mem) const
- {
- return uint64( mem.index_pos() ) | (uint64( mem.string_id() ) << 32);
- }
- };
- // a functor to extract the reference left coordinate from a mem
- struct mem_left_coord_functor
- {
- typedef mem_state::mem_type argument_type;
- typedef uint64 result_type;
- NVBIO_HOST_DEVICE
- uint64 operator() (const argument_type mem) const
- {
- return uint64( mem.span().x ) | (uint64( mem.string_id() ) << 32);
- }
- };
- // locate all mems in the range defined by pipeline::chunk
- void mem_locate(struct pipeline_state *pipeline, const io::SequenceDataDevice *reads)
- {
- const ScopedTimer<float> timer( &pipeline->stats.locate_time ); // keep track of the time spent here
- struct mem_state *mem = &pipeline->mem;
- struct chains_state<device_tag> *chn = &pipeline->chn;
- if (chn->mems.size() < command_line_options.mems_batch)
- {
- chn->mems.resize( command_line_options.mems_batch );
- chn->mems_index.resize( command_line_options.mems_batch );
- chn->mems_chain.resize( command_line_options.mems_batch );
- }
- const uint32 n_mems = pipeline->chunk.mem_end - pipeline->chunk.mem_begin;
- // skip pathological cases
- if (n_mems == 0u)
- return;
- chn->n_mems = n_mems;
- mem->mem_filter.locate(
- pipeline->chunk.mem_begin,
- pipeline->chunk.mem_end,
- chn->mems.begin() );
- // sort the mems by reference location
- nvbio::vector<device_tag,uint64> loc( n_mems );
- thrust::transform(
- chn->mems.begin(),
- chn->mems.begin() + n_mems,
- loc.begin(),
- mem_left_coord_functor() );
- //mem_loc_functor() );
- thrust::copy(
- thrust::make_counting_iterator<uint32>(0u),
- thrust::make_counting_iterator<uint32>(0u) + n_mems,
- chn->mems_index.begin() );
- // TODO: this is slow, switch to nvbio::cuda::SortEnactor
- thrust::sort_by_key(
- loc.begin(),
- loc.begin() + n_mems,
- chn->mems_index.begin() );
- optional_device_synchronize();
- nvbio::cuda::check_error("mem-locate kernel");
- pipeline->stats.n_mems += n_mems; // keep track of the number of mems produced
- }
|