XuMengJie
/
agv_navigation_ws


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716
							/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// sw-benchmark.cu
//

//
// This project supports comparisons against multi-core SSE-enabled CPUs using
// conditional compilation of the SSW library:
//
//  https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library
//
// In order to perform these additional tests, the user must download ssw.h and ssw.c
// from the above repository, copy them in the sw-benchmark directory, and run cmake with
// the option -DSSWLIB=ON.
//
#if defined(SSWLIB)
#include "ssw.h"
#include <omp.h>
#endif

#include <nvbio/basic/timer.h>
#include <nvbio/basic/console.h>
#include <nvbio/basic/cuda/ldg.h>
#include <nvbio/basic/packedstream.h>
#include <nvbio/basic/packedstream_loader.h>
#include <nvbio/basic/vector_view.h>
#include <nvbio/basic/shared_pointer.h>
#include <nvbio/io/sequence/sequence.h>
#include <nvbio/fasta/fasta.h>
#include <nvbio/basic/dna.h>
#include <nvbio/alignment/alignment.h>
#include <nvbio/alignment/batched.h>
#include <nvbio/alignment/sink.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <algorithm>

enum { MAX_READ_LENGTH = 1024 };

using namespace nvbio;

enum { CACHE_SIZE = 64 };
typedef nvbio::lmem_cache_tag<CACHE_SIZE>                                       lmem_cache_tag_type;
typedef nvbio::uncached_tag                                                     uncached_tag_type;

enum { REF_BITS       = 2 };
enum { REF_BIG_ENDIAN = false };

//
// An alignment stream class to be used in conjunction with the BatchAlignmentScore class
//
template <typename t_aligner_type, typename cache_type = lmem_cache_tag_type>
struct AlignmentStream
{
    typedef t_aligner_type                                                          aligner_type;

    typedef nvbio::cuda::ldg_pointer<uint32>                                        storage_iterator;

    typedef nvbio::PackedStringLoader<
        storage_iterator,
        io::SequenceDataTraits<DNA_N>::SEQUENCE_BITS,
        io::SequenceDataTraits<DNA_N>::SEQUENCE_BIG_ENDIAN,cache_type>                          pattern_loader_type;
    typedef typename pattern_loader_type::input_iterator                                        uncached_pattern_iterator;
    typedef typename pattern_loader_type::iterator                                              pattern_iterator;
    typedef nvbio::vector_view<pattern_iterator>                                                pattern_string;

    typedef nvbio::PackedStringLoader<
        storage_iterator,
        REF_BITS,
        REF_BIG_ENDIAN,uncached_tag_type>                                                       text_loader_type;
    typedef typename text_loader_type::input_iterator                                           uncached_text_iterator;
    typedef typename text_loader_type::iterator                                                 text_iterator;
    typedef nvbio::vector_view<text_iterator>                                                   text_string;

    // an alignment context
    struct context_type
    {
        int32                   min_score;
        aln::BestSink<int32>    sink;
    };
    // a container for the strings to be aligned
    struct strings_type
    {
        pattern_loader_type         pattern_loader;
        text_loader_type            text_loader;
        pattern_string              pattern;
        aln::trivial_quality_string quals;
        text_string                 text;
    };

    // constructor
    AlignmentStream(
        aligner_type        _aligner,
        const uint32        _count,
        const uint32*       _offsets,
        const uint32*       _patterns,
        const uint32        _max_pattern_len,
        const uint32        _total_pattern_len,
        const uint32*       _text,
        const uint32        _text_len,
               int16*       _scores) :
        m_aligner           ( _aligner ),
        m_count             (_count),
        m_max_pattern_len   (_max_pattern_len),
        m_total_pattern_len (_total_pattern_len),
        m_text_len          (_text_len),
        m_offsets           (_offsets),
        m_patterns          (storage_iterator(_patterns)),
        m_text              (storage_iterator(_text)),
        m_scores            (_scores) {}

    // get the aligner
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    const aligner_type& aligner() const { return m_aligner; };

    // return the maximum pattern length
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    uint32 max_pattern_length() const { return m_max_pattern_len; }

    // return the maximum text length
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    uint32 max_text_length() const { return m_text_len; }

    // return the stream size
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    uint32 size() const { return m_count; }

    // return the i-th pattern's length
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    uint32 pattern_length(const uint32 i, context_type* context) const { return m_offsets[i+1] - m_offsets[i]; }

    // return the i-th text's length
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    uint32 text_length(const uint32 i, context_type* context) const { return m_text_len; }

    // return the total number of cells
    uint64 cells() const { return uint64( m_total_pattern_len ) * uint64( m_text_len ); }

    // initialize the i-th context
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    bool init_context(
        const uint32    i,
        context_type*   context) const
    {
        context->min_score = Field_traits<int32>::min();
        return true;
    }

    // initialize the i-th context
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    void load_strings(
        const uint32        i,
        const uint32        window_begin,
        const uint32        window_end,
        const context_type* context,
              strings_type* strings) const
    {
        const uint32 offset = m_offsets[i];
        const uint32 length = m_offsets[i+1] - offset;

        strings->text = text_string( m_text_len,
            strings->text_loader.load(
                m_text,
                m_text_len,
                make_uint2( window_begin, window_end ),
                false ) );

        strings->pattern = pattern_string( length,
            strings->pattern_loader.load( m_patterns + offset, length ) );
    }

    // handle the output
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    void output(
        const uint32        i,
        const context_type* context) const
    {
        // copy the output score
        m_scores[i] = context->sink.score;
    }

    aligner_type                m_aligner;
    uint32                      m_count;
    uint32                      m_max_pattern_len;
    uint32                      m_total_pattern_len;
    uint32                      m_text_len;
    const uint32*               m_offsets;
    uncached_pattern_iterator   m_patterns;
    uncached_text_iterator      m_text;
    int16*                      m_scores;
};

// A simple kernel to test the speed of alignment without the possible overheads of the BatchAlignmentScore interface
//
template <uint32 BLOCKDIM, uint32 MAX_PATTERN_LEN, typename aligner_type, typename score_type>
__global__ void alignment_test_kernel(
    const aligner_type aligner,
    const uint32        N_probs,
    const uint32*       offsets,
    const uint32*       pattern_ptr,
    const uint32        text_len,
    const uint32*       text_ptr,
    score_type*         score)
{
    const uint32 tid = blockIdx.x * BLOCKDIM + threadIdx.x;

    typedef lmem_cache_tag_type                                                 lmem_cache_type;
    typedef nvbio::cuda::ldg_pointer<uint32>                                    storage_iterator;

    typedef nvbio::PackedStringLoader<
        storage_iterator,
        io::SequenceDataTraits<DNA_N>::SEQUENCE_BITS,
        io::SequenceDataTraits<DNA_N>::SEQUENCE_BIG_ENDIAN,lmem_cache_type>                     pattern_loader_type;
    typedef typename pattern_loader_type::input_iterator                                        uncached_pattern_iterator;
    typedef typename pattern_loader_type::iterator                                              pattern_iterator;
    typedef nvbio::vector_view<pattern_iterator>                                                pattern_string;

    typedef nvbio::PackedStringLoader<
        storage_iterator,
        REF_BITS,
        REF_BIG_ENDIAN,uncached_tag_type>                                                       text_loader_type;
    typedef typename text_loader_type::input_iterator                                           uncached_text_iterator;
    typedef typename text_loader_type::iterator                                                 text_iterator;
    typedef nvbio::vector_view<text_iterator>                                                   text_string;

    if (tid >= N_probs)
        return;

    const uint32 pattern_off = offsets[tid];
    const uint32 pattern_len = offsets[tid+1] - pattern_off;

    pattern_loader_type pattern_loader;
    pattern_string pattern = pattern_string( pattern_len, pattern_loader.load( uncached_pattern_iterator( pattern_ptr ) + pattern_off, pattern_len ) );

    text_loader_type text_loader;
    text_string text = text_string( text_len, text_loader.load( uncached_text_iterator( text_ptr ), text_len ) );

    aln::BestSink<int32> sink;

    aln::alignment_score<MAX_PATTERN_LEN>(
        aligner,
        pattern,
        aln::trivial_quality_string(),
        text,
        Field_traits<int32>::min(),
        sink );

    score[tid] = sink.score;
}

unsigned char nst_nt4_table[256] = {
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /*'-'*/, 4, 4,
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};

struct ReferenceCounter
{
    ReferenceCounter() : m_size(0) {}

    void begin_read() {}
    void end_read() {}

    void id(const uint8 c) {}
    void read(const uint8 c) { ++m_size; }

    uint32 m_size;
};
struct ReferenceCoder
{
    typedef PackedStream<uint32*,uint8,2,false> stream_type;

    ReferenceCoder(uint32* storage) :
        m_size(0), m_stream( storage )
    {}

    void begin_read() {}
    void end_read() {}

    void id(const uint8 c) {}

    void read(const uint8 s)
    {
        const uint8 c = nst_nt4_table[s];

        m_stream[ m_size++ ] = c < 4 ? c : 0;
    }

    uint32      m_size;
    stream_type m_stream;
};


// execute a given batch alignment type on a given stream
//
// \tparam batch_type               a \ref BatchAlignment "Batch Alignment"
// \tparam stream_type              a stream compatible to the given batch_type
//
// \return                          average time
//
template <typename batch_type, typename stream_type>
float enact_batch(
          batch_type&               batch,
    const stream_type&              stream)
{
    // allow to alloc all the needed temporary storage
    const uint64 temp_size = batch_type::max_temp_storage(
        stream.max_pattern_length(),
        stream.max_text_length(),
        stream.size() );

    Timer timer;
    timer.start();

    // enact the batch
    batch.enact( stream, temp_size, NULL );

    cudaDeviceSynchronize();

    timer.stop();

    return timer.seconds();
}

// execute and time a batch of full DP alignments using BatchAlignmentScore
//
template <typename scheduler_type, typename stream_type>
void batch_score_profile(
    const stream_type               stream)
{
    typedef aln::BatchedAlignmentScore<stream_type, scheduler_type> batch_type;  // our batch type

    // setup a batch
    batch_type batch;

    const float time = enact_batch(
        batch,
        stream );

    fprintf(stderr,"  %5.1f", 1.0e-9f * float(stream.cells())/time );
}

// execute and time the batch_score<scheduler> algorithm for all possible schedulers
//
template <typename aligner_type>
void batch_score_profile_all(
    const aligner_type                      aligner,
    const uint32                            n_tasks,
    const uint32*                           offsets_dvec,
    const uint32*                           pattern_dvec,
    const uint32                            max_pattern_len,
    const uint32                            total_pattern_len,
    const uint32*                           text_dvec,
    const uint32                            text_len,
    int16*                                  score_dvec)
{
    {
        typedef AlignmentStream<aligner_type> stream_type;

        // create a stream
        stream_type stream(
            aligner,
            n_tasks,
            offsets_dvec,
            pattern_dvec,
            max_pattern_len,
            total_pattern_len,
            text_dvec,
            text_len,
            score_dvec );

        // test the DeviceThreadScheduler
        batch_score_profile<aln::DeviceThreadScheduler>( stream );

        // test the DeviceStagedThreadScheduler
        //batch_score_profile<aln::DeviceStagedThreadScheduler>( stream );
    }
    {
        const uint32 BLOCKDIM = 128;
        const uint32 N_BLOCKS = (n_tasks + BLOCKDIM-1) / BLOCKDIM;

        Timer timer;
        timer.start();

        // enact the batch
        alignment_test_kernel<BLOCKDIM,MAX_READ_LENGTH> <<<N_BLOCKS,BLOCKDIM>>>(
            aligner,
            n_tasks,
            offsets_dvec,
            pattern_dvec,
            text_len,
            text_dvec,
            score_dvec );

        cudaDeviceSynchronize();

        timer.stop();

        const float time = timer.seconds();

        fprintf(stderr,"  %5.1f", 1.0e-9f * float(uint64(total_pattern_len)*uint64(text_len))/time );
    }
    fprintf(stderr, " GCUPS\n");
}

enum AlignmentTest
{
    ALL                 = 0xFFFFFFFFu,
    ED                  = 1u,
    SW                  = 2u,
    GOTOH               = 4u,
    ED_BANDED           = 8u,
    SW_BANDED           = 16u,
    GOTOH_BANDED        = 32u,
    SSW                 = 64u
};

int main(int argc, char* argv[])
{
    uint32 TEST_MASK        = 0xFFFFFFFFu;

    const char* reads_name  = argv[argc-2];
    const char* ref_name    = argv[argc-1];
    uint32      threads     = omp_get_num_procs();
    io::QualityEncoding qencoding = io::Phred33;

    for (int i = 0; i < argc-2; ++i)
    {
        if (strcmp( argv[i], "-tests" ) == 0)
        {
            const std::string tests_string( argv[++i] );

            char temp[256];
            const char* begin = tests_string.c_str();
            const char* end   = begin;

            TEST_MASK = 0u;

            while (1)
            {
                while (*end != ':' && *end != '\0')
                {
                    temp[end - begin] = *end;
                    end++;
                }

                temp[end - begin] = '\0';

                if (strcmp( temp, "ed" ) == 0)
                    TEST_MASK |= ED;
                else if (strcmp( temp, "sw" ) == 0)
                    TEST_MASK |= SW;
                else if (strcmp( temp, "gotoh" ) == 0)
                    TEST_MASK |= GOTOH;
                else if (strcmp( temp, "ssw" ) == 0)
                    TEST_MASK |= SSW;

                if (*end == '\0')
                    break;

                ++end; begin = end;
            }
        }
        else if (strcmp( argv[i], "-threads" ) == 0)
            threads = atoi( argv[++i] );
    }

    fprintf(stderr,"sw-benchmark... started\n");

    log_visible(stderr, "opening read file \"%s\"\n", reads_name);
    SharedPointer<nvbio::io::SequenceDataStream> read_data_file(
        nvbio::io::open_sequence_file(reads_name,
                                  qencoding)
    );

    log_visible(stderr, "reading reference file \"%s\"... started\n", ref_name);

    // read the reference
    thrust::host_vector<uint32> h_ref_storage;
    uint32                      ref_length;
    uint32                      ref_words;
    {
        ReferenceCounter counter;

        FASTA_inc_reader fasta( ref_name );
        if (fasta.valid() == false)
        {
            fprintf(stderr, "  error: unable to open reference file \"%s\"\n", ref_name);
            exit(1);
        }
        while (fasta.read( 1024, counter ) == 1024);

        ref_length = counter.m_size;
        ref_words  = (ref_length + 15)/16; // # of words at 2 bits per symbol
    }
    {
        h_ref_storage.resize( ref_words );
        ReferenceCoder coder( &h_ref_storage[0] );

        FASTA_inc_reader fasta( ref_name );
        if (fasta.valid() == false)
        {
            fprintf(stderr, "  error: unable to open reference file \"%s\"\n", ref_name);
            exit(1);
        }
        while (fasta.read( 1024, coder ) == 1024);
    }
    log_visible(stderr, "reading reference file \"%s\"... done (%u bps)\n", ref_name, ref_length);

    typedef PackedStream<uint32*,uint8,REF_BITS,REF_BIG_ENDIAN> ref_stream_type;

    thrust::device_vector<uint32> d_ref_storage( h_ref_storage );
    ref_stream_type d_ref_stream( nvbio::raw_pointer( d_ref_storage ) );

    const uint32 batch_size = 256*1024;

    thrust::device_vector<int16> score_dvec( batch_size, 0 );

  #if defined(SSWLIB)
    std::vector<int8_t> unpacked_ref( ref_length );
    {
        ref_stream_type h_ref_stream( nvbio::raw_pointer( h_ref_storage ) );
        for (uint32 i = 0; i < ref_length; ++i)
            unpacked_ref[i] = h_ref_stream[i];
    }

    // Now set the number of threads
    omp_set_num_threads( threads );

    #pragma omp parallel
    {
        fprintf(stderr, "  running on multiple threads\n");
    }
  #endif

    io::SequenceDataHost h_read_data;

    while (io::next( DNA_N, &h_read_data, read_data_file.get(), batch_size ))
    {
        // build the device side representation
        const io::SequenceDataDevice d_read_data( h_read_data );

        const uint32 n_read_symbols = h_read_data.bps();

        fprintf(stderr,"  %u reads, avg: %u bps, max: %u bps\n",
            h_read_data.size(),
            h_read_data.avg_sequence_len(),
            h_read_data.max_sequence_len());

        if (TEST_MASK & GOTOH)
        {
            aln::SimpleGotohScheme scoring;
            scoring.m_match    =  2;
            scoring.m_mismatch = -1;
            scoring.m_gap_open = -2;
            scoring.m_gap_ext  = -1;

            fprintf(stderr,"  testing Gotoh scoring speed...\n");
            fprintf(stderr,"    %15s : ", "global");
            {
                batch_score_profile_all(
                    aln::make_gotoh_aligner<aln::GLOBAL,aln::TextBlockingTag>( scoring ),
                    d_read_data.size(),
                    nvbio::plain_view( d_read_data ).sequence_index(),
                    nvbio::plain_view( d_read_data ).sequence_storage(),
                    d_read_data.max_sequence_len(),
                    n_read_symbols,
                    nvbio::raw_pointer( d_ref_storage ),
                    ref_length,
                    nvbio::raw_pointer( score_dvec ) );
            }

            fprintf(stderr,"    %15s : ", "semi-global");
            {
                batch_score_profile_all(
                    aln::make_gotoh_aligner<aln::SEMI_GLOBAL,aln::TextBlockingTag>( scoring ),
                    d_read_data.size(),
                    nvbio::plain_view( d_read_data ).sequence_index(),
                    nvbio::plain_view( d_read_data ).sequence_storage(),
                    d_read_data.max_sequence_len(),
                    n_read_symbols,
                    nvbio::raw_pointer( d_ref_storage ),
                    ref_length,
                    nvbio::raw_pointer( score_dvec ) );
            }
            fprintf(stderr,"    %15s : ", "local");
            {
                batch_score_profile_all(
                    aln::make_gotoh_aligner<aln::LOCAL,aln::TextBlockingTag>( scoring ),
                    d_read_data.size(),
                    nvbio::plain_view( d_read_data ).sequence_index(),
                    nvbio::plain_view( d_read_data ).sequence_storage(),
                    d_read_data.max_sequence_len(),
                    n_read_symbols,
                    nvbio::raw_pointer( d_ref_storage ),
                    ref_length,
                    nvbio::raw_pointer( score_dvec ) );
            }
        }
        if (TEST_MASK & ED)
        {
            fprintf(stderr,"  testing Edit Distance scoring speed...\n");
            fprintf(stderr,"    %15s : ", "semi-global");
            {
                batch_score_profile_all(
                    aln::make_edit_distance_aligner<aln::SEMI_GLOBAL,aln::TextBlockingTag>(),
                    d_read_data.size(),
                    nvbio::plain_view( d_read_data ).sequence_index(),
                    nvbio::plain_view( d_read_data ).sequence_storage(),
                    d_read_data.max_sequence_len(),
                    n_read_symbols,
                    nvbio::raw_pointer( d_ref_storage ),
                    ref_length,
                    nvbio::raw_pointer( score_dvec ) );
            }
        }

        #if defined(SSWLIB)
        if (TEST_MASK & SSW)
        {
            fprintf(stderr,"  testing SSW scoring speed...\n");
            fprintf(stderr,"    %15s : ", "local");

            const int8_t mat[4*4] = {2, -1, -1, -1, -1, 2, -1, -1, -1, -1, 2, -1, -1, -1, -1, 2};

            std::vector<int8_t> unpacked_reads( n_read_symbols );

            typedef io::SequenceDataAccess<DNA_N>           read_access_type;
            typedef read_access_type::sequence_stream_type  read_stream_type;

            const read_access_type reads_access( h_read_data );

            const read_stream_type packed_reads( reads_access.sequence_stream() );

            #pragma omp parallel for
            for (int i = 0; i < int( n_read_symbols ); ++i)
                unpacked_reads[i] = packed_reads[i];

            Timer timer;
            timer.start();

            #pragma omp parallel for
            for (int i = 0; i < int( h_read_data.size() ); ++i)
            {
                const uint32 read_off = reads_access.sequence_index()[i];
                const uint32 read_len = reads_access.sequence_index()[i+1] - read_off;

                s_profile* prof = ssw_init( &unpacked_reads[read_off], read_len, mat, 4, 2 );

                s_align* align = ssw_align(
                    prof, 
					&unpacked_ref[0], 
					ref_length, 
					2, 
					2,
					0u,	
					0u,
					0,
					15 );

                align_destroy( align );

                init_destroy( prof );
            }

            timer.stop();
            const float time = timer.seconds();

            fprintf(stderr,"  %5.1f", 1.0e-9f * float(uint64(n_read_symbols)*uint64(ref_length))/time );
            fprintf(stderr, " GCUPS\n");
        }
        #endif
    }
    fprintf(stderr,"sw-benchmark... done\n");
    return 0;
}