/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
// condition_test.cu
//
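// This test exercises nvbio's inter-CTA condition variables: a correctness
// test that chains CTAs into a sequential scan, and a set of timing tests
// measuring how quickly dependent CTAs can be signaled and released.
//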
#define NVBIO_CUDA_DEBUG

#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <algorithm>

#include <nvbio/basic/timer.h>
#include <nvbio/basic/console.h>
#include <nvbio/basic/vector.h>
#include <nvbio/basic/cuda/arch.h>
#include <nvbio/basic/cuda/condition.h>

#include <cub/cub.cuh>
namespace nvbio {
namespace condition {
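
// A sequentially chained scan: tile t waits on the condition variable of the
// CTA owning tile t-1, reads its output, adds one, and signals its own
// condition so that the next tile can proceed. Each CTA processes
// n_tile_grids tiles, striding by gridDim.x.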
__global__
void scan_kernel(const uint32 n_tile_grids, cuda::condition_set_view conditions, uint32* block_values, const uint32 initial_state)
{
    const uint32 prev_block = blockIdx.x ? blockIdx.x-1 : gridDim.x-1;
    uint32       prev_state = blockIdx.x ? initial_state+1 : initial_state;
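    // in grid iteration i, block 0 waits until the last block has completed
    // iteration i-1 (i.e. signaled i times since initial_state), while every
    // other block waits until its left neighbor has completed iteration i
    // (i.e. signaled i+1 times)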
    for (uint32 i = 0; i < n_tile_grids; ++i, ++prev_state)
    {
        // compute the tile number
        const uint32 tile_idx = blockIdx.x + i * gridDim.x;

        if (threadIdx.x == 0)
        {
            if (tile_idx)
            {
                // wait on the previous block to post its value
                conditions[ prev_block ].wait( prev_state );

                block_values[tile_idx] = block_values[tile_idx-1] + 1;
            }
            else
            {
                // set the value for the first tile
                block_values[0] = 0;
            }

            // release the condition for this block
            conditions[ blockIdx.x ].signal(); // equivalent to set( initial_state+i+1 )
        }
    }
}
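
// Same chaining pattern as scan_kernel, but without any global memory
// traffic: used to time the raw signal/wait overhead of the condition
// variables alone.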
__global__
void chained_kernel(const uint32 n_tile_grids, cuda::condition_set_view conditions, const uint32 initial_state)
{
    const uint32 prev_block = blockIdx.x ? blockIdx.x-1 : gridDim.x-1;
    uint32       prev_state = blockIdx.x ? initial_state+1 : initial_state;

    for (uint32 i = 0; i < n_tile_grids; ++i, ++prev_state)
    {
        // compute the tile number
        const uint32 tile_idx = blockIdx.x + i * gridDim.x;

        if (threadIdx.x == 0)
        {
            if (tile_idx)
            {
                // wait on the previous block to be ready
                conditions[ prev_block ].wait( prev_state );
            }

            // release the condition for this block
            conditions[ blockIdx.x ].signal(); // equivalent to set( initial_state+i+1 )
        }
    }
}
template <uint32 BLOCKDIM, bool DO_WORK>
__global__
void fast_scan_kernel(const uint32 n_tile_grids, cuda::condition_set_view conditions, volatile uint32* partials, volatile uint32* prefixes, const uint32 initial_state)
{
    //
    // This scan skeleton performs longer-range chaining rather than chaining each CTA
    // to its direct predecessor: the idea is that each CTA writes out its partial,
    // then checks for the availability of its predecessor's prefix (without blocking);
    // if that is not available, it waits for the previous BLOCKDIM-1 partials and for
    // the prefix of the CTA BLOCKDIM slots away, and uses all of its threads to
    // reduce them. Hence, CTA[i] depends at most on the prefix of CTA[i-BLOCKDIM].
    //
    typedef cub::BlockReduce<uint32,BLOCKDIM> BlockReduce;

    const uint32 PARTIAL_READY = initial_state + 1;
    const uint32 PREFIX_READY  = initial_state + 2;
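
    // each tile's condition variable goes through two states per launch:
    // PARTIAL_READY once the tile's partial is visible, and PREFIX_READY once
    // its full prefix is visible; this is why the host advances the initial
    // state by two on every launch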

    __shared__ typename BlockReduce::TempStorage smem_storage;
    __shared__ uint32 previous_done;

    const bool is_thread0 = (threadIdx.x == 0);

    for (uint32 i = 0; i < n_tile_grids; ++i)
    {
        //__syncthreads();

        // compute the tile number
        const uint32 tile_idx = blockIdx.x + i * gridDim.x;

        // write out the partial for this tile
        if (DO_WORK)
        {
            if (is_thread0)
                partials[tile_idx] = 1;
        }
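
        // probe the immediate predecessor's prefix without blocking: test()
        // merely checks whether the condition has already reached the given
        // state, whereas wait() blocks until it does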
        if (is_thread0 && tile_idx)
            previous_done = conditions[tile_idx-1].test( PREFIX_READY );

        __syncthreads();

        if (tile_idx == 0)
        {
            // set the value for the first tile
            if (DO_WORK)
            {
                if (is_thread0)
                    prefixes[0] = 1;
            }
        }
        else if (previous_done)
        {
            // sum to previous prefix
            if (DO_WORK)
            {
                if (is_thread0)
                    prefixes[tile_idx] = prefixes[tile_idx-1] + 1;
            }
        }
        else
        {
            // release the condition variable for the partial
            if (is_thread0)
                conditions[tile_idx].set( PARTIAL_READY );

            int32 prefix      = 0;
            int32 last_tile   = tile_idx;
            int32 prefix_tile = tile_idx;

            // keep looking back until we find a 'ready' prefix
            do
            {
                //
                // look back at up to BLOCKDIM predecessors in parallel, check whether
                // any of them is done (i.e. their prefix is ready), and otherwise
                // wait on their partials to arrive.
                //
                previous_done = 0;
                __syncthreads();

                // compute the first tile in this batch
                prefix_tile = nvbio::max( int32( prefix_tile - blockDim.x ), 0 );

                // check whether any of the predecessors in this batch is done;
                // a non-zero value of previous_done doubles as the index of a
                // tile whose prefix is ready
                if (prefix_tile + threadIdx.x < last_tile)
                {
                    if (conditions[ prefix_tile + threadIdx.x ].test( PREFIX_READY ))
                        previous_done = prefix_tile + threadIdx.x;
                }
                __syncthreads();

                // let all threads update the prefix tile
                if (previous_done)
                    prefix_tile = previous_done;

                int32 partial = 0;

                // look back at the predecessors in parallel
                if (prefix_tile + threadIdx.x < last_tile)
                {
                    if (previous_done && threadIdx.x == 0)
                    {
                        // let thread0 read the ready prefix
                        if (DO_WORK)
                            partial = prefixes[ prefix_tile ];
                    }
                    else
                    {
                        // wait on the partials
                        conditions[ prefix_tile + threadIdx.x ].wait( PARTIAL_READY );

                        if (DO_WORK)
                            partial = partials[ prefix_tile + threadIdx.x ];
                    }
                }

                if (DO_WORK)
                {
                    // reduce the partials (one of which may be a ready prefix)
                    prefix += BlockReduce( smem_storage ).Sum( partial );
                }

                last_tile = prefix_tile;
            }
            while (prefix_tile && !previous_done);
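
            // when DO_WORK is set, prefix now holds the sum of all preceding
            // partials, i.e. the exclusive prefix for this tile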
            if (DO_WORK)
            {
                if (is_thread0)
                {
                    // write out the final values
                    prefixes[tile_idx] = prefix + 1;
                }
            }
        }

        // release the condition for the scanned value for this tile
        if (is_thread0)
            conditions[tile_idx].set( PREFIX_READY );
    }
}

} // namespace condition
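
// Run three experiments: (1) a correctness test of the chained scan_kernel,
// (2) a timing test of the pure chaining overhead, and (3) correctness and
// timing tests of the look-back based fast_scan_kernel.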
int condition_test()
{
    const uint32 n_tile_grids = 100;

    log_info( stderr, "condition test... started\n" );

    const uint32 blockdim = 128;
    const uint32 n_blocks = (uint32)cuda::max_active_blocks( condition::scan_kernel, blockdim, 0u );
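    // size the grid to the number of simultaneously resident CTAs: the
    // wrap-around dependency of block 0 on the last block requires all
    // CTAs to be co-resident, or the kernel would deadlock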

    cuda::condition_set_storage condition_st( n_blocks );
    cuda::condition_set_view    condition_set = condition_st.get();

    log_info( stderr, " %u blocks\n", n_blocks );

    thrust::device_vector<uint32> dvalues( n_tile_grids*n_blocks );
    uint32* dvalues_ptr = thrust::raw_pointer_cast( &dvalues.front() );

    thrust::host_vector<uint32> hvalues;

    log_info( stderr, " correctness test... started\n" );
    for (uint32 i = 0; i < 20; ++i)
    {
        // call the testing kernel
        condition::scan_kernel<<<n_blocks,blockdim>>>( n_tile_grids, condition_set, dvalues_ptr, i*n_tile_grids );
        cudaDeviceSynchronize();

        // check that the output is the sequence 0,1,2,...
        nvbio::cuda::thrust_copy_vector(hvalues, dvalues);

        for (uint32 n = 0; n < n_tile_grids*n_blocks; ++n)
        {
            const uint32 val = hvalues[n];
            if (val != n)
            {
                log_error( stderr, " found %u at position %u, launch %u\n", val, n, i );
                return 1;
            }
        }
    }
    log_info( stderr, " correctness test... done\n" );

    const uint32 n_tests = 20;

    log_info( stderr, " speed test... started\n" );

    condition_st.set(0);
    cudaDeviceSynchronize();

    Timer timer;
    timer.start();

    for (uint32 i = 0; i < n_tests; ++i)
        condition::chained_kernel<<<n_blocks,blockdim>>>( n_tile_grids, condition_set, i*n_tile_grids );

    cudaDeviceSynchronize();
    timer.stop();

    const float time = timer.seconds() / float(n_tests*n_tile_grids);

    log_info( stderr, " speed test... done:\n %.3f us\n %.3f us/CTA\n %.1fM CTAs retired/s\n",
        time * 1.0e6f,
        1.0e6f * (time/float(n_blocks)),
        1.0e-6f * (float(n_blocks)/time) );

    {
        const uint32 blockdim = 128;
        const uint32 n_blocks = (uint32)cuda::max_active_blocks( condition::fast_scan_kernel<blockdim,true>, blockdim, 0u );

        cuda::condition_set_storage condition_st( n_blocks*n_tile_grids );
        cuda::condition_set_view    condition_set = condition_st.get();

        cudaDeviceSynchronize();

        log_info( stderr, " fast scan... started (%u CTAs)\n", n_blocks );

        thrust::device_vector<uint32> dpartials( n_tile_grids*n_blocks );
        uint32* dpartials_ptr = thrust::raw_pointer_cast( &dpartials.front() );
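
        // each launch advances every tile's condition by two states
        // (PARTIAL_READY, then PREFIX_READY), hence the initial state passed
        // below is i*2; note also that the set now holds one condition per
        // tile rather than one per block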
        for (uint32 i = 0; i < 20; ++i)
        {
            thrust::fill( dpartials.begin(), dpartials.end(), 0 );
            thrust::fill( dvalues.begin(),   dvalues.end(),   0 );

            condition::fast_scan_kernel<blockdim,true><<<n_blocks,blockdim>>>( n_tile_grids, condition_set, dpartials_ptr, dvalues_ptr, i*2 );
            cudaDeviceSynchronize();

            nvbio::cuda::thrust_copy_vector(hvalues, dvalues);

            for (uint32 n = 0; n < n_tile_grids*n_blocks; ++n)
            {
                const uint32 val = hvalues[n];
                if (val != n+1)
                {
                    log_error( stderr, " found %u at position %u, launch %u\n", val, n, i );
                    return 1;
                }
            }
        }

        condition_st.set(0);
        cudaDeviceSynchronize();

        Timer timer;
        timer.start();

        for (uint32 i = 0; i < n_tests; ++i)
            condition::fast_scan_kernel<blockdim,true><<<n_blocks,blockdim>>>( n_tile_grids, condition_set, dpartials_ptr, dvalues_ptr, i*2 );

        cudaDeviceSynchronize();
        timer.stop();

        {
            const float time = timer.seconds() / float(n_tests*n_tile_grids);
            log_info( stderr, " fast scan test... done:\n %.3f us\n %.3f us/CTA\n %.1fM CTAs retired/s\n",
                time * 1.0e6f,
                1.0e6f * (time/float(n_blocks)),
                1.0e-6f * (float(n_blocks)/time) );
        }

        log_info( stderr, " fast chaining... started\n" );

        condition_st.set(0);
        cudaDeviceSynchronize();
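
        // same kernel with DO_WORK=false: no partials or prefixes are read
        // or written, isolating the cost of the look-back chaining itself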
        timer.start();

        for (uint32 i = 0; i < n_tests; ++i)
            condition::fast_scan_kernel<blockdim,false><<<n_blocks,blockdim>>>( n_tile_grids, condition_set, NULL, NULL, i*2 );

        cudaDeviceSynchronize();
        timer.stop();

        {
            const float time = timer.seconds() / float(n_tests*n_tile_grids);
            log_info( stderr, " fast chaining test... done:\n %.3f us\n %.3f us/CTA\n %.1fM CTAs retired/s\n",
                time * 1.0e6f,
                1.0e6f * (time/float(n_blocks)),
                1.0e-6f * (float(n_blocks)/time) );
        }
    }

    log_info( stderr, "condition test... done\n" );
    return 0;
}

} // namespace nvbio