/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of BlockLoad and BlockStore utilities
******************************************************************************/
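/******************************************************************************
 * The pattern under test, in minimal form (a sketch with illustrative
 * parameters and names -- not one of the configurations exercised below):
 *
 *   typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
 *   typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> Store;
 *   __shared__ union { Load::TempStorage load; Store::TempStorage store; } smem;
 *   int items[4];
 *   Load(smem.load).Load(d_in + block_offset, items);
 *   __syncthreads();
 *   Store(smem.store).Store(d_out + block_offset, items);
 ******************************************************************************/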
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <iterator>
#include <stdio.h>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/cache_modified_output_iterator.cuh>
#include <cub/iterator/discard_output_iterator.cuh>
#include <cub/util_allocator.cuh>
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
CachingDeviceAllocator g_allocator(true);
//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------
/**
 * Test load/store kernel.  Each thread block loads one tile of items,
 * stores it unguarded, then reloads and re-stores it guarded against
 * num_items.
 */
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM,
typename InputIteratorT,
typename OutputIteratorT>
__launch_bounds__ (BLOCK_THREADS, 1)
__global__ void Kernel(
InputIteratorT d_in,
OutputIteratorT d_out_unguarded,
OutputIteratorT d_out_guarded,
int num_items)
{
enum
{
TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD
};
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
// Threadblock load/store abstraction types
typedef BlockLoad<InputT, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM> BlockLoad;
typedef BlockStore<OutputT, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore;
// Shared memory type for this thread block
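// (load and store temp storage may safely occupy the same union because the
// load and store phases below are separated by __syncthreads())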
union TempStorage
{
typename BlockLoad::TempStorage load;
typename BlockStore::TempStorage store;
};
// Allocate temp storage in shared memory
__shared__ TempStorage temp_storage;
// Threadblock work bounds
int block_offset = blockIdx.x * TILE_SIZE;
int guarded_elements = num_items - block_offset;
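// Number of valid items remaining for this block's tile; the guarded
// Load/Store pass below is clamped against it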
// Tile of items
OutputT data[ITEMS_PER_THREAD];
// Load data
BlockLoad(temp_storage.load).Load(d_in + block_offset, data);
__syncthreads();
// Store data
BlockStore(temp_storage.store).Store(d_out_unguarded + block_offset, data);
__syncthreads();
// Reset data so stale values from the unguarded pass cannot mask errors in the guarded pass
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
data[ITEM] = OutputT();
__syncthreads();
// Load data
BlockLoad(temp_storage.load).Load(d_in + block_offset, data, guarded_elements);
__syncthreads();
// Store data
BlockStore(temp_storage.store).Store(d_out_guarded + block_offset, data, guarded_elements);
}
//---------------------------------------------------------------------
// Host testing subroutines
//---------------------------------------------------------------------
/**
* Test load/store variants
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM,
typename InputIteratorT,
typename OutputIteratorT>
void TestKernel(
T *h_in,
InputIteratorT d_in,
OutputIteratorT d_out_unguarded_itr,
OutputIteratorT d_out_guarded_itr,
T *d_out_unguarded_ptr,
T *d_out_guarded_ptr,
int grid_size,
int guarded_elements)
{
int compare;
int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
// Test with discard output iterator
typedef typename std::iterator_traits<InputIteratorT>::difference_type OffsetT;
DiscardOutputIterator<OffsetT> discard_itr;
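// (writes through the discard iterator are dropped; this pass only verifies
// that the store path compiles and runs for a non-pointer output iterator)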
Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
<<<grid_size, BLOCK_THREADS>>>(
d_in,
discard_itr,
discard_itr,
guarded_elements);
// Test with regular output iterator
Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
<<<grid_size, BLOCK_THREADS>>>(
d_in,
d_out_unguarded_itr,
d_out_guarded_itr,
guarded_elements);
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
// Check guarded results
compare = CompareDeviceResults(h_in, d_out_guarded_ptr, guarded_elements, g_verbose, g_verbose);
printf("\tGuarded: %s\n", (compare) ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Check unguarded results
compare = CompareDeviceResults(h_in, d_out_unguarded_ptr, unguarded_elements, g_verbose, g_verbose);
printf("\tUnguarded: %s\n", (compare) ? "FAIL" : "PASS");
AssertEquals(0, compare);
}
/**
 * Test native pointer. Specialized for sufficient resources.
 */
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM>
void TestNative(
int grid_size,
float fraction_valid,
Int2Type<true> /*sufficient_resources*/)
{
int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
int guarded_elements = int(fraction_valid * float(unguarded_elements));
// Allocate host arrays
T *h_in = (T*) malloc(unguarded_elements * sizeof(T));
// Allocate device arrays
T *d_in = NULL;
T *d_out_unguarded = NULL;
T *d_out_guarded = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements));
CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements));
CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements));
// Initialize problem on host and device
for (int i = 0; i < unguarded_elements; ++i)
{
InitValue(INTEGER_SEED, h_in[i], i);
}
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice));
printf("TestNative "
"grid_size(%d) "
"guarded_elements(%d) "
"unguarded_elements(%d) "
"BLOCK_THREADS(%d) "
"ITEMS_PER_THREAD(%d) "
"LOAD_ALGORITHM(%d) "
"STORE_ALGORITHM(%d) "
"sizeof(T)(%d)\n",
grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, (int) sizeof(T));
TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
h_in,
(T const *) d_in, // Test const
d_out_unguarded,
d_out_guarded,
d_out_unguarded,
d_out_guarded,
grid_size,
guarded_elements);
// Cleanup
if (h_in) free(h_in);
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
}
/**
 * Test native pointer. Specialized for insufficient resources.
 */
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM>
void TestNative(
int /*grid_size*/,
float /*fraction_valid*/,
Int2Type<false> /*sufficient_resources*/)
{}
/**
* Test iterator. Specialized for sufficient resources.
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM,
CacheLoadModifier LOAD_MODIFIER,
CacheStoreModifier STORE_MODIFIER>
void TestIterator(
int grid_size,
float fraction_valid,
Int2Type<true> /*sufficient_resources*/)
{
int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
int guarded_elements = int(fraction_valid * float(unguarded_elements));
// Allocate host arrays
T *h_in = (T*) malloc(unguarded_elements * sizeof(T));
// Allocate device arrays
T *d_in = NULL;
T *d_out_unguarded = NULL;
T *d_out_guarded = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements));
CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements));
CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements));
// Initialize problem on host and device
for (int i = 0; i < unguarded_elements; ++i)
{
InitValue(INTEGER_SEED, h_in[i], i);
}
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice));
printf("TestIterator "
"grid_size(%d) "
"guarded_elements(%d) "
"unguarded_elements(%d) "
"BLOCK_THREADS(%d) "
"ITEMS_PER_THREAD(%d) "
"LOAD_ALGORITHM(%d) "
"STORE_ALGORITHM(%d) "
"LOAD_MODIFIER(%d) "
"STORE_MODIFIER(%d) "
"sizeof(T)(%d)\n",
grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_MODIFIER, STORE_MODIFIER, (int) sizeof(T));
TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
h_in,
CacheModifiedInputIterator<LOAD_MODIFIER, T>(d_in),
CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_unguarded),
CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_guarded),
d_out_unguarded,
d_out_guarded,
grid_size,
guarded_elements);
// Cleanup
if (h_in) free(h_in);
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
}
/**
* Test iterator. Specialized for insufficient resources.
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM,
CacheLoadModifier LOAD_MODIFIER,
CacheStoreModifier STORE_MODIFIER>
void TestIterator(
int /*grid_size*/,
float /*fraction_valid*/,
Int2Type<false> /*sufficient_resources*/)
{}
/**
* Evaluate different pointer access types
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM>
void TestPointerType(
int grid_size,
float fraction_valid)
{
// Threadblock load/store abstraction types
typedef BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM> BlockLoad;
typedef BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore;
#if defined(SM100) || defined(SM110) || defined(SM130)
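// SM 1.x targets: 16 KB of shared memory per block and at most 512 threads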
static const bool sufficient_load_smem = sizeof(typename BlockLoad::TempStorage) <= 1024 * 16;
static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 16;
static const bool sufficient_threads = BLOCK_THREADS <= 512;
#else
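// SM 2.0 and newer: 48 KB of shared memory per block and at most 1024 threads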
static const bool sufficient_load_smem = sizeof(typename BlockLoad::TempStorage) <= 1024 * 48;
static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 48;
static const bool sufficient_threads = BLOCK_THREADS <= 1024;
#endif
static const bool sufficient_resources = sufficient_load_smem && sufficient_store_smem && sufficient_threads;
TestNative<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(grid_size, fraction_valid, Int2Type<sufficient_resources>());
TestIterator<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_DEFAULT, STORE_DEFAULT>(grid_size, fraction_valid, Int2Type<sufficient_resources>());
}
/**
 * Evaluate different time-slicing strategies.  (Time slicing is selected
 * through the *_TIMESLICED algorithm enumerators rather than a separate
 * template parameter, so this helper simply forwards to TestPointerType.)
 */
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM>
void TestSlicedStrategy(
int grid_size,
float fraction_valid)
{
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(grid_size, fraction_valid);
}
/**
* Evaluate different load/store strategies (specialized for block sizes that are not a multiple of 32)
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
void TestStrategy(
int grid_size,
float fraction_valid,
Int2Type<false> /*is_warp_multiple*/)
{
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, BLOCK_STORE_DIRECT>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, BLOCK_STORE_TRANSPOSE>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_VECTORIZE, BLOCK_STORE_VECTORIZE>(grid_size, fraction_valid);
}
/**
* Evaluate different load/store strategies (specialized for block sizes that are a multiple of 32)
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
void TestStrategy(
int grid_size,
float fraction_valid,
Int2Type<true> /*is_warp_multiple*/)
{
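// First run the strategies that are valid for any block size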
TestStrategy<T, BLOCK_THREADS, ITEMS_PER_THREAD>(grid_size, fraction_valid, Int2Type<false>());
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED>(grid_size, fraction_valid);
}
/**
 * Evaluate different register blocking (items per thread)
 */
template <
typename T,
int BLOCK_THREADS>
void TestItemsPerThread(
int grid_size,
float fraction_valid)
{
Int2Type<BLOCK_THREADS % 32 == 0> is_warp_multiple;
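// Per-thread tile sizes cover both vector-friendly and odd, non-vectorizable cases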
TestStrategy<T, BLOCK_THREADS, 1>(grid_size, fraction_valid, is_warp_multiple);
TestStrategy<T, BLOCK_THREADS, 3>(grid_size, fraction_valid, is_warp_multiple);
TestStrategy<T, BLOCK_THREADS, 4>(grid_size, fraction_valid, is_warp_multiple);
TestStrategy<T, BLOCK_THREADS, 11>(grid_size, fraction_valid, is_warp_multiple);
}
/**
* Evaluate different thread block sizes
*/
template <typename T>
void TestThreads(
int grid_size,
float fraction_valid)
{
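// Both warp-multiple (32, 96, 128) and non-warp-multiple (15, 72) block sizes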
TestItemsPerThread<T, 15>(grid_size, fraction_valid);
TestItemsPerThread<T, 32>(grid_size, fraction_valid);
TestItemsPerThread<T, 72>(grid_size, fraction_valid);
TestItemsPerThread<T, 96>(grid_size, fraction_valid);
TestItemsPerThread<T, 128>(grid_size, fraction_valid);
}
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--v] "
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Get PTX version
int ptx_version = 0;
CubDebugExit(PtxVersion(ptx_version));
#ifdef QUICK_TEST
// Compile/run quick tests
TestNative< int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(1, 0.8f, Int2Type<true>());
TestIterator< int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE, LOAD_DEFAULT, STORE_DEFAULT>(1, 0.8f, Int2Type<true>());
#else
// Compile/run thorough tests
TestThreads<char>(2, 0.8f);
TestThreads<int>(2, 0.8f);
TestThreads<long>(2, 0.8f);
TestThreads<long2>(2, 0.8f);
if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
TestThreads<double2>(2, 0.8f);
TestThreads<TestFoo>(2, 0.8f);
TestThreads<TestBar>(2, 0.8f);
#endif
return 0;
}