/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of BlockLoad and BlockStore utilities
******************************************************************************/
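/******************************************************************************
 * The pattern under test, in minimal form (a sketch with illustrative
 * parameters and names -- not one of the configurations exercised below):
 *
 *   typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
 *   typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> Store;
 *   __shared__ union { Load::TempStorage load; Store::TempStorage store; } smem;
 *   int items[4];
 *   Load(smem.load).Load(d_in + block_offset, items);
 *   __syncthreads();
 *   Store(smem.store).Store(d_out + block_offset, items);
 ******************************************************************************/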
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <iterator>
#include <stdio.h>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/cache_modified_output_iterator.cuh>
#include <cub/iterator/discard_output_iterator.cuh>
#include <cub/util_allocator.cuh>
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
CachingDeviceAllocator g_allocator(true);
//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------
/**
 * Test load/store kernel.  Each thread block loads one tile of items,
 * stores it unguarded, then reloads and re-stores it guarded against
 * num_items.
 */
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM,
typename InputIteratorT,
typename OutputIteratorT>
__launch_bounds__ (BLOCK_THREADS, 1)
__global__ void Kernel(
InputIteratorT d_in,
OutputIteratorT d_out_unguarded,
OutputIteratorT d_out_guarded,
int num_items)
{
enum
{
TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD
};
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
// Threadblock load/store abstraction types
typedef BlockLoad<InputT, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM> BlockLoad;
typedef BlockStore<OutputT, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore;
// Shared memory type for this thread block
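// (load and store temp storage may safely occupy the same union because the
// load and store phases below are separated by __syncthreads())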
union TempStorage
{
typename BlockLoad::TempStorage load;
typename BlockStore::TempStorage store;
};
// Allocate temp storage in shared memory
__shared__ TempStorage temp_storage;
// Threadblock work bounds
int block_offset = blockIdx.x * TILE_SIZE;
int guarded_elements = num_items - block_offset;
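// Number of valid items remaining for this block's tile; the guarded
// Load/Store pass below is clamped against it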
// Tile of items
OutputT data[ITEMS_PER_THREAD];
// Load data
BlockLoad(temp_storage.load).Load(d_in + block_offset, data);
__syncthreads();
// Store data
BlockStore(temp_storage.store).Store(d_out_unguarded + block_offset, data);
__syncthreads();
// Reset data so stale values from the unguarded pass cannot mask errors in the guarded pass
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
data[ITEM] = OutputT();
__syncthreads();
// Load data
BlockLoad(temp_storage.load).Load(d_in + block_offset, data, guarded_elements);
__syncthreads();
// Store data
BlockStore(temp_storage.store).Store(d_out_guarded + block_offset, data, guarded_elements);
}
//---------------------------------------------------------------------
// Host testing subroutines
//---------------------------------------------------------------------
/**
* Test load/store variants
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM,
typename InputIteratorT,
typename OutputIteratorT>
void TestKernel(
T *h_in,
InputIteratorT d_in,
OutputIteratorT d_out_unguarded_itr,
OutputIteratorT d_out_guarded_itr,
T *d_out_unguarded_ptr,
T *d_out_guarded_ptr,
int grid_size,
int guarded_elements)
{
int compare;
int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
// Test with discard output iterator
typedef typename std::iterator_traits<InputIteratorT>::difference_type OffsetT;
DiscardOutputIterator<OffsetT> discard_itr;
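// (writes through the discard iterator are dropped; this pass only verifies
// that the store path compiles and runs for a non-pointer output iterator)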
Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
<<<grid_size, BLOCK_THREADS>>>(
d_in,
discard_itr,
discard_itr,
guarded_elements);
// Test with regular output iterator
Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
<<<grid_size, BLOCK_THREADS>>>(
d_in,
d_out_unguarded_itr,
d_out_guarded_itr,
guarded_elements);
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
// Check guarded results
compare = CompareDeviceResults(h_in, d_out_guarded_ptr, guarded_elements, g_verbose, g_verbose);
printf("\tGuarded: %s\n", (compare) ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Check unguarded results
compare = CompareDeviceResults(h_in, d_out_unguarded_ptr, unguarded_elements, g_verbose, g_verbose);
printf("\tUnguarded: %s\n", (compare) ? "FAIL" : "PASS");
AssertEquals(0, compare);
}
/**
 * Test native pointer. Specialized for sufficient resources.
 */
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM>
void TestNative(
int grid_size,
float fraction_valid,
Int2Type<true> /*sufficient_resources*/)
{
int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
int guarded_elements = int(fraction_valid * float(unguarded_elements));
// Allocate host arrays
T *h_in = (T*) malloc(unguarded_elements * sizeof(T));
// Allocate device arrays
T *d_in = NULL;
T *d_out_unguarded = NULL;
T *d_out_guarded = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements));
CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements));
CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements));
// Initialize problem on host and device
for (int i = 0; i < unguarded_elements; ++i)
{
InitValue(INTEGER_SEED, h_in[i], i);
}
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice));
printf("TestNative "
"grid_size(%d) "
"guarded_elements(%d) "
"unguarded_elements(%d) "
"BLOCK_THREADS(%d) "
"ITEMS_PER_THREAD(%d) "
"LOAD_ALGORITHM(%d) "
"STORE_ALGORITHM(%d) "
"sizeof(T)(%d)\n",
grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, (int) sizeof(T));
TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
h_in,
(T const *) d_in, // Test const
d_out_unguarded,
d_out_guarded,
d_out_unguarded,
d_out_guarded,
grid_size,
guarded_elements);
// Cleanup
if (h_in) free(h_in);
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
}
/**
 * Test native pointer. Specialized for insufficient resources.
 */
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM>
void TestNative(
int /*grid_size*/,
float /*fraction_valid*/,
Int2Type<false> /*sufficient_resources*/)
{}
/**
* Test iterator. Specialized for sufficient resources.
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM,
CacheLoadModifier LOAD_MODIFIER,
CacheStoreModifier STORE_MODIFIER>
void TestIterator(
int grid_size,
float fraction_valid,
Int2Type<true> /*sufficient_resources*/)
{
int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
int guarded_elements = int(fraction_valid * float(unguarded_elements));
// Allocate host arrays
T *h_in = (T*) malloc(unguarded_elements * sizeof(T));
// Allocate device arrays
T *d_in = NULL;
T *d_out_unguarded = NULL;
T *d_out_guarded = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements));
CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements));
CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements));
// Initialize problem on host and device
for (int i = 0; i < unguarded_elements; ++i)
{
InitValue(INTEGER_SEED, h_in[i], i);
}
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice));
printf("TestIterator "
"grid_size(%d) "
"guarded_elements(%d) "
"unguarded_elements(%d) "
"BLOCK_THREADS(%d) "
"ITEMS_PER_THREAD(%d) "
"LOAD_ALGORITHM(%d) "
"STORE_ALGORITHM(%d) "
"LOAD_MODIFIER(%d) "
"STORE_MODIFIER(%d) "
"sizeof(T)(%d)\n",
grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_MODIFIER, STORE_MODIFIER, (int) sizeof(T));
TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
h_in,
CacheModifiedInputIterator<LOAD_MODIFIER, T>(d_in),
CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_unguarded),
CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_guarded),
d_out_unguarded,
d_out_guarded,
grid_size,
guarded_elements);
// Cleanup
if (h_in) free(h_in);
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
}
/**
* Test iterator. Specialized for insufficient resources.
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM,
CacheLoadModifier LOAD_MODIFIER,
CacheStoreModifier STORE_MODIFIER>
void TestIterator(
int /*grid_size*/,
float /*fraction_valid*/,
Int2Type<false> /*sufficient_resources*/)
{}
/**
* Evaluate different pointer access types
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM>
void TestPointerType(
int grid_size,
float fraction_valid)
{
// Threadblock load/store abstraction types
typedef BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM> BlockLoad;
typedef BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore;
#if defined(SM100) || defined(SM110) || defined(SM130)
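// SM 1.x targets: 16 KB of shared memory per block and at most 512 threads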
static const bool sufficient_load_smem = sizeof(typename BlockLoad::TempStorage) <= 1024 * 16;
static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 16;
static const bool sufficient_threads = BLOCK_THREADS <= 512;
#else
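// SM 2.0 and newer: 48 KB of shared memory per block and at most 1024 threads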
static const bool sufficient_load_smem = sizeof(typename BlockLoad::TempStorage) <= 1024 * 48;
static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 48;
static const bool sufficient_threads = BLOCK_THREADS <= 1024;
#endif
static const bool sufficient_resources = sufficient_load_smem && sufficient_store_smem && sufficient_threads;
TestNative<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(grid_size, fraction_valid, Int2Type<sufficient_resources>());
TestIterator<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_DEFAULT, STORE_DEFAULT>(grid_size, fraction_valid, Int2Type<sufficient_resources>());
}
/**
 * Evaluate different time-slicing strategies.  (Time slicing is selected
 * through the *_TIMESLICED algorithm enumerators rather than a separate
 * template parameter, so this helper simply forwards to TestPointerType.)
 */
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm LOAD_ALGORITHM,
BlockStoreAlgorithm STORE_ALGORITHM>
void TestSlicedStrategy(
int grid_size,
float fraction_valid)
{
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(grid_size, fraction_valid);
}
/**
* Evaluate different load/store strategies (specialized for block sizes that are not a multiple of 32)
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
void TestStrategy(
int grid_size,
float fraction_valid,
Int2Type<false> /*is_warp_multiple*/)
{
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, BLOCK_STORE_DIRECT>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, BLOCK_STORE_TRANSPOSE>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_VECTORIZE, BLOCK_STORE_VECTORIZE>(grid_size, fraction_valid);
}
/**
* Evaluate different load/store strategies (specialized for block sizes that are a multiple of 32)
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
void TestStrategy(
int grid_size,
float fraction_valid,
Int2Type<true> /*is_warp_multiple*/)
{
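// First run the strategies that are valid for any block size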
TestStrategy<T, BLOCK_THREADS, ITEMS_PER_THREAD>(grid_size, fraction_valid, Int2Type<false>());
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED>(grid_size, fraction_valid);
}
/**
 * Evaluate different register blocking (items per thread)
 */
template <
typename T,
int BLOCK_THREADS>
void TestItemsPerThread(
int grid_size,
float fraction_valid)
{
Int2Type<BLOCK_THREADS % 32 == 0> is_warp_multiple;
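// Per-thread tile sizes cover both vector-friendly and odd, non-vectorizable cases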
TestStrategy<T, BLOCK_THREADS, 1>(grid_size, fraction_valid, is_warp_multiple);
TestStrategy<T, BLOCK_THREADS, 3>(grid_size, fraction_valid, is_warp_multiple);
TestStrategy<T, BLOCK_THREADS, 4>(grid_size, fraction_valid, is_warp_multiple);
TestStrategy<T, BLOCK_THREADS, 11>(grid_size, fraction_valid, is_warp_multiple);
}
/**
* Evaluate different thread block sizes
*/
template <typename T>
void TestThreads(
int grid_size,
float fraction_valid)
{
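// Both warp-multiple (32, 96, 128) and non-warp-multiple (15, 72) block sizes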
TestItemsPerThread<T, 15>(grid_size, fraction_valid);
TestItemsPerThread<T, 32>(grid_size, fraction_valid);
TestItemsPerThread<T, 72>(grid_size, fraction_valid);
TestItemsPerThread<T, 96>(grid_size, fraction_valid);
TestItemsPerThread<T, 128>(grid_size, fraction_valid);
}
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--v] "
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Get PTX version
int ptx_version = 0;
CubDebugExit(PtxVersion(ptx_version));
#ifdef QUICK_TEST
// Compile/run quick tests
TestNative< int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(1, 0.8f, Int2Type<true>());
TestIterator< int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE, LOAD_DEFAULT, STORE_DEFAULT>(1, 0.8f, Int2Type<true>());
#else
// Compile/run thorough tests
TestThreads<char>(2, 0.8f);
TestThreads<int>(2, 0.8f);
TestThreads<long>(2, 0.8f);
TestThreads<long2>(2, 0.8f);
if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
TestThreads<double2>(2, 0.8f);
TestThreads<TestFoo>(2, 0.8f);
TestThreads<TestBar>(2, 0.8f);
#endif
return 0;
}