Spaces:

ma-xu
/

LIVE

Runtime error

LIVE / thrust /dependencies /cub /test /test_iterator.cu

Xu Ma

update

1c3c0d9 almost 3 years ago

25.6 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/******************************************************************************
	* Test of iterator utilities
	******************************************************************************/

	// Ensure printing of CUDA runtime errors to console
	#define CUB_STDERR

	#include <iterator>
	#include <stdio.h>
	#include <typeinfo>

	#include <cub/iterator/arg_index_input_iterator.cuh>
	#include <cub/iterator/cache_modified_input_iterator.cuh>
	#include <cub/iterator/cache_modified_output_iterator.cuh>
	#include <cub/iterator/constant_input_iterator.cuh>
	#include <cub/iterator/counting_input_iterator.cuh>
	#include <cub/iterator/tex_obj_input_iterator.cuh>
	#include <cub/iterator/tex_ref_input_iterator.cuh>
	#include <cub/iterator/transform_input_iterator.cuh>

	#include <cub/util_type.cuh>
	#include <cub/util_allocator.cuh>

	#include "test_util.h"

	#include <thrust/device_ptr.h>
	#include <thrust/copy.h>

	using namespace cub;


	//---------------------------------------------------------------------
	// Globals, constants and typedefs
	//---------------------------------------------------------------------

	bool g_verbose = false;
	CachingDeviceAllocator g_allocator(true);

	// Dispatch types
	enum Backend
	{
	CUB, // CUB method
	THRUST, // Thrust method
	CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
	};


	template <typename T>
	struct TransformOp
	{
	// Increment transform
	__host__ __device__ __forceinline__ T operator()(T input) const
	{
	T addend;
	InitValue(INTEGER_SEED, addend, 1);
	return input + addend;
	}
	};

	struct SelectOp
	{
	template <typename T>
	__host__ __device__ __forceinline__ bool operator()(T input)
	{
	return true;
	}
	};


	//---------------------------------------------------------------------
	// Test kernels
	//---------------------------------------------------------------------

	/**
	* Test random access input iterator
	*/
	template <
	typename InputIteratorT,
	typename T>
	__global__ void Kernel(
	InputIteratorT d_in,
	T *d_out,
	InputIteratorT *d_itrs)
	{
	d_out[0] = *d_in; // Value at offset 0
	d_out[1] = d_in[100]; // Value at offset 100
	d_out[2] = *(d_in + 1000); // Value at offset 1000
	d_out[3] = *(d_in + 10000); // Value at offset 10000

	d_in++;
	d_out[4] = d_in[0]; // Value at offset 1

	d_in += 20;
	d_out[5] = d_in[0]; // Value at offset 21
	d_itrs[0] = d_in; // Iterator at offset 21

	d_in -= 10;
	d_out[6] = d_in[0]; // Value at offset 11;

	d_in -= 11;
	d_out[7] = d_in[0]; // Value at offset 0
	d_itrs[1] = d_in; // Iterator at offset 0
	}



	//---------------------------------------------------------------------
	// Host testing subroutines
	//---------------------------------------------------------------------


	/**
	* Run iterator test on device
	*/
	template <
	typename InputIteratorT,
	typename T,
	int TEST_VALUES>
	void Test(
	InputIteratorT d_in,
	T (&h_reference)[TEST_VALUES])
	{
	// Allocate device arrays
	T *d_out = NULL;
	InputIteratorT *d_itrs = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_out, sizeof(T) TEST_VALUES));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_itrs, sizeof(InputIteratorT) 2));

	int compare;

	// Run unguarded kernel
	Kernel<<<1, 1>>>(d_in, d_out, d_itrs);

	CubDebugExit(cudaPeekAtLastError());
	CubDebugExit(cudaDeviceSynchronize());

	// Check results
	compare = CompareDeviceResults(h_reference, d_out, TEST_VALUES, g_verbose, g_verbose);
	printf("\tValues: %s\n", (compare) ? "FAIL" : "PASS");
	AssertEquals(0, compare);

	// Check iterator at offset 21
	InputIteratorT h_itr = d_in + 21;
	compare = CompareDeviceResults(&h_itr, d_itrs, 1, g_verbose, g_verbose);
	printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS");
	AssertEquals(0, compare);

	// Check iterator at offset 0
	compare = CompareDeviceResults(&d_in, d_itrs + 1, 1, g_verbose, g_verbose);
	printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS");
	AssertEquals(0, compare);

	// Cleanup
	if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
	if (d_itrs) CubDebugExit(g_allocator.DeviceFree(d_itrs));
	}


	/**
	* Test constant iterator
	*/
	template <typename T>
	void TestConstant(T base)
	{
	printf("\nTesting constant iterator on type %s (base: %lld)\n", typeid(T).name(), (unsigned long long) (base)); fflush(stdout);

	//
	// Test iterator manipulation in kernel
	//

	T h_reference[8] = {base, base, base, base, base, base, base, base};
	ConstantInputIterator<T> d_itr(base);
	Test(d_itr, h_reference);

	#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer

	//
	// Test with thrust::copy_if()
	//

	int copy_items = 100;
	T *h_copy = new T[copy_items];
	T *d_copy = NULL;

	for (int i = 0; i < copy_items; ++i)
	h_copy[i] = d_itr[i];

	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_copy, sizeof(T) copy_items));
	thrust::device_ptr<T> d_copy_wrapper(d_copy);

	thrust::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp());

	int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose);
	printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
	AssertEquals(0, compare);

	if (h_copy) delete[] h_copy;
	if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));

	#endif // THRUST_VERSION
	}


	/**
	* Test counting iterator
	*/
	template <typename T>
	void TestCounting(T base)
	{
	printf("\nTesting counting iterator on type %s (base: %d) \n", typeid(T).name(), int(base)); fflush(stdout);

	//
	// Test iterator manipulation in kernel
	//

	// Initialize reference data
	T h_reference[8];
	h_reference[0] = base + 0; // Value at offset 0
	h_reference[1] = base + 100; // Value at offset 100
	h_reference[2] = base + 1000; // Value at offset 1000
	h_reference[3] = base + 10000; // Value at offset 10000
	h_reference[4] = base + 1; // Value at offset 1
	h_reference[5] = base + 21; // Value at offset 21
	h_reference[6] = base + 11; // Value at offset 11
	h_reference[7] = base + 0; // Value at offset 0;

	CountingInputIterator<T> d_itr(base);
	Test(d_itr, h_reference);

	#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer

	//
	// Test with thrust::copy_if()
	//

	unsigned long long max_items = ((1ull << ((sizeof(T) * 8) - 1)) - 1);
	size_t copy_items = (size_t) CUB_MIN(max_items - base, 100); // potential issue with differencing overflows when T is a smaller type than can handle the offset
	T *h_copy = new T[copy_items];
	T *d_copy = NULL;

	for (unsigned long long i = 0; i < copy_items; ++i)
	h_copy[i] = d_itr[i];

	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_copy, sizeof(T) copy_items));
	thrust::device_ptr<T> d_copy_wrapper(d_copy);
	thrust::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp());

	int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose);
	printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
	AssertEquals(0, compare);

	if (h_copy) delete[] h_copy;
	if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));

	#endif // THRUST_VERSION
	}


	/**
	* Test modified iterator
	*/
	template <typename T, typename CastT>
	void TestModified()
	{
	printf("\nTesting cache-modified iterator on type %s\n", typeid(T).name()); fflush(stdout);

	//
	// Test iterator manipulation in kernel
	//

	constexpr int TEST_VALUES = 11000;

	T *h_data = new T[TEST_VALUES];
	for (int i = 0; i < TEST_VALUES; ++i)
	{
	RandomBits(h_data[i]);
	}

	// Allocate device arrays
	T *d_data = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_data, sizeof(T) TEST_VALUES));
	CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));

	// Initialize reference data
	T h_reference[8];
	h_reference[0] = h_data[0]; // Value at offset 0
	h_reference[1] = h_data[100]; // Value at offset 100
	h_reference[2] = h_data[1000]; // Value at offset 1000
	h_reference[3] = h_data[10000]; // Value at offset 10000
	h_reference[4] = h_data[1]; // Value at offset 1
	h_reference[5] = h_data[21]; // Value at offset 21
	h_reference[6] = h_data[11]; // Value at offset 11
	h_reference[7] = h_data[0]; // Value at offset 0;

	Test(CacheModifiedInputIterator<LOAD_DEFAULT, T>((CastT*) d_data), h_reference);
	Test(CacheModifiedInputIterator<LOAD_CA, T>((CastT*) d_data), h_reference);
	Test(CacheModifiedInputIterator<LOAD_CG, T>((CastT*) d_data), h_reference);
	Test(CacheModifiedInputIterator<LOAD_CS, T>((CastT*) d_data), h_reference);
	Test(CacheModifiedInputIterator<LOAD_CV, T>((CastT*) d_data), h_reference);
	Test(CacheModifiedInputIterator<LOAD_LDG, T>((CastT*) d_data), h_reference);
	Test(CacheModifiedInputIterator<LOAD_VOLATILE, T>((CastT*) d_data), h_reference);

	#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer

	//
	// Test with thrust::copy_if()
	//

	T *d_copy = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_copy, sizeof(T) TEST_VALUES));

	CacheModifiedInputIterator<LOAD_CG, T> d_in_itr((CastT*) d_data);
	CacheModifiedOutputIterator<STORE_CG, T> d_out_itr((CastT*) d_copy);

	thrust::copy_if(d_in_itr, d_in_itr + TEST_VALUES, d_out_itr, SelectOp());

	int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
	printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
	AssertEquals(0, compare);

	// Cleanup
	if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));

	#endif // THRUST_VERSION

	if (h_data) delete[] h_data;
	if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
	}


	/**
	* Test transform iterator
	*/
	template <typename T, typename CastT>
	void TestTransform()
	{
	printf("\nTesting transform iterator on type %s\n", typeid(T).name()); fflush(stdout);

	//
	// Test iterator manipulation in kernel
	//

	constexpr int TEST_VALUES = 11000;

	T *h_data = new T[TEST_VALUES];
	for (int i = 0; i < TEST_VALUES; ++i)
	{
	InitValue(INTEGER_SEED, h_data[i], i);
	}

	// Allocate device arrays
	T *d_data = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_data, sizeof(T) TEST_VALUES));
	CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));

	TransformOp<T> op;

	// Initialize reference data
	T h_reference[8];
	h_reference[0] = op(h_data[0]); // Value at offset 0
	h_reference[1] = op(h_data[100]); // Value at offset 100
	h_reference[2] = op(h_data[1000]); // Value at offset 1000
	h_reference[3] = op(h_data[10000]); // Value at offset 10000
	h_reference[4] = op(h_data[1]); // Value at offset 1
	h_reference[5] = op(h_data[21]); // Value at offset 21
	h_reference[6] = op(h_data[11]); // Value at offset 11
	h_reference[7] = op(h_data[0]); // Value at offset 0;

	TransformInputIterator<T, TransformOp<T>, CastT> d_itr((CastT) d_data, op);
	Test(d_itr, h_reference);

	#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer

	//
	// Test with thrust::copy_if()
	//

	T *h_copy = new T[TEST_VALUES];
	for (int i = 0; i < TEST_VALUES; ++i)
	h_copy[i] = op(h_data[i]);

	T *d_copy = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_copy, sizeof(T) TEST_VALUES));
	thrust::device_ptr<T> d_copy_wrapper(d_copy);

	thrust::copy_if(d_itr, d_itr + TEST_VALUES, d_copy_wrapper, SelectOp());

	int compare = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose);
	printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
	AssertEquals(0, compare);

	// Cleanup
	if (h_copy) delete[] h_copy;
	if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));

	#endif // THRUST_VERSION

	if (h_data) delete[] h_data;
	if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
	}


	/**
	* Test tex-obj texture iterator
	*/
	template <typename T, typename CastT>
	void TestTexObj()
	{
	printf("\nTesting tex-obj iterator on type %s\n", typeid(T).name()); fflush(stdout);

	//
	// Test iterator manipulation in kernel
	//

	const unsigned int TEST_VALUES = 11000;
	const unsigned int DUMMY_OFFSET = 500;
	const unsigned int DUMMY_TEST_VALUES = TEST_VALUES - DUMMY_OFFSET;

	T *h_data = new T[TEST_VALUES];
	for (int i = 0; i < TEST_VALUES; ++i)
	{
	RandomBits(h_data[i]);
	}

	// Allocate device arrays
	T *d_data = NULL;
	T *d_dummy = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_data, sizeof(T) TEST_VALUES));
	CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));

	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_dummy, sizeof(T) DUMMY_TEST_VALUES));
	CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));

	// Initialize reference data
	T h_reference[8];
	h_reference[0] = h_data[0]; // Value at offset 0
	h_reference[1] = h_data[100]; // Value at offset 100
	h_reference[2] = h_data[1000]; // Value at offset 1000
	h_reference[3] = h_data[10000]; // Value at offset 10000
	h_reference[4] = h_data[1]; // Value at offset 1
	h_reference[5] = h_data[21]; // Value at offset 21
	h_reference[6] = h_data[11]; // Value at offset 11
	h_reference[7] = h_data[0]; // Value at offset 0;

	// Create and bind obj-based test iterator
	TexObjInputIterator<T> d_obj_itr;
	CubDebugExit(d_obj_itr.BindTexture((CastT) d_data, sizeof(T) TEST_VALUES));

	Test(d_obj_itr, h_reference);

	#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer

	//
	// Test with thrust::copy_if()
	//

	T *d_copy = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_copy, sizeof(T) TEST_VALUES));
	thrust::device_ptr<T> d_copy_wrapper(d_copy);

	CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES));
	thrust::copy_if(d_obj_itr, d_obj_itr + TEST_VALUES, d_copy_wrapper, SelectOp());

	int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
	printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
	AssertEquals(0, compare);

	// Cleanup
	CubDebugExit(d_obj_itr.UnbindTexture());

	if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));

	#endif // THRUST_VERSION

	if (h_data) delete[] h_data;
	if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
	if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
	}


	#if CUDART_VERSION >= 5050

	/**
	* Test tex-ref texture iterator
	*/
	template <typename T, typename CastT>
	void TestTexRef()
	{
	printf("\nTesting tex-ref iterator on type %s\n", typeid(T).name()); fflush(stdout);

	//
	// Test iterator manipulation in kernel
	//

	constexpr int TEST_VALUES = 11000;
	constexpr unsigned int DUMMY_OFFSET = 500;
	constexpr unsigned int DUMMY_TEST_VALUES = TEST_VALUES - DUMMY_OFFSET;

	T *h_data = new T[TEST_VALUES];
	for (int i = 0; i < TEST_VALUES; ++i)
	{
	RandomBits(h_data[i]);
	}

	// Allocate device arrays
	T *d_data = NULL;
	T *d_dummy = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_data, sizeof(T) TEST_VALUES));
	CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));

	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_dummy, sizeof(T) DUMMY_TEST_VALUES));
	CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));

	// Initialize reference data
	T h_reference[8];
	h_reference[0] = h_data[0]; // Value at offset 0
	h_reference[1] = h_data[100]; // Value at offset 100
	h_reference[2] = h_data[1000]; // Value at offset 1000
	h_reference[3] = h_data[10000]; // Value at offset 10000
	h_reference[4] = h_data[1]; // Value at offset 1
	h_reference[5] = h_data[21]; // Value at offset 21
	h_reference[6] = h_data[11]; // Value at offset 11
	h_reference[7] = h_data[0]; // Value at offset 0;

	// Create and bind ref-based test iterator
	TexRefInputIterator<T, __LINE__> d_ref_itr;
	CubDebugExit(d_ref_itr.BindTexture((CastT) d_data, sizeof(T) TEST_VALUES));

	// Create and bind dummy iterator of same type to check with interferance
	TexRefInputIterator<T, __LINE__> d_ref_itr2;
	CubDebugExit(d_ref_itr2.BindTexture((CastT) d_dummy, sizeof(T) DUMMY_TEST_VALUES));

	Test(d_ref_itr, h_reference);

	#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer

	//
	// Test with thrust::copy_if()
	//

	T *d_copy = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_copy, sizeof(T) TEST_VALUES));
	thrust::device_ptr<T> d_copy_wrapper(d_copy);

	CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES));
	thrust::copy_if(d_ref_itr, d_ref_itr + TEST_VALUES, d_copy_wrapper, SelectOp());

	int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
	printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
	AssertEquals(0, compare);

	if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));

	#endif // THRUST_VERSION

	CubDebugExit(d_ref_itr.UnbindTexture());
	CubDebugExit(d_ref_itr2.UnbindTexture());

	if (h_data) delete[] h_data;
	if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
	if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
	}


	/**
	* Test texture transform iterator
	*/
	template <typename T, typename CastT>
	void TestTexTransform()
	{
	printf("\nTesting tex-transform iterator on type %s\n", typeid(T).name()); fflush(stdout);

	//
	// Test iterator manipulation in kernel
	//

	constexpr int TEST_VALUES = 11000;

	T *h_data = new T[TEST_VALUES];
	for (int i = 0; i < TEST_VALUES; ++i)
	{
	InitValue(INTEGER_SEED, h_data[i], i);
	}

	// Allocate device arrays
	T *d_data = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_data, sizeof(T) TEST_VALUES));
	CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));

	TransformOp<T> op;

	// Initialize reference data
	T h_reference[8];
	h_reference[0] = op(h_data[0]); // Value at offset 0
	h_reference[1] = op(h_data[100]); // Value at offset 100
	h_reference[2] = op(h_data[1000]); // Value at offset 1000
	h_reference[3] = op(h_data[10000]); // Value at offset 10000
	h_reference[4] = op(h_data[1]); // Value at offset 1
	h_reference[5] = op(h_data[21]); // Value at offset 21
	h_reference[6] = op(h_data[11]); // Value at offset 11
	h_reference[7] = op(h_data[0]); // Value at offset 0;

	// Create and bind texture iterator
	typedef TexRefInputIterator<T, __LINE__> TextureIterator;

	TextureIterator d_tex_itr;
	CubDebugExit(d_tex_itr.BindTexture((CastT) d_data, sizeof(T) TEST_VALUES));

	// Create transform iterator
	TransformInputIterator<T, TransformOp<T>, TextureIterator> xform_itr(d_tex_itr, op);

	Test(xform_itr, h_reference);

	#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer

	//
	// Test with thrust::copy_if()
	//

	T *h_copy = new T[TEST_VALUES];
	for (int i = 0; i < TEST_VALUES; ++i)
	h_copy[i] = op(h_data[i]);

	T *d_copy = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_copy, sizeof(T) TEST_VALUES));
	thrust::device_ptr<T> d_copy_wrapper(d_copy);

	thrust::copy_if(xform_itr, xform_itr + TEST_VALUES, d_copy_wrapper, SelectOp());

	int compare = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose);
	printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
	AssertEquals(0, compare);

	// Cleanup
	if (h_copy) delete[] h_copy;
	if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));

	#endif // THRUST_VERSION

	CubDebugExit(d_tex_itr.UnbindTexture());
	if (h_data) delete[] h_data;
	if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
	}

	#endif // CUDART_VERSION




	/**
	* Run non-integer tests
	*/
	template <typename T, typename CastT>
	void Test(Int2Type<false> /* is_integer */)
	{
	TestModified<T, CastT>();
	TestTransform<T, CastT>();

	#if CUB_CDP
	// Test tex-obj iterators if CUDA dynamic parallelism enabled
	TestTexObj<T, CastT>(type_string);
	#endif // CUB_CDP

	#if CUDART_VERSION >= 5050
	// Test tex-ref iterators for CUDA 5.5
	TestTexRef<T, CastT>();
	TestTexTransform<T, CastT>();
	#endif // CUDART_VERSION
	}

	/**
	* Run integer tests
	*/
	template <typename T, typename CastT>
	void Test(Int2Type<true> /* is_integer */)
	{
	TestConstant<T>(0);
	TestConstant<T>(99);

	TestCounting<T>(0);
	TestCounting<T>(99);

	// Run non-integer tests
	Test<T, CastT>(Int2Type<false>());
	}

	/**
	* Run tests
	*/
	template <typename T>
	void Test()
	{
	enum {
	IS_INTEGER = (Traits<T>::CATEGORY == SIGNED_INTEGER) \|\| (Traits<T>::CATEGORY == UNSIGNED_INTEGER)
	};

	// Test non-const type
	Test<T, T>(Int2Type<IS_INTEGER>());

	// Test non-const type
	Test<T, const T>(Int2Type<IS_INTEGER>());
	}


	/**
	* Main
	*/
	int main(int argc, char** argv)
	{
	// Initialize command line
	CommandLineArgs args(argc, argv);
	g_verbose = args.CheckCmdLineFlag("v");

	// Print usage
	if (args.CheckCmdLineFlag("help"))
	{
	printf("%s "
	"[--device=<device-id>] "
	"[--v] "
	"\n", argv[0]);
	exit(0);
	}

	// Initialize device
	CubDebugExit(args.DeviceInit());

	// Get ptx version
	int ptx_version = 0;
	CubDebugExit(PtxVersion(ptx_version));

	// Evaluate different data types
	Test<char>();
	Test<short>();
	Test<int>();
	Test<long>();
	Test<long long>();
	Test<float>();
	if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
	Test<double>();

	Test<char2>();
	Test<short2>();
	Test<int2>();
	Test<long2>();
	Test<longlong2>();
	Test<float2>();
	if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
	Test<double2>();

	Test<char3>();
	Test<short3>();
	Test<int3>();
	Test<long3>();
	Test<longlong3>();
	Test<float3>();
	if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
	Test<double3>();

	Test<char4>();
	Test<short4>();
	Test<int4>();
	Test<long4>();
	Test<longlong4>();
	Test<float4>();
	if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
	Test<double4>();

	Test<TestFoo>();
	Test<TestBar>();

	printf("\nTest complete\n"); fflush(stdout);

	return 0;
	}