Spaces:

ma-xu
/

LIVE

Runtime error

LIVE / thrust /cub /device /dispatch /dispatch_histogram.cuh

Xu Ma

update

1c3c0d9 over 3 years ago

57.4 kB


	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* \file
	* cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
	*/

	#pragma once

	#include <stdio.h>
	#include <iterator>
	#include <limits>

	#include "../../agent/agent_histogram.cuh"
	#include "../../util_debug.cuh"
	#include "../../util_device.cuh"
	#include "../../thread/thread_search.cuh"
	#include "../../grid/grid_queue.cuh"
	#include "../../config.cuh"

	#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

	/// Optional outer namespace(s)
	CUB_NS_PREFIX

	/// CUB namespace
	namespace cub {



	/******************************************************************************
	* Histogram kernel entry points
	*****************************************************************************/

	/**
	* Histogram initialization kernel entry point
	*/
	template <
	int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
	typename CounterT, ///< Integer type for counting sample occurrences per histogram bin
	typename OffsetT> ///< Signed integer type for global offsets
	__global__ void DeviceHistogramInitKernel(
	ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper, ///< Number of output histogram bins per channel
	ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper, ///< Histogram counter data having logical dimensions <tt>CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]]</tt>
	GridQueue<int> tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
	{
	if ((threadIdx.x == 0) && (blockIdx.x == 0))
	tile_queue.ResetDrain();

	int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;

	#pragma unroll
	for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
	{
	if (output_bin < num_output_bins_wrapper.array[CHANNEL])
	d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
	}
	}


	/**
	* Histogram privatized sweep kernel entry point (multi-block). Computes privatized histograms, one per thread block.
	*/
	template <
	typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type
	int PRIVATIZED_SMEM_BINS, ///< Maximum number of histogram bins per channel (e.g., up to 256)
	int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
	int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
	typename SampleIteratorT, ///< The input iterator type. \iterator.
	typename CounterT, ///< Integer type for counting sample occurrences per histogram bin
	typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
	typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
	typename OffsetT> ///< Signed integer type for global offsets
	__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
	__global__ void DeviceHistogramSweepKernel(
	SampleIteratorT d_samples, ///< Input data to reduce
	ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper, ///< The number bins per final output histogram
	ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper, ///< The number bins per privatized histogram
	ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper, ///< Reference to final output histograms
	ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper, ///< Reference to privatized histograms
	ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper, ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
	ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper, ///< The transform operator for determining privatized counter indices from samples, one for each channel
	OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest
	OffsetT num_rows, ///< The number of rows in the region of interest
	OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest
	int tiles_per_row, ///< Number of image tiles per row
	GridQueue<int> tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
	{
	// Thread block type for compositing input tiles
	typedef AgentHistogram<
	AgentHistogramPolicyT,
	PRIVATIZED_SMEM_BINS,
	NUM_CHANNELS,
	NUM_ACTIVE_CHANNELS,
	SampleIteratorT,
	CounterT,
	PrivatizedDecodeOpT,
	OutputDecodeOpT,
	OffsetT>
	AgentHistogramT;

	// Shared memory for AgentHistogram
	__shared__ typename AgentHistogramT::TempStorage temp_storage;

	AgentHistogramT agent(
	temp_storage,
	d_samples,
	num_output_bins_wrapper.array,
	num_privatized_bins_wrapper.array,
	d_output_histograms_wrapper.array,
	d_privatized_histograms_wrapper.array,
	output_decode_op_wrapper.array,
	privatized_decode_op_wrapper.array);

	// Initialize counters
	agent.InitBinCounters();

	// Consume input tiles
	agent.ConsumeTiles(
	num_row_pixels,
	num_rows,
	row_stride_samples,
	tiles_per_row,
	tile_queue);

	// Store output to global (if necessary)
	agent.StoreOutput();

	}






	/******************************************************************************
	* Dispatch
	******************************************************************************/

	/**
	* Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
	*/
	template <
	int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
	int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
	typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator
	typename CounterT, ///< Integer type for counting sample occurrences per histogram bin
	typename LevelT, ///< Type for specifying bin level boundaries
	typename OffsetT> ///< Signed integer type for global offsets
	struct DipatchHistogram
	{
	//---------------------------------------------------------------------
	// Types and constants
	//---------------------------------------------------------------------

	/// The sample value type of the input iterator
	typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

	enum
	{
	// Maximum number of bins per channel for which we will use a privatized smem strategy
	MAX_PRIVATIZED_SMEM_BINS = 256
	};


	//---------------------------------------------------------------------
	// Transform functors for converting samples to bin-ids
	//---------------------------------------------------------------------

	// Searches for bin given a list of bin-boundary levels
	template <typename LevelIteratorT>
	struct SearchTransform
	{
	LevelIteratorT d_levels; // Pointer to levels array
	int num_output_levels; // Number of levels in array

	// Initializer
	__host__ __device__ __forceinline__ void Init(
	LevelIteratorT d_levels, // Pointer to levels array
	int num_output_levels) // Number of levels in array
	{
	this->d_levels = d_levels;
	this->num_output_levels = num_output_levels;
	}

	// Method for converting samples to bin-ids
	template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
	__host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
	{
	/// Level iterator wrapper type
	typedef typename If<IsPointer<LevelIteratorT>::VALUE,
	CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
	LevelIteratorT>::Type // Directly use the supplied input iterator type
	WrappedLevelIteratorT;

	WrappedLevelIteratorT wrapped_levels(d_levels);

	int num_bins = num_output_levels - 1;
	if (valid)
	{
	bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
	if (bin >= num_bins)
	bin = -1;
	}
	}
	};


	// Scales samples to evenly-spaced bins
	struct ScaleTransform
	{
	int num_bins; // Number of levels in array
	LevelT max; // Max sample level (exclusive)
	LevelT min; // Min sample level (inclusive)
	LevelT scale; // Bin scaling factor

	// Initializer
	template <typename _LevelT>
	__host__ __device__ __forceinline__ void Init(
	int num_output_levels, // Number of levels in array
	_LevelT max, // Max sample level (exclusive)
	_LevelT min, // Min sample level (inclusive)
	_LevelT scale) // Bin scaling factor
	{
	this->num_bins = num_output_levels - 1;
	this->max = max;
	this->min = min;
	this->scale = scale;
	}

	// Initializer (float specialization)
	__host__ __device__ __forceinline__ void Init(
	int num_output_levels, // Number of levels in array
	float max, // Max sample level (exclusive)
	float min, // Min sample level (inclusive)
	float scale) // Bin scaling factor
	{
	this->num_bins = num_output_levels - 1;
	this->max = max;
	this->min = min;
	this->scale = float(1.0) / scale;
	}

	// Initializer (double specialization)
	__host__ __device__ __forceinline__ void Init(
	int num_output_levels, // Number of levels in array
	double max, // Max sample level (exclusive)
	double min, // Min sample level (inclusive)
	double scale) // Bin scaling factor
	{
	this->num_bins = num_output_levels - 1;
	this->max = max;
	this->min = min;
	this->scale = double(1.0) / scale;
	}

	// Method for converting samples to bin-ids
	template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
	__host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
	{
	LevelT level_sample = (LevelT) sample;

	if (valid && (level_sample >= min) && (level_sample < max))
	bin = (int) ((level_sample - min) / scale);
	}

	// Method for converting samples to bin-ids (float specialization)
	template <CacheLoadModifier LOAD_MODIFIER>
	__host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
	{
	LevelT level_sample = (LevelT) sample;

	if (valid && (level_sample >= min) && (level_sample < max))
	bin = (int) ((level_sample - min) * scale);
	}

	// Method for converting samples to bin-ids (double specialization)
	template <CacheLoadModifier LOAD_MODIFIER>
	__host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
	{
	LevelT level_sample = (LevelT) sample;

	if (valid && (level_sample >= min) && (level_sample < max))
	bin = (int) ((level_sample - min) * scale);
	}
	};


	// Pass-through bin transform operator
	struct PassThruTransform
	{
	// Method for converting samples to bin-ids
	template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
	__host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
	{
	if (valid)
	bin = (int) sample;
	}
	};



	//---------------------------------------------------------------------
	// Tuning policies
	//---------------------------------------------------------------------

	template <int NOMINAL_ITEMS_PER_THREAD>
	struct TScale
	{
	enum
	{
	V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
	VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
	};
	};


	/// SM11
	struct Policy110
	{
	// HistogramSweepPolicy
	typedef AgentHistogramPolicy<
	512,
	(NUM_CHANNELS == 1) ? 8 : 2,
	BLOCK_LOAD_DIRECT,
	LOAD_DEFAULT,
	true,
	GMEM,
	false>
	HistogramSweepPolicy;
	};

	/// SM20
	struct Policy200
	{
	// HistogramSweepPolicy
	typedef AgentHistogramPolicy<
	(NUM_CHANNELS == 1) ? 256 : 128,
	(NUM_CHANNELS == 1) ? 8 : 3,
	(NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
	LOAD_DEFAULT,
	true,
	SMEM,
	false>
	HistogramSweepPolicy;
	};

	/// SM30
	struct Policy300
	{
	// HistogramSweepPolicy
	typedef AgentHistogramPolicy<
	512,
	(NUM_CHANNELS == 1) ? 8 : 2,
	BLOCK_LOAD_DIRECT,
	LOAD_DEFAULT,
	true,
	GMEM,
	false>
	HistogramSweepPolicy;
	};

	/// SM35
	struct Policy350
	{
	// HistogramSweepPolicy
	typedef AgentHistogramPolicy<
	128,
	TScale<8>::VALUE,
	BLOCK_LOAD_DIRECT,
	LOAD_LDG,
	true,
	BLEND,
	true>
	HistogramSweepPolicy;
	};

	/// SM50
	struct Policy500
	{
	// HistogramSweepPolicy
	typedef AgentHistogramPolicy<
	384,
	TScale<16>::VALUE,
	BLOCK_LOAD_DIRECT,
	LOAD_LDG,
	true,
	SMEM,
	false>
	HistogramSweepPolicy;
	};



	//---------------------------------------------------------------------
	// Tuning policies of current PTX compiler pass
	//---------------------------------------------------------------------

	#if (CUB_PTX_ARCH >= 500)
	typedef Policy500 PtxPolicy;

	#elif (CUB_PTX_ARCH >= 350)
	typedef Policy350 PtxPolicy;

	#elif (CUB_PTX_ARCH >= 300)
	typedef Policy300 PtxPolicy;

	#elif (CUB_PTX_ARCH >= 200)
	typedef Policy200 PtxPolicy;

	#else
	typedef Policy110 PtxPolicy;

	#endif

	// "Opaque" policies (whose parameterizations aren't reflected in the type signature)
	struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};


	//---------------------------------------------------------------------
	// Utilities
	//---------------------------------------------------------------------

	/**
	* Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
	*/
	template <typename KernelConfig>
	CUB_RUNTIME_FUNCTION __forceinline__
	static cudaError_t InitConfigs(
	int ptx_version,
	KernelConfig &histogram_sweep_config)
	{
	cudaError_t result = cudaErrorNotSupported;
	if (CUB_IS_DEVICE_CODE)
	{
	#if CUB_INCLUDE_DEVICE_CODE
	// We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
	result = histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
	#endif
	}
	else
	{
	#if CUB_INCLUDE_HOST_CODE
	// We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
	if (ptx_version >= 500)
	{
	result = histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
	}
	else if (ptx_version >= 350)
	{
	result = histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
	}
	else if (ptx_version >= 300)
	{
	result = histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
	}
	else if (ptx_version >= 200)
	{
	result = histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
	}
	else
	{
	result = histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
	}
	#endif
	}
	return result;
	}


	/**
	* Kernel kernel dispatch configuration
	*/
	struct KernelConfig
	{
	int block_threads;
	int pixels_per_thread;

	template <typename BlockPolicy>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Init()
	{
	block_threads = BlockPolicy::BLOCK_THREADS;
	pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD;

	return cudaSuccess;
	}
	};


	//---------------------------------------------------------------------
	// Dispatch entrypoints
	//---------------------------------------------------------------------

	/**
	* Privatization-based dispatch routine
	*/
	template <
	typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
	typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
	typename DeviceHistogramInitKernelT, ///< Function type of cub::DeviceHistogramInitKernel
	typename DeviceHistogramSweepKernelT> ///< Function type of cub::DeviceHistogramSweepKernel
	CUB_RUNTIME_FUNCTION __forceinline__
	static cudaError_t PrivatizedDispatch(
	void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
	size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
	SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
	CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
	int num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
	PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel
	int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
	OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel
	int max_num_output_bins, ///< [in] Maximum number of output bins in any channel
	OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
	OffsetT num_rows, ///< [in] The number of rows in the region of interest
	OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
	DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
	DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
	KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
	cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
	{
	#ifndef CUB_RUNTIME_ENABLED

	// Kernel launch not supported from this device
	return CubDebug(cudaErrorNotSupported);

	#else

	cudaError error = cudaSuccess;
	do
	{
	// Get device ordinal
	int device_ordinal;
	if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

	// Get SM count
	int sm_count;
	if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

	// Get SM occupancy for histogram_sweep_kernel
	int histogram_sweep_sm_occupancy;
	if (CubDebug(error = MaxSmOccupancy(
	histogram_sweep_sm_occupancy,
	histogram_sweep_kernel,
	histogram_sweep_config.block_threads))) break;

	// Get device occupancy for histogram_sweep_kernel
	int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;

	if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
	{
	// Treat as a single linear array of samples
	num_row_pixels *= num_rows;
	num_rows = 1;
	row_stride_samples = num_row_pixels * NUM_CHANNELS;
	}

	// Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
	int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
	int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
	int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
	int blocks_per_col = (blocks_per_row > 0) ?
	int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
	0;
	int num_thread_blocks = blocks_per_row * blocks_per_col;

	dim3 sweep_grid_dims;
	sweep_grid_dims.x = (unsigned int) blocks_per_row;
	sweep_grid_dims.y = (unsigned int) blocks_per_col;
	sweep_grid_dims.z = 1;

	// Temporary storage allocation requirements
	const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
	void* allocations[NUM_ALLOCATIONS] = {};
	size_t allocation_sizes[NUM_ALLOCATIONS];

	for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
	allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);

	allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();

	// Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
	if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
	if (d_temp_storage == NULL)
	{
	// Return if the caller is simply requesting the size of the storage allocation
	break;
	}

	// Construct the grid queue descriptor
	GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);

	// Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
	ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
	for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
	d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];

	// Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
	ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
	for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
	d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];

	// Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
	ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
	for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
	privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];

	// Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
	ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
	for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
	output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];

	// Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
	ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
	for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
	num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;

	// Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
	ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
	for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
	num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;

	int histogram_init_block_threads = 256;
	int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;

	// Log DeviceHistogramInitKernel configuration
	if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
	histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);

	// Invoke histogram_init_kernel
	thrust::cuda_cub::launcher::triple_chevron(
	histogram_init_grid_dims, histogram_init_block_threads, 0,
	stream
	).doit(histogram_init_kernel,
	num_output_bins_wrapper,
	d_output_histograms_wrapper,
	tile_queue);

	// Return if empty problem
	if ((blocks_per_row == 0) \|\| (blocks_per_col == 0))
	break;

	// Log histogram_sweep_kernel configuration
	if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
	sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
	histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);

	// Invoke histogram_sweep_kernel
	thrust::cuda_cub::launcher::triple_chevron(
	sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream
	).doit(histogram_sweep_kernel,
	d_samples,
	num_output_bins_wrapper,
	num_privatized_bins_wrapper,
	d_output_histograms_wrapper,
	d_privatized_histograms_wrapper,
	output_decode_op_wrapper,
	privatized_decode_op_wrapper,
	num_row_pixels,
	num_rows,
	row_stride_samples,
	tiles_per_row,
	tile_queue);

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError())) break;

	// Sync the stream if specified to flush runtime errors
	if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

	}
	while (0);

	return error;

	#endif // CUB_RUNTIME_ENABLED
	}



	/**
	* Dispatch routine for HistogramRange, specialized for sample types larger than 8bit
	*/
	CUB_RUNTIME_FUNCTION
	static cudaError_t DispatchRange(
	void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
	size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
	SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
	CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
	int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
	LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
	OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
	OffsetT num_rows, ///< [in] The number of rows in the region of interest
	OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
	cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
	Int2Type<false> /is_byte_sample/) ///< [in] Marker type indicating whether or not SampleT is a 8b type
	{
	cudaError error = cudaSuccess;
	do
	{
	// Get PTX version
	int ptx_version = 0;
	if (CubDebug(error = PtxVersion(ptx_version))) break;

	// Get kernel dispatch configurations
	KernelConfig histogram_sweep_config;
	if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
	break;

	// Use the search transform op for converting samples to privatized bins
	typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;

	// Use the pass-thru transform op for converting privatized bins to output bins
	typedef PassThruTransform OutputDecodeOpT;

	PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
	OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
	int max_levels = num_output_levels[0];

	for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
	{
	privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
	if (num_output_levels[channel] > max_levels)
	max_levels = num_output_levels[channel];
	}
	int max_num_output_bins = max_levels - 1;

	// Dispatch
	if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
	{
	// Too many bins to keep in shared memory.
	const int PRIVATIZED_SMEM_BINS = 0;

	if (CubDebug(error = PrivatizedDispatch(
	d_temp_storage,
	temp_storage_bytes,
	d_samples,
	d_output_histograms,
	num_output_levels,
	privatized_decode_op,
	num_output_levels,
	output_decode_op,
	max_num_output_bins,
	num_row_pixels,
	num_rows,
	row_stride_samples,
	DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
	DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
	histogram_sweep_config,
	stream,
	debug_synchronous))) break;
	}
	else
	{
	// Dispatch shared-privatized approach
	const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

	if (CubDebug(error = PrivatizedDispatch(
	d_temp_storage,
	temp_storage_bytes,
	d_samples,
	d_output_histograms,
	num_output_levels,
	privatized_decode_op,
	num_output_levels,
	output_decode_op,
	max_num_output_bins,
	num_row_pixels,
	num_rows,
	row_stride_samples,
	DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
	DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
	histogram_sweep_config,
	stream,
	debug_synchronous))) break;
	}

	} while (0);

	return error;
	}


	/**
	* Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
	*/
	CUB_RUNTIME_FUNCTION
	static cudaError_t DispatchRange(
	void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
	size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
	SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
	CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
	int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
	LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
	OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
	OffsetT num_rows, ///< [in] The number of rows in the region of interest
	OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
	cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
	Int2Type<true> /is_byte_sample/) ///< [in] Marker type indicating whether or not SampleT is a 8b type
	{
	cudaError error = cudaSuccess;
	do
	{
	// Get PTX version
	int ptx_version = 0;
	if (CubDebug(error = PtxVersion(ptx_version))) break;

	// Get kernel dispatch configurations
	KernelConfig histogram_sweep_config;
	if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
	break;

	// Use the pass-thru transform op for converting samples to privatized bins
	typedef PassThruTransform PrivatizedDecodeOpT;

	// Use the search transform op for converting privatized bins to output bins
	typedef SearchTransform<LevelT*> OutputDecodeOpT;

	int num_privatized_levels[NUM_ACTIVE_CHANNELS];
	PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
	OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
	int max_levels = num_output_levels[0]; // Maximum number of levels in any channel

	for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
	{
	num_privatized_levels[channel] = 257;
	output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);

	if (num_output_levels[channel] > max_levels)
	max_levels = num_output_levels[channel];
	}
	int max_num_output_bins = max_levels - 1;

	const int PRIVATIZED_SMEM_BINS = 256;

	if (CubDebug(error = PrivatizedDispatch(
	d_temp_storage,
	temp_storage_bytes,
	d_samples,
	d_output_histograms,
	num_privatized_levels,
	privatized_decode_op,
	num_output_levels,
	output_decode_op,
	max_num_output_bins,
	num_row_pixels,
	num_rows,
	row_stride_samples,
	DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
	DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
	histogram_sweep_config,
	stream,
	debug_synchronous))) break;

	} while (0);

	return error;
	}


	/**
	* Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
	*/
	CUB_RUNTIME_FUNCTION __forceinline__
	static cudaError_t DispatchEven(
	void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
	size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
	SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
	CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
	int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
	LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
	LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
	OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
	OffsetT num_rows, ///< [in] The number of rows in the region of interest
	OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
	cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
	Int2Type<false> /is_byte_sample/) ///< [in] Marker type indicating whether or not SampleT is a 8b type
	{
	cudaError error = cudaSuccess;
	do
	{
	// Get PTX version
	int ptx_version = 0;
	if (CubDebug(error = PtxVersion(ptx_version))) break;

	// Get kernel dispatch configurations
	KernelConfig histogram_sweep_config;
	if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
	break;

	// Use the scale transform op for converting samples to privatized bins
	typedef ScaleTransform PrivatizedDecodeOpT;

	// Use the pass-thru transform op for converting privatized bins to output bins
	typedef PassThruTransform OutputDecodeOpT;

	PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
	OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
	int max_levels = num_output_levels[0];

	for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
	{
	int bins = num_output_levels[channel] - 1;
	LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;

	privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);

	if (num_output_levels[channel] > max_levels)
	max_levels = num_output_levels[channel];
	}
	int max_num_output_bins = max_levels - 1;

	if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
	{
	// Dispatch shared-privatized approach
	const int PRIVATIZED_SMEM_BINS = 0;

	if (CubDebug(error = PrivatizedDispatch(
	d_temp_storage,
	temp_storage_bytes,
	d_samples,
	d_output_histograms,
	num_output_levels,
	privatized_decode_op,
	num_output_levels,
	output_decode_op,
	max_num_output_bins,
	num_row_pixels,
	num_rows,
	row_stride_samples,
	DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
	DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
	histogram_sweep_config,
	stream,
	debug_synchronous))) break;
	}
	else
	{
	// Dispatch shared-privatized approach
	const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

	if (CubDebug(error = PrivatizedDispatch(
	d_temp_storage,
	temp_storage_bytes,
	d_samples,
	d_output_histograms,
	num_output_levels,
	privatized_decode_op,
	num_output_levels,
	output_decode_op,
	max_num_output_bins,
	num_row_pixels,
	num_rows,
	row_stride_samples,
	DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
	DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
	histogram_sweep_config,
	stream,
	debug_synchronous))) break;
	}
	}
	while (0);

	return error;
	}


	/**
	* Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
	*/
	CUB_RUNTIME_FUNCTION __forceinline__
	static cudaError_t DispatchEven(
	void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
	size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
	SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
	CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
	int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
	LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
	LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
	OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
	OffsetT num_rows, ///< [in] The number of rows in the region of interest
	OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
	cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
	Int2Type<true> /is_byte_sample/) ///< [in] Marker type indicating whether or not SampleT is a 8b type
	{
	cudaError error = cudaSuccess;
	do
	{
	// Get PTX version
	int ptx_version = 0;
	if (CubDebug(error = PtxVersion(ptx_version))) break;

	// Get kernel dispatch configurations
	KernelConfig histogram_sweep_config;
	if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
	break;

	// Use the pass-thru transform op for converting samples to privatized bins
	typedef PassThruTransform PrivatizedDecodeOpT;

	// Use the scale transform op for converting privatized bins to output bins
	typedef ScaleTransform OutputDecodeOpT;

	int num_privatized_levels[NUM_ACTIVE_CHANNELS];
	PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
	OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
	int max_levels = num_output_levels[0];

	for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
	{
	num_privatized_levels[channel] = 257;

	int bins = num_output_levels[channel] - 1;
	LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
	output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);

	if (num_output_levels[channel] > max_levels)
	max_levels = num_output_levels[channel];
	}
	int max_num_output_bins = max_levels - 1;

	const int PRIVATIZED_SMEM_BINS = 256;

	if (CubDebug(error = PrivatizedDispatch(
	d_temp_storage,
	temp_storage_bytes,
	d_samples,
	d_output_histograms,
	num_privatized_levels,
	privatized_decode_op,
	num_output_levels,
	output_decode_op,
	max_num_output_bins,
	num_row_pixels,
	num_rows,
	row_stride_samples,
	DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
	DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
	histogram_sweep_config,
	stream,
	debug_synchronous))) break;

	}
	while (0);

	return error;
	}

	};


	} // CUB namespace
	CUB_NS_POSTFIX // Optional outer namespace(s)