/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
/**
 * \file
 * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of data samples residing within device-accessible memory.
 */
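
/*
 * Usage sketch (illustrative, not part of this header): the public
 * cub::DeviceHistogram entry points in device_histogram.cuh forward to the
 * DispatchHistogram::DispatchEven / DispatchRange routines defined below,
 * using the usual two-phase CUB temporary-storage idiom.  Assuming a
 * single-channel array of float samples:
 *
 *     // Phase 1: query temporary storage size (d_temp_storage == NULL)
 *     void   *d_temp_storage    = NULL;
 *     size_t  temp_storage_bytes = 0;
 *     cub::DeviceHistogram::HistogramEven(
 *         d_temp_storage, temp_storage_bytes,
 *         d_samples, d_histogram,
 *         num_levels,                 // e.g., 257 levels -> 256 bins
 *         lower_level, upper_level,
 *         num_samples);
 *
 *     // Phase 2: allocate and run
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *     cub::DeviceHistogram::HistogramEven(
 *         d_temp_storage, temp_storage_bytes,
 *         d_samples, d_histogram,
 *         num_levels, lower_level, upper_level,
 *         num_samples);
 */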
#pragma once

#include <stdio.h>
#include <iterator>
#include <limits>

#include "../../agent/agent_histogram.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../config.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {
/******************************************************************************
 * Histogram kernel entry points
 *****************************************************************************/

/**
 * Histogram initialization kernel entry point
 */
template <
    int                                             NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
    typename                                        CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
    typename                                        OffsetT>                        ///< Signed integer type for global offsets
__global__ void DeviceHistogramInitKernel(
    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>          num_output_bins_wrapper,        ///< Number of output histogram bins per channel
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>    d_output_histograms_wrapper,    ///< Histogram counter data having logical dimensions <tt>CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]]</tt>
    GridQueue<int>                                  tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
{
    if ((threadIdx.x == 0) && (blockIdx.x == 0))
        tile_queue.ResetDrain();

    int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;

    #pragma unroll
    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
    {
        if (output_bin < num_output_bins_wrapper.array[CHANNEL])
            d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
    }
}
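
// Illustrative note (mirroring the launch configuration computed in
// PrivatizedDispatch below, with made-up numbers): the init kernel is
// launched with 256 threads per block and enough blocks to cover the widest
// channel.  For max_num_output_bins = 1000, that is
// (1000 + 256 - 1) / 256 = 4 blocks; grid threads 1000..1023 simply fall
// through the bounds check above without writing anything.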
/**
 * Histogram privatized sweep kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
 */
template <
    typename                                            AgentHistogramPolicyT,          ///< Parameterized AgentHistogramPolicy tuning policy type
    int                                                 PRIVATIZED_SMEM_BINS,           ///< Maximum number of histogram bins per channel (e.g., up to 256)
    int                                                 NUM_CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
    int                                                 NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
    typename                                            SampleIteratorT,                ///< The input iterator type. \iterator.
    typename                                            CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
    typename                                            PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
    typename                                            OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
    typename                                            OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
__global__ void DeviceHistogramSweepKernel(
    SampleIteratorT                                         d_samples,                          ///< Input data to reduce
    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_output_bins_wrapper,            ///< The number of bins per final output histogram
    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_privatized_bins_wrapper,        ///< The number of bins per privatized histogram
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_output_histograms_wrapper,        ///< Reference to final output histograms
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_privatized_histograms_wrapper,    ///< Reference to privatized histograms
    ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>      output_decode_op_wrapper,           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
    ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>  privatized_decode_op_wrapper,       ///< The transform operator for determining privatized counter indices from samples, one for each channel
    OffsetT                                                 num_row_pixels,                     ///< The number of multi-channel pixels per row in the region of interest
    OffsetT                                                 num_rows,                           ///< The number of rows in the region of interest
    OffsetT                                                 row_stride_samples,                 ///< The number of samples between starts of consecutive rows in the region of interest
    int                                                     tiles_per_row,                      ///< Number of image tiles per row
    GridQueue<int>                                          tile_queue)                         ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
{
    // Thread block type for compositing input tiles
    typedef AgentHistogram<
            AgentHistogramPolicyT,
            PRIVATIZED_SMEM_BINS,
            NUM_CHANNELS,
            NUM_ACTIVE_CHANNELS,
            SampleIteratorT,
            CounterT,
            PrivatizedDecodeOpT,
            OutputDecodeOpT,
            OffsetT>
        AgentHistogramT;

    // Shared memory for AgentHistogram
    __shared__ typename AgentHistogramT::TempStorage temp_storage;

    AgentHistogramT agent(
        temp_storage,
        d_samples,
        num_output_bins_wrapper.array,
        num_privatized_bins_wrapper.array,
        d_output_histograms_wrapper.array,
        d_privatized_histograms_wrapper.array,
        output_decode_op_wrapper.array,
        privatized_decode_op_wrapper.array);

    // Initialize counters
    agent.InitBinCounters();

    // Consume input tiles
    agent.ConsumeTiles(
        num_row_pixels,
        num_rows,
        row_stride_samples,
        tiles_per_row,
        tile_queue);

    // Store output to global (if necessary)
    agent.StoreOutput();
}
/******************************************************************************
 * Dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
 */
template <
    int         NUM_CHANNELS,           ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
    int         NUM_ACTIVE_CHANNELS,    ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,        ///< Random-access input iterator type for reading input items \iterator
    typename    CounterT,               ///< Integer type for counting sample occurrences per histogram bin
    typename    LevelT,                 ///< Type for specifying bin level boundaries
    typename    OffsetT>                ///< Signed integer type for global offsets
struct DispatchHistogram
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The sample value type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    enum
    {
        // Maximum number of bins per channel for which we will use a privatized smem strategy
        MAX_PRIVATIZED_SMEM_BINS = 256
    };
    //---------------------------------------------------------------------
    // Transform functors for converting samples to bin-ids
    //---------------------------------------------------------------------

    // Searches for bin given a list of bin-boundary levels
    template <typename LevelIteratorT>
    struct SearchTransform
    {
        LevelIteratorT  d_levels;               // Pointer to levels array
        int             num_output_levels;      // Number of levels in array

        // Initializer
        __host__ __device__ __forceinline__ void Init(
            LevelIteratorT  d_levels,           // Pointer to levels array
            int             num_output_levels)  // Number of levels in array
        {
            this->d_levels          = d_levels;
            this->num_output_levels = num_output_levels;
        }

        // Method for converting samples to bin-ids
        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
        {
            /// Level iterator wrapper type
            typedef typename If<IsPointer<LevelIteratorT>::VALUE,
                    CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
                    LevelIteratorT>::Type                                           // Directly use the supplied input iterator type
                WrappedLevelIteratorT;

            WrappedLevelIteratorT wrapped_levels(d_levels);

            int num_bins = num_output_levels - 1;
            if (valid)
            {
                bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
                if (bin >= num_bins)
                    bin = -1;
            }
        }
    };
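
    // Worked example (illustrative, not executed here): with
    // d_levels = {0, 5, 10, 15} (num_output_levels = 4, so 3 bins) and
    // sample = 7, UpperBound returns 2 (the index of the first level greater
    // than 7), so bin = 2 - 1 = 1, i.e. the half-open range [5, 10).  A
    // sample of 20 yields bin = 4 - 1 = 3 >= num_bins, which is flagged as
    // -1 (out of range).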
    // Scales samples to evenly-spaced bins
    struct ScaleTransform
    {
        int    num_bins;    // Number of bins (one less than the number of levels)
        LevelT max;         // Max sample level (exclusive)
        LevelT min;         // Min sample level (inclusive)
        LevelT scale;       // Bin scaling factor

        // Initializer
        template <typename _LevelT>
        __host__ __device__ __forceinline__ void Init(
            int     num_output_levels,  // Number of levels in array
            _LevelT max,                // Max sample level (exclusive)
            _LevelT min,                // Min sample level (inclusive)
            _LevelT scale)              // Bin scaling factor
        {
            this->num_bins = num_output_levels - 1;
            this->max = max;
            this->min = min;
            this->scale = scale;
        }

        // Initializer (float specialization)
        __host__ __device__ __forceinline__ void Init(
            int    num_output_levels,   // Number of levels in array
            float  max,                 // Max sample level (exclusive)
            float  min,                 // Min sample level (inclusive)
            float  scale)               // Bin scaling factor
        {
            this->num_bins = num_output_levels - 1;
            this->max = max;
            this->min = min;
            this->scale = float(1.0) / scale;   // Store the reciprocal so BinSelect can multiply instead of divide
        }

        // Initializer (double specialization)
        __host__ __device__ __forceinline__ void Init(
            int     num_output_levels,  // Number of levels in array
            double  max,                // Max sample level (exclusive)
            double  min,                // Min sample level (inclusive)
            double  scale)              // Bin scaling factor
        {
            this->num_bins = num_output_levels - 1;
            this->max = max;
            this->min = min;
            this->scale = double(1.0) / scale;  // Store the reciprocal so BinSelect can multiply instead of divide
        }

        // Method for converting samples to bin-ids
        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
        {
            LevelT level_sample = (LevelT) sample;

            if (valid && (level_sample >= min) && (level_sample < max))
                bin = (int) ((level_sample - min) / scale);
        }

        // Method for converting samples to bin-ids (float specialization)
        template <CacheLoadModifier LOAD_MODIFIER>
        __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
        {
            LevelT level_sample = (LevelT) sample;

            if (valid && (level_sample >= min) && (level_sample < max))
                bin = (int) ((level_sample - min) * scale);
        }

        // Method for converting samples to bin-ids (double specialization)
        template <CacheLoadModifier LOAD_MODIFIER>
        __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
        {
            LevelT level_sample = (LevelT) sample;

            if (valid && (level_sample >= min) && (level_sample < max))
                bin = (int) ((level_sample - min) * scale);
        }
    };
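
    // Worked example (illustrative): for min = 0.0f, max = 256.0f, and
    // num_output_levels = 9, there are 8 bins and the caller passes
    // scale = (256 - 0) / 8 = 32.  The float Init above stores
    // 1 / 32 = 0.03125, so a sample of 100.0f maps to bin
    // (int)((100 - 0) * 0.03125) = 3, i.e. the half-open range [96, 128).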
    // Pass-through bin transform operator
    struct PassThruTransform
    {
        // Method for converting samples to bin-ids
        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
        {
            if (valid)
                bin = (int) sample;
        }
    };
    //---------------------------------------------------------------------
    // Tuning policies
    //---------------------------------------------------------------------

    template <int NOMINAL_ITEMS_PER_THREAD>
    struct TScale
    {
        enum
        {
            V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
            VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
        };
    };
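
    // Worked example (illustrative): for 4-byte samples (e.g., SampleT = float),
    // V_SCALE = (4 + 4 - 1) / 4 = 1.  With NOMINAL_ITEMS_PER_THREAD = 8 and
    // NUM_ACTIVE_CHANNELS = 4 (e.g., RGBA), TScale<8>::VALUE = max(8/4/1, 1) = 2
    // pixels per thread; for 8-byte samples, V_SCALE = 2 and VALUE drops to 1.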
    /// SM11
    struct Policy110
    {
        // HistogramSweepPolicy
        typedef AgentHistogramPolicy<
                512,
                (NUM_CHANNELS == 1) ? 8 : 2,
                BLOCK_LOAD_DIRECT,
                LOAD_DEFAULT,
                true,
                GMEM,
                false>
            HistogramSweepPolicy;
    };

    /// SM20
    struct Policy200
    {
        // HistogramSweepPolicy
        typedef AgentHistogramPolicy<
                (NUM_CHANNELS == 1) ? 256 : 128,
                (NUM_CHANNELS == 1) ? 8 : 3,
                (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                true,
                SMEM,
                false>
            HistogramSweepPolicy;
    };

    /// SM30
    struct Policy300
    {
        // HistogramSweepPolicy
        typedef AgentHistogramPolicy<
                512,
                (NUM_CHANNELS == 1) ? 8 : 2,
                BLOCK_LOAD_DIRECT,
                LOAD_DEFAULT,
                true,
                GMEM,
                false>
            HistogramSweepPolicy;
    };

    /// SM35
    struct Policy350
    {
        // HistogramSweepPolicy
        typedef AgentHistogramPolicy<
                128,
                TScale<8>::VALUE,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                true,
                BLEND,
                true>
            HistogramSweepPolicy;
    };

    /// SM50
    struct Policy500
    {
        // HistogramSweepPolicy
        typedef AgentHistogramPolicy<
                384,
                TScale<16>::VALUE,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                true,
                SMEM,
                false>
            HistogramSweepPolicy;
    };
    //---------------------------------------------------------------------
    // Tuning policies of current PTX compiler pass
    //---------------------------------------------------------------------

#if (CUB_PTX_ARCH >= 500)
    typedef Policy500 PtxPolicy;

#elif (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#else
    typedef Policy110 PtxPolicy;

#endif

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
    //---------------------------------------------------------------------
    // Utilities
    //---------------------------------------------------------------------

    /**
     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
     */
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t InitConfigs(
        int             ptx_version,
        KernelConfig    &histogram_sweep_config)
    {
        cudaError_t result = cudaErrorNotSupported;
        if (CUB_IS_DEVICE_CODE)
        {
#if CUB_INCLUDE_DEVICE_CODE
            // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
            result = histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
#endif
        }
        else
        {
#if CUB_INCLUDE_HOST_CODE
            // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
            if (ptx_version >= 500)
            {
                result = histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
            }
            else if (ptx_version >= 350)
            {
                result = histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
            }
            else if (ptx_version >= 300)
            {
                result = histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
            }
            else if (ptx_version >= 200)
            {
                result = histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
            }
            else
            {
                result = histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
            }
#endif
        }
        return result;
    }
    /**
     * Kernel dispatch configuration
     */
    struct KernelConfig
    {
        int block_threads;
        int pixels_per_thread;

        template <typename BlockPolicy>
        CUB_RUNTIME_FUNCTION __forceinline__
        cudaError_t Init()
        {
            block_threads     = BlockPolicy::BLOCK_THREADS;
            pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD;

            return cudaSuccess;
        }
    };
    //---------------------------------------------------------------------
    // Dispatch entrypoints
    //---------------------------------------------------------------------

    /**
     * Privatization-based dispatch routine
     */
    template <
        typename    PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
        typename    OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
        typename    DeviceHistogramInitKernelT,     ///< Function type of cub::DeviceHistogramInitKernel
        typename    DeviceHistogramSweepKernelT>    ///< Function type of cub::DeviceHistogramSweepKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t PrivatizedDispatch(
        void*                       d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT             d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
        CounterT*                   d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                         num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating privatized histogram samples in each active channel.  Implies that the number of privatized bins for channel<sub><em>i</em></sub> is <tt>num_privatized_levels[i]</tt> - 1.
        PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS],  ///< [in] Transform operators for determining privatized counter indices from samples, one for each channel
        int                         num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Transform operators for determining output bin-ids from privatized counter indices, one for each channel
        int                         max_num_output_bins,                        ///< [in] Maximum number of output bins in any channel
        OffsetT                     num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT                     num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT                     row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        DeviceHistogramInitKernelT  histogram_init_kernel,                      ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
        DeviceHistogramSweepKernelT histogram_sweep_kernel,                     ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
        KernelConfig                histogram_sweep_config,                     ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
        cudaStream_t                stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous)                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
#ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Get SM occupancy for histogram_sweep_kernel
            int histogram_sweep_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                histogram_sweep_sm_occupancy,
                histogram_sweep_kernel,
                histogram_sweep_config.block_threads))) break;

            // Get device occupancy for histogram_sweep_kernel
            int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;

            if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
            {
                // Treat as a single linear array of samples
                num_row_pixels      *= num_rows;
                num_rows            = 1;
                row_stride_samples  = num_row_pixels * NUM_CHANNELS;
            }

            // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
            int pixels_per_tile     = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
            int tiles_per_row       = int((num_row_pixels + pixels_per_tile - 1) / pixels_per_tile);
            int blocks_per_row      = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
            int blocks_per_col      = (blocks_per_row > 0) ?
                                        int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
                                        0;
            int num_thread_blocks   = blocks_per_row * blocks_per_col;

            dim3 sweep_grid_dims;
            sweep_grid_dims.x = (unsigned int) blocks_per_row;
            sweep_grid_dims.y = (unsigned int) blocks_per_col;
            sweep_grid_dims.z = 1;

            // Temporary storage allocation requirements
            const int   NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
            void*       allocations[NUM_ALLOCATIONS] = {};
            size_t      allocation_sizes[NUM_ALLOCATIONS];

            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);

            allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
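
            // Illustrative sizing (made-up numbers): for 2 active channels with
            // 257 privatized levels each (256 bins) and num_thread_blocks = 64,
            // each of the first two allocations holds
            // 64 * 256 * sizeof(CounterT) bytes of per-block privatized
            // counters, followed by one GridQueue<int> descriptor allocation.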
            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }
            // Construct the grid queue descriptor
            GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);

            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];

            // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];

            // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
            ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];

            // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
            ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];

            // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;

            // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;

            int histogram_init_block_threads    = 256;
            int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;

            // Log DeviceHistogramInitKernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
                histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);

            // Invoke histogram_init_kernel
            thrust::cuda_cub::launcher::triple_chevron(
                histogram_init_grid_dims, histogram_init_block_threads, 0,
                stream
            ).doit(histogram_init_kernel,
                num_output_bins_wrapper,
                d_output_histograms_wrapper,
                tile_queue);

            // Return if empty problem
            if ((blocks_per_row == 0) || (blocks_per_col == 0))
                break;

            // Log histogram_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
                sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
                histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);

            // Invoke histogram_sweep_kernel
            thrust::cuda_cub::launcher::triple_chevron(
                sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream
            ).doit(histogram_sweep_kernel,
                d_samples,
                num_output_bins_wrapper,
                num_privatized_bins_wrapper,
                d_output_histograms_wrapper,
                d_privatized_histograms_wrapper,
                output_decode_op_wrapper,
                privatized_decode_op_wrapper,
                num_row_pixels,
                num_rows,
                row_stride_samples,
                tiles_per_row,
                tile_queue);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }
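
    /*
     * Usage sketch (illustrative, caller-side pseudocode; DispatchX is a
     * placeholder name): like all CUB device-wide algorithms, the dispatch
     * routines here are called twice -- once with d_temp_storage == NULL to
     * size the scratch blob, then again to do the work:
     *
     *     size_t temp_storage_bytes = 0;
     *     DispatchX(NULL, temp_storage_bytes, ...);             // writes required size
     *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *     DispatchX(d_temp_storage, temp_storage_bytes, ...);   // launches kernels
     *
     * The DispatchRange / DispatchEven entry points below inherit this
     * convention via the d_temp_storage == NULL early-out above.
     */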
    /**
     * Dispatch routine for HistogramRange, specialized for sample types larger than 8-bit
     */
    CUB_RUNTIME_FUNCTION
    static cudaError_t DispatchRange(
        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples.  The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<false>     /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version = 0;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the search transform op for converting samples to privatized bins
            typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;

            // Use the pass-thru transform op for converting privatized bins to output bins
            typedef PassThruTransform OutputDecodeOpT;

            PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT     output_decode_op[NUM_ACTIVE_CHANNELS];
            int                 max_levels = num_output_levels[0];

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            // Dispatch
            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
            {
                // Too many bins to keep in shared memory.  Dispatch global-privatized approach.
                const int PRIVATIZED_SMEM_BINS = 0;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
            else
            {
                // Dispatch shared-privatized approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }

        } while (0);

        return error;
    }
    /**
     * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
     */
    CUB_RUNTIME_FUNCTION
    static cudaError_t DispatchRange(
        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples.  The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<true>      /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version = 0;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the pass-thru transform op for converting samples to privatized bins
            typedef PassThruTransform PrivatizedDecodeOpT;

            // Use the search transform op for converting privatized bins to output bins
            typedef SearchTransform<LevelT*> OutputDecodeOpT;

            int                 num_privatized_levels[NUM_ACTIVE_CHANNELS];
            PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT     output_decode_op[NUM_ACTIVE_CHANNELS];
            int                 max_levels = num_output_levels[0];              // Maximum number of levels in any channel

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                num_privatized_levels[channel] = 257;
                output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);

                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            const int PRIVATIZED_SMEM_BINS = 256;

            if (CubDebug(error = PrivatizedDispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_samples,
                d_output_histograms,
                num_privatized_levels,
                privatized_decode_op,
                num_output_levels,
                output_decode_op,
                max_num_output_bins,
                num_row_pixels,
                num_rows,
                row_stride_samples,
                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                histogram_sweep_config,
                stream,
                debug_synchronous))) break;

        } while (0);

        return error;
    }
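
    // Illustrative note: in the 8-bit path above, every sample value is its
    // own privatized bin (PassThruTransform), so the sweep accumulates into
    // 256 shared-memory counters per channel regardless of the user's levels;
    // the SearchTransform output op then folds each privatized bin into the
    // user-specified output bin during aggregation.  For example, with
    // d_levels = {0, 64, 128, 256}, privatized bins 0..63 fold into output
    // bin 0, 64..127 into bin 1, and 128..255 into bin 2.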
    /**
     * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t DispatchEven(
        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<false>     /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version = 0;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the scale transform op for converting samples to privatized bins
            typedef ScaleTransform PrivatizedDecodeOpT;

            // Use the pass-thru transform op for converting privatized bins to output bins
            typedef PassThruTransform OutputDecodeOpT;

            PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT     output_decode_op[NUM_ACTIVE_CHANNELS];
            int                 max_levels = num_output_levels[0];

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                int     bins    = num_output_levels[channel] - 1;
                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;

                privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);

                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
            {
                // Too many bins to keep in shared memory.  Dispatch global-privatized approach.
                const int PRIVATIZED_SMEM_BINS = 0;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
            else
            {
                // Dispatch shared-privatized approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,
                    d_output_histograms,
                    num_output_levels,
                    privatized_decode_op,
                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
        }
        while (0);

        return error;
    }
    /**
     * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t DispatchEven(
        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<true>      /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version = 0;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the pass-thru transform op for converting samples to privatized bins
            typedef PassThruTransform PrivatizedDecodeOpT;

            // Use the scale transform op for converting privatized bins to output bins
            typedef ScaleTransform OutputDecodeOpT;

            int                 num_privatized_levels[NUM_ACTIVE_CHANNELS];
            PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT     output_decode_op[NUM_ACTIVE_CHANNELS];
            int                 max_levels = num_output_levels[0];

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                num_privatized_levels[channel] = 257;

                int     bins    = num_output_levels[channel] - 1;
                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
                output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);

                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            const int PRIVATIZED_SMEM_BINS = 256;

            if (CubDebug(error = PrivatizedDispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_samples,
                d_output_histograms,
                num_privatized_levels,
                privatized_decode_op,
                num_output_levels,
                output_decode_op,
                max_num_output_bins,
                num_row_pixels,
                num_rows,
                row_stride_samples,
                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                histogram_sweep_config,
                stream,
                debug_synchronous))) break;
        }
        while (0);

        return error;
    }
};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)