/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of DeviceHistogram utilities
******************************************************************************/
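/******************************************************************************
* All DeviceHistogram entrypoints exercised below follow CUB's usual
* two-phase pattern: a first call with d_temp_storage == NULL only writes the
* required temp_storage_bytes, and a second call with the allocated buffer
* does the work. A minimal sketch of the single-channel "even" entrypoint
* (the buffer names, sample type, and level values here are illustrative and
* not part of this test):
*
*   // int num_samples = ...;  float *d_samples = ...;  int *d_histogram = ...;
*   void   *d_temp_storage     = NULL;
*   size_t  temp_storage_bytes = 0;
*   cub::DeviceHistogram::HistogramEven(
*       d_temp_storage, temp_storage_bytes,
*       d_samples, d_histogram,
*       257,            // num_levels (256 bins)
*       0.0f, 256.0f,   // lower_level (inclusive), upper_level (exclusive)
*       num_samples);
*   cudaMalloc(&d_temp_storage, temp_storage_bytes);
*   cub::DeviceHistogram::HistogramEven(
*       d_temp_storage, temp_storage_bytes,
*       d_samples, d_histogram,
*       257, 0.0f, 256.0f, num_samples);
******************************************************************************/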
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <limits>
#include <algorithm>
#include <typeinfo>
#if defined(QUICK_TEST) || defined(QUICKER_TEST)
#include <npp.h>
#endif
#include <cub/util_allocator.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/device/device_histogram.cuh>
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
// Dispatch types
enum Backend
{
CUB, // CUB method
NPP, // NPP method
CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
};
bool g_verbose_input = false;
bool g_verbose = false;
int g_timing_iterations = 0;
int g_repeat = 0;
CachingDeviceAllocator g_allocator(true);
//---------------------------------------------------------------------
// Dispatch to NPP histogram
//---------------------------------------------------------------------
#if defined(QUICK_TEST) || defined(QUICKER_TEST)
/**
* Dispatch to single-channel 8b NPP histo-even
*/
template <typename CounterT, typename LevelT, typename OffsetT>
//CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
Int2Type<1> num_channels,
Int2Type<1> num_active_channels,
Int2Type<NPP> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
unsigned char *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
CounterT *d_histogram[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
int num_levels[1], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT lower_level[1], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[1], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef unsigned char SampleT;
cudaError_t error = cudaSuccess;
NppiSize oSizeROI = {
num_row_pixels,
num_rows
};
if (d_temp_storage_bytes == NULL)
{
int nDeviceBufferSize;
nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, num_levels[0], &nDeviceBufferSize);
temp_storage_bytes = nDeviceBufferSize;
}
else
{
for (int i = 0; i < timing_timing_iterations; ++i)
{
// compute the histogram
nppiHistogramEven_8u_C1R(
d_samples,
row_stride_bytes,
oSizeROI,
d_histogram[0],
num_levels[0],
lower_level[0],
upper_level[0],
(Npp8u*) d_temp_storage);
}
}
return error;
}
/**
* Dispatch to 3/4 8b NPP histo-even
*/
template <typename CounterT, typename LevelT, typename OffsetT>
//CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
Int2Type<4> num_channels,
Int2Type<3> num_active_channels,
Int2Type<NPP> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
unsigned char *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
CounterT *d_histogram[3], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
int num_levels[3], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT lower_level[3], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[3], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef unsigned char SampleT;
cudaError_t error = cudaSuccess;
NppiSize oSizeROI = {
num_row_pixels,
num_rows
};
if (d_temp_storage_bytes == NULL)
{
int nDeviceBufferSize;
nppiHistogramEvenGetBufferSize_8u_AC4R(oSizeROI, num_levels, &nDeviceBufferSize);
temp_storage_bytes = nDeviceBufferSize;
}
else
{
for (int i = 0; i < timing_timing_iterations; ++i)
{
// compute the histogram
nppiHistogramEven_8u_AC4R(
d_samples,
row_stride_bytes,
oSizeROI,
d_histogram,
num_levels,
lower_level,
upper_level,
(Npp8u*) d_temp_storage);
}
}
return error;
}
#endif // #if defined(QUICK_TEST) || defined(QUICKER_TEST)
//---------------------------------------------------------------------
// Dispatch to different DeviceHistogram entrypoints
//---------------------------------------------------------------------
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, int BACKEND>
struct Dispatch;
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS>
struct Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, CUB>
{
/**
* Dispatch to CUB multi histogram-range entrypoint
*/
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
//CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Range(
int timing_timing_iterations,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT *(&d_levels)[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
d_levels,
num_row_pixels,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
return error;
}
/**
* Dispatch to CUB multi histogram-even entrypoint
*/
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
//CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Even(
int timing_timing_iterations,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
lower_level,
upper_level,
num_row_pixels,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
return error;
}
};
template <>
struct Dispatch<1, 1, CUB>
{
/**
* Dispatch to CUB single histogram-range entrypoint
*/
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
//CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Range(
int timing_timing_iterations,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT (&d_levels)[1], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::HistogramRange(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram[0],
num_levels[0],
d_levels[0],
num_row_pixels,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
return error;
}
/**
* Dispatch to CUB single histogram-even entrypoint
*/
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
//CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Even(
int timing_timing_iterations,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::HistogramEven(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram[0],
num_levels[0],
lower_level[0],
upper_level[0],
num_row_pixels,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
return error;
}
};
//---------------------------------------------------------------------
// CUDA nested-parallelism test kernel
//---------------------------------------------------------------------
/**
* Simple wrapper kernel to invoke DeviceHistogram
* /
template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
__global__ void CnpDispatchKernel(
Int2Type<ALGORITHM> algorithm,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t temp_storage_bytes,
SampleT *d_samples,
SampleIteratorT d_sample_itr,
ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
int num_samples,
bool debug_synchronous)
{
#ifndef CUB_CDP
*d_cdp_error = cudaErrorNotSupported;
#else
*d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_samples, 0, debug_synchronous);
*d_temp_storage_bytes = temp_storage_bytes;
#endif
}
/ **
* Dispatch to CDP kernel
* /
template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
cudaError_t Dispatch(
Int2Type<ALGORITHM> algorithm,
Int2Type<true> use_cdp,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleT *d_samples,
SampleIteratorT d_sample_itr,
CounterT *d_histograms[NUM_ACTIVE_CHANNELS],
int num_samples,
cudaStream_t stream,
bool debug_synchronous)
{
// Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
// Invoke kernel to invoke device-side dispatch
CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo_wrapper, num_samples, debug_synchronous);
// Copy out temp_storage_bytes
CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
// Copy out error
cudaError_t retval;
CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
return retval;
}
*/
//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------
// Searches for bin given a list of bin-boundary levels
template <typename LevelT>
struct SearchTransform
{
LevelT *levels; // Pointer to levels array
int num_levels; // Number of levels in array
// Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
template <typename SampleT>
int operator()(SampleT sample)
{
int bin = int(std::upper_bound(levels, levels + num_levels, (LevelT) sample) - levels - 1);
if (bin < 0)
{
// Sample out of range
return num_levels;
}
return bin;
}
};
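// Worked example (illustrative values): with levels = {0, 2, 4, 6} and
// num_levels = 4, a sample of 3 makes upper_bound() point at the 4, so
// bin = 2 - 1 = 1 (the [2,4) bin); a sample of -1 gives bin = -1 and the
// functor returns num_levels to flag it as out of range.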
// Scales samples to evenly-spaced bins
template <typename LevelT>
struct ScaleTransform
{
int num_levels; // Number of levels in array
LevelT max; // Max sample level (exclusive)
LevelT min; // Min sample level (inclusive)
LevelT scale; // Bin scaling factor
void Init(
int num_levels, // Number of levels in array
LevelT max, // Max sample level (exclusive)
LevelT min, // Min sample level (inclusive)
LevelT scale) // Bin scaling factor
{
this->num_levels = num_levels;
this->max = max;
this->min = min;
this->scale = scale;
}
// Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
template <typename SampleT>
int operator()(SampleT sample)
{
if ((sample < min) || (sample >= max))
{
// Sample out of range
return num_levels;
}
return (int) ((((LevelT) sample) - min) / scale);
}
};
// Scales samples to evenly-spaced bins
template <>
struct ScaleTransform<float>
{
int num_levels; // Number of levels in array
float max; // Max sample level (exclusive)
float min; // Min sample level (inclusive)
float scale; // Bin scaling factor
void Init(
int num_levels, // Number of levels in array
float max, // Max sample level (exclusive)
float min, // Min sample level (inclusive)
float scale) // Bin scaling factor
{
this->num_levels = num_levels;
this->max = max;
this->min = min;
this->scale = 1.0f / scale;
}
// Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
template <typename SampleT>
int operator()(SampleT sample)
{
if ((sample < min) || (sample >= max))
{
// Sample out of range
return num_levels;
}
return (int) ((((float) sample) - min) * scale);
}
};
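// Note: the float specialization above stores the reciprocal of the bin width
// so the per-sample division becomes a multiplication. Illustrative numbers:
// with min = 0.0f, max = 1.0f and 257 levels (256 bins) the bin width is
// 1/256, scale is stored as 256.0f, and a sample of 0.5f lands in bin
// (int)(0.5f * 256.0f) = 128.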
/**
* Generate sample
*/
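// Note on entropy_reduction (forwarded to RandomBits in test_util.h): larger
// values AND together more random draws, biasing bits toward zero and samples
// toward the low end of [0, max_level); -1 produces all-zero samples, and 0
// is uniform.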
template <typename T, typename LevelT>
void Sample(T &datum, LevelT max_level, int entropy_reduction)
{
unsigned int max = (unsigned int) -1;
unsigned int bits;
RandomBits(bits, entropy_reduction);
float fraction = (float(bits) / max);
datum = (T) (fraction * max_level);
}
/**
* Initialize histogram samples
*/
template <
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename LevelT,
typename SampleT,
typename OffsetT>
void InitializeSamples(
LevelT max_level,
int entropy_reduction,
SampleT *h_samples,
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
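// Samples are laid out row-major with NUM_CHANNELS interleaved samples per
// pixel and row_stride_bytes between row starts; e.g. (illustrative), for
// 4-channel 8-bit pixels the RGBA samples of pixel p in row r sit at byte
// offsets r*row_stride_bytes + p*4 + {0,1,2,3}.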
// Initialize samples
for (OffsetT row = 0; row < num_rows; ++row)
{
for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
{
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
// Sample offset
OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
// Init sample value
Sample(h_samples[offset], max_level, entropy_reduction);
if (g_verbose_input)
{
if (channel > 0) printf(", ");
std::cout << CoutCast(h_samples[offset]);
}
}
}
}
}
/**
* Initialize histogram solutions
*/
template <
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename CounterT,
typename SampleIteratorT,
typename TransformOp,
typename OffsetT>
void InitializeBins(
SampleIteratorT h_samples,
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
TransformOp transform_op[NUM_ACTIVE_CHANNELS], ///< [in] The per-channel transform functors for converting samples into bin indices.
CounterT *h_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
// Init bins
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
{
for (int bin = 0; bin < num_levels[CHANNEL] - 1; ++bin)
{
h_histogram[CHANNEL][bin] = 0;
}
}
// Initialize samples
if (g_verbose_input) printf("Samples: \n");
for (OffsetT row = 0; row < num_rows; ++row)
{
for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
{
if (g_verbose_input) printf("[");
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
// Sample offset
OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
// Update sample bin
int bin = transform_op[channel](h_samples[offset]);
if (g_verbose_input) { printf(" (%d)", bin); fflush(stdout); }
if ((bin >= 0) && (bin < num_levels[channel] - 1))
{
// valid bin
h_histogram[channel][bin]++;
}
}
if (g_verbose_input) printf("]");
}
if (g_verbose_input) printf("\n\n");
}
}
/**
* Test histogram-even
*/
template <
Backend BACKEND,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT,
typename SampleIteratorT>
void TestEven(
LevelT max_level,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
SampleIteratorT h_samples,
SampleIteratorT d_samples)
{
OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
printf("\n----------------------------\n");
printf("%s cub::DeviceHistogramEven (%s) %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
(BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
(IsPointer<SampleIteratorT>::VALUE) ? "pointer" : "iterator",
(int) (num_row_pixels * num_rows),
(int) num_rows,
(int) num_row_pixels,
(int) row_stride_bytes,
(int) total_samples,
(int) sizeof(SampleT),
typeid(SampleT).name(),
entropy_reduction,
typeid(CounterT).name(),
NUM_ACTIVE_CHANNELS,
NUM_CHANNELS);
std::cout << CoutCast(max_level) << "\n";
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
std::cout << "\n\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins [" << lower_level[channel] << ", " << upper_level[channel] << ")\n";
fflush(stdout);
// Allocate and initialize host and device data
typedef SampleT Foo; // rename type to squelch gcc warnings (bug?)
CounterT* h_histogram[NUM_ACTIVE_CHANNELS];
ScaleTransform<LevelT> transform_op[NUM_ACTIVE_CHANNELS];
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int bins = num_levels[channel] - 1;
h_histogram[channel] = new CounterT[bins];
transform_op[channel].Init(
num_levels[channel],
upper_level[channel],
lower_level[channel],
((upper_level[channel] - lower_level[channel]) / bins));
}
InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
// Allocate and initialize device data
CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
}
// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
cudaError_t *d_cdp_error = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
1, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
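// The temp-storage allocation below is padded with "canary" guard bands on
// both sides: every byte is set to a token value, the dispatch is pointed at
// the middle region only, and the guard bands are compared against the host
// copy afterwards to verify the algorithm never wrote outside the temp
// storage it asked for.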
// Allocate temporary storage with "canary" zones
int canary_bytes = 256;
char canary_token = 8;
char* canary_zone = new char[canary_bytes];
memset(canary_zone, canary_token, canary_bytes);
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
// Run warmup/correctness iteration
Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
1, d_temp_storage_bytes, d_cdp_error,
((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Check canary zones
int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
AssertEquals(0, error);
error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
AssertEquals(0, error);
// Flush any stdout/stderr
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
fflush(stdout);
fflush(stderr);
// Check for correctness (and display results, if specified)
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
}
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, false);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(SampleT);
printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
avg_millis,
giga_rate,
giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
giga_rate / NUM_CHANNELS,
giga_bandwidth);
}
printf("\n\n");
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
if (h_histogram[channel])
delete[] h_histogram[channel];
if (d_histogram[channel])
CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
}
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Correctness asserts
AssertEquals(0, error);
}
/**
* Test histogram-even (native pointer input)
*/
template <
Backend BACKEND,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestEvenNative(
LevelT max_level,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
// Allocate and initialize host sample data
typedef SampleT Foo; // rename type to squelch gcc warnings (bug?)
SampleT* h_samples = new Foo[total_samples];
InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
// Allocate and initialize device data
SampleT* d_samples = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
max_level, entropy_reduction, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
h_samples, d_samples);
// Cleanup
if (h_samples) delete[] h_samples;
if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
}
/**
* Test histogram-even (native pointer input)
*/
template <
Backend BACKEND,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestEvenIterator(
LevelT max_level,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
SampleT sample = (SampleT) lower_level[0];
ConstantInputIterator<SampleT> sample_itr(sample);
TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
max_level, entropy_reduction, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
sample_itr, sample_itr);
}
/**
* Test histogram-range
*/
template <
Backend BACKEND,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestRange(
LevelT max_level,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT* levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
printf("\n----------------------------\n");
printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
(BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
(int) (num_row_pixels * num_rows),
(int) num_rows,
(int) num_row_pixels,
(int) row_stride_bytes,
(int) total_samples,
(int) sizeof(SampleT),
typeid(SampleT).name(),
entropy_reduction,
typeid(CounterT).name(),
NUM_ACTIVE_CHANNELS,
NUM_CHANNELS);
std::cout << CoutCast(max_level) << "\n";
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
printf("Channel %d: %d bins [", channel, num_levels[channel] - 1);
std::cout << levels[channel][0];
for (int level = 1; level < num_levels[channel]; ++level)
std::cout << ", " << levels[channel][level];
printf("]\n");
}
fflush(stdout);
// Allocate and initialize host and device data
typedef SampleT Foo; // rename type to squelch gcc warnings (bug?)
SampleT* h_samples = new Foo[total_samples];
CounterT* h_histogram[NUM_ACTIVE_CHANNELS];
SearchTransform<LevelT> transform_op[NUM_ACTIVE_CHANNELS];
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
transform_op[channel].levels = levels[channel];
transform_op[channel].num_levels = num_levels[channel];
int bins = num_levels[channel] - 1;
h_histogram[channel] = new CounterT[bins];
}
InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
// Allocate and initialize device data
SampleT* d_samples = NULL;
LevelT* d_levels[NUM_ACTIVE_CHANNELS];
CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel], sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));
int bins = num_levels[channel] - 1;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * bins));
CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * bins));
}
// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
cudaError_t *d_cdp_error = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
1, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples,
d_histogram,
num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Allocate temporary storage with "canary" zones
int canary_bytes = 256;
char canary_token = 9;
char* canary_zone = new char[canary_bytes];
memset(canary_zone, canary_token, canary_bytes);
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
// Run warmup/correctness iteration
Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
1, d_temp_storage_bytes, d_cdp_error,
((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
d_samples,
d_histogram,
num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Check canary zones
int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
AssertEquals(0, error);
error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
AssertEquals(0, error);
// Flush any stdout/stderr
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
fflush(stdout);
fflush(stderr);
// Check for correctness (and display results, if specified)
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
}
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
d_samples,
d_histogram,
num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, false);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(SampleT);
printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
avg_millis,
giga_rate,
giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
giga_rate / NUM_CHANNELS,
giga_bandwidth);
}
printf("\n\n");
// Cleanup
if (h_samples) delete[] h_samples;
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
if (h_histogram[channel])
delete[] h_histogram[channel];
if (d_histogram[channel])
CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
if (d_levels[channel])
CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
}
if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Correctness asserts
AssertEquals(0, error);
}
/**
* Test histogram-even
*/
template <
Backend BACKEND,
typename SampleT,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestEven(
OffsetT num_row_pixels,
OffsetT num_rows,
OffsetT row_stride_bytes,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT max_level,
int max_num_levels)
{
LevelT lower_level[NUM_ACTIVE_CHANNELS];
LevelT upper_level[NUM_ACTIVE_CHANNELS];
// Find smallest level increment
int max_bins = max_num_levels - 1;
LevelT min_level_increment = max_level / max_bins;
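// The loop below centers each channel's bin range on the sample range.
// Worked example (illustrative): max_level = 256 and max_num_levels = 257
// give min_level_increment = 1, so a channel with 65 levels (64 bins) gets
// lower_level = (256 - 64) / 2 = 96 and upper_level = (256 + 64) / 2 = 160.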
// Set upper and lower levels for each channel
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int num_bins = num_levels[channel] - 1;
lower_level[channel] = (max_level - (num_bins * min_level_increment)) / 2;
upper_level[channel] = (max_level + (num_bins * min_level_increment)) / 2;
}
// Test pointer-based samples
TestEvenNative<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
// Test iterator-based samples (CUB-only)
TestEvenIterator<CUB, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
}
/**
* Test histogram-range
*/
template <
Backend BACKEND,
typename SampleT,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestRange(
OffsetT num_row_pixels,
OffsetT num_rows,
OffsetT row_stride_bytes,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT max_level,
int max_num_levels)
{
// Find smallest level increment
int max_bins = max_num_levels - 1;
LevelT min_level_increment = max_level / max_bins;
LevelT* levels[NUM_ACTIVE_CHANNELS];
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
levels[channel] = new LevelT[num_levels[channel]];
int num_bins = num_levels[channel] - 1;
LevelT lower_level = (max_level - (num_bins * min_level_increment)) / 2;
for (int level = 0; level < num_levels[channel]; ++level)
levels[channel][level] = lower_level + (level * min_level_increment);
}
TestRange<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
max_level, entropy_reduction, num_levels, levels, num_row_pixels, num_rows, row_stride_bytes);
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
delete[] levels[channel];
}
/**
* Test different entrypoints
*/
template <
typename SampleT,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename CounterT,
typename LevelT,
typename OffsetT>
void Test(
OffsetT num_row_pixels,
OffsetT num_rows,
OffsetT row_stride_bytes,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT max_level,
int max_num_levels)
{
TestEven<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
TestRange<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
}
/**
* Test different number of levels
*/
template <
typename SampleT,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename CounterT,
typename LevelT,
typename OffsetT>
void Test(
OffsetT num_row_pixels,
OffsetT num_rows,
OffsetT row_stride_bytes,
int entropy_reduction,
LevelT max_level,
int max_num_levels)
{
int num_levels[NUM_ACTIVE_CHANNELS];
// Unnecessary testing
// // All the same level
// for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
// {
// num_levels[channel] = max_num_levels;
// }
// Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
// num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
// All different levels
num_levels[0] = max_num_levels;
for (int channel = 1; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
num_levels[channel] = (num_levels[channel - 1] / 2) + 1;
}
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
}
/**
* Test different entropy-levels
*/
template <
typename SampleT,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename CounterT,
typename LevelT,
typename OffsetT>
void Test(
OffsetT num_row_pixels,
OffsetT num_rows,
OffsetT row_stride_bytes,
LevelT max_level,
int max_num_levels)
{
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes, 0, max_level, max_num_levels);
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes, -1, max_level, max_num_levels);
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes, 5, max_level, max_num_levels);
}
/**
* Test different row strides
*/
template <
typename SampleT,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename CounterT,
typename LevelT,
typename OffsetT>
void Test(
OffsetT num_row_pixels,
OffsetT num_rows,
LevelT max_level,
int max_num_levels)
{
OffsetT row_stride_bytes = num_row_pixels * NUM_CHANNELS * sizeof(SampleT);
// No padding
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes, max_level, max_num_levels);
// 13 samples padding
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
num_row_pixels, num_rows, row_stride_bytes + (13 * sizeof(SampleT)), max_level, max_num_levels);
}
/**
* Test different problem sizes
*/
template <
typename SampleT,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename CounterT,
typename LevelT,
typename OffsetT>
void Test(
LevelT max_level,
int max_num_levels)
{
// 0 row/col images
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
OffsetT(1920), OffsetT(0), max_level, max_num_levels);
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
OffsetT(0), OffsetT(0), max_level, max_num_levels);
// 1080 image
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
OffsetT(1920), OffsetT(1080), max_level, max_num_levels);
// Sample different aspect ratios
for (OffsetT rows = 1; rows < 1000000; rows *= 1000)
{
for (OffsetT cols = 1; cols < (1000000 / rows); cols *= 1000)
{
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
cols, rows, max_level, max_num_levels);
}
}
// Randomly select linear problem sizes between 1 and 10,000,000
unsigned int max_int = (unsigned int) -1;
for (int i = 0; i < 4; ++i)
{
unsigned int num_items;
RandomBits(num_items);
num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
num_items = CUB_MAX(1, num_items);
Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
OffsetT(num_items), 1, max_level, max_num_levels);
}
}
/**
* Test different channel interleavings (valid specialization)
*/
template <
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestChannels(
LevelT max_level,
int max_num_levels,
Int2Type<true> /*is_valid_tag*/)
{
Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
Test<SampleT, 3, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
Test<SampleT, 4, 4, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
}
/**
* Test different channel interleavings (invalid specialization)
*/
template <
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestChannels(
LevelT /*max_level*/,
int /*max_num_levels*/,
Int2Type<false> /*is_valid_tag*/)
{}
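// TestChannels is tag-dispatched on Int2Type<bool>: callers pass
// Int2Type<true>() to run the channel-interleaving tests and Int2Type<false>()
// (e.g. when sizeof(size_t) == sizeof(int) in the size_t down-conversion case
// in main()) to compile the call away to a no-op without an #if.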
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
int main(int argc, char** argv)
{
int num_row_pixels = -1;
int entropy_reduction = 0;
int num_rows = 1;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
g_verbose_input = args.CheckCmdLineFlag("v2");
args.GetCmdLineArgument("n", num_row_pixels);
int row_stride_pixels = num_row_pixels;
args.GetCmdLineArgument("rows", num_rows);
args.GetCmdLineArgument("stride", row_stride_pixels);
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("repeat", g_repeat);
args.GetCmdLineArgument("entropy", entropy_reduction);
#if defined(QUICK_TEST) || defined(QUICKER_TEST)
bool compare_npp = args.CheckCmdLineFlag("npp");
#endif
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--n=<pixels per row>] "
"[--rows=<number of rows>] "
"[--stride=<row stride in pixels>] "
"[--i=<timing iterations>] "
"[--device=<device-id>] "
"[--repeat=<repetitions of entire test suite>]"
"[--entropy=<entropy-reduction factor (default 0)>]"
"[--v] "
"[--cdp]"
"[--npp]"
"\n", argv[0]);
exit(0);
}
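// Example invocation (binary name depends on the build):
//   test_device_histogram --n=1920 --rows=1080 --i=100 --entropy=0 --v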
// Initialize device
CubDebugExit(args.DeviceInit());
// Get ptx version
int ptx_version = 0;
CubDebugExit(PtxVersion(ptx_version));
if (num_row_pixels < 0)
{
num_row_pixels = 1920 * 1080;
row_stride_pixels = num_row_pixels;
}
#if defined(QUICKER_TEST)
// Compile/run quick tests
{
// HistogramEven: unsigned char 256 bins
typedef unsigned char SampleT;
typedef int LevelT;
LevelT max_level = 256;
int num_levels[1] = {257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
// The NPP path doesn't compile as of 2020-06:
// No Dispatch<int, int, NPP> specialization defined.
// if (compare_npp)
// TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
{
// HistogramRange: signed char 256 bins
typedef signed char SampleT;
typedef int LevelT;
LevelT max_level = 256;
int num_levels[1] = {257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
TestRange<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
#elif defined(QUICK_TEST)
// Compile/run quick tests
{
// HistogramEven: unsigned char 256 bins
typedef unsigned char SampleT;
typedef int LevelT;
LevelT max_level = 256;
int num_levels[1] = {257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
// The NPP path doesn't compile as of 2020-06:
// No Dispatch<int, int, NPP> specialization defined.
// if (compare_npp)
// TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
{
// HistogramEven: 4/4 multichannel unsigned char 256 bins
typedef unsigned char SampleT;
typedef int LevelT;
LevelT max_level = 256;
int num_levels[4] = {257, 257, 257, 257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4;
TestEven<CUB, SampleT, 4, 4, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
{
// HistogramEven: 3/4 multichannel unsigned char 256 bins
typedef unsigned char SampleT;
typedef int LevelT;
LevelT max_level = 256;
int num_levels[3] = {257, 257, 257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4;
TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
// The NPP path doesn't compile as of 2020-06:
// No Dispatch<int, int, NPP> specialization defined.
// if (compare_npp)
// TestEven<NPP, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
{
// HistogramEven: short [0,1024] 256 bins
typedef unsigned short SampleT;
typedef unsigned short LevelT;
LevelT max_level = 1024;
int num_levels[1] = {257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
{
// HistogramEven: float [0,1.0] 256 bins
typedef float SampleT;
typedef float LevelT;
LevelT max_level = 1.0;
int num_levels[1] = {257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
{
// HistogramEven: 3/4 multichannel float [0,1.0] 256 bins
typedef float SampleT;
typedef float LevelT;
LevelT max_level = 1.0;
int num_levels[3] = {257, 257, 257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4;
TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
{
// HistogramRange: signed char 256 bins
typedef signed char SampleT;
typedef int LevelT;
LevelT max_level = 256;
int num_levels[1] = {257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
TestRange<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
{
// HistogramRange: 3/4 channel, unsigned char, varied bins (256, 128, 64)
typedef unsigned char SampleT;
typedef int LevelT;
LevelT max_level = 256;
int num_levels[3] = {257, 129, 65};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4;
TestRange<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
{
// HistogramEven: double [0,1.0] 64 bins
typedef double SampleT;
typedef double LevelT;
LevelT max_level = 1.0;
int num_levels[1] = {65};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
{
// HistogramEven: short [0,1024] 512 bins
typedef unsigned short SampleT;
typedef unsigned short LevelT;
LevelT max_level = 1024;
int num_levels[1] = {513};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
#else
// Compile/run thorough tests
for (int i = 0; i <= g_repeat; ++i)
{
TestChannels <unsigned char, int, int, int>(256, 256 + 1, Int2Type<true>());
TestChannels <signed char, int, int, int>(256, 256 + 1, Int2Type<true>());
TestChannels <unsigned short, int, int, int>(128, 128 + 1, Int2Type<true>());
TestChannels <unsigned short, int, int, int>(8192, 8192 + 1, Int2Type<true>());
TestChannels <float, int, float, int>(1.0, 256 + 1, Int2Type<true>());
// Test down-conversion of size_t offsets to int
TestChannels <unsigned char, int, int, long long>(256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>());
}
#endif
return 0;
}