/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
 */
#pragma once

#include "specializations/block_reduce_raking.cuh"
#include "specializations/block_reduce_raking_commutative_only.cuh"
#include "specializations/block_reduce_warp_reductions.cuh"
#include "../config.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../thread/thread_operators.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {
/******************************************************************************
 * Algorithmic variants
 ******************************************************************************/

/**
 * BlockReduceAlgorithm enumerates alternative algorithms for parallel
 * reduction across a CUDA thread block.
 */
enum BlockReduceAlgorithm
{
    /**
     * \par Overview
     * An efficient "raking" reduction algorithm that only supports commutative
     * reduction operators (true for most operations, e.g., addition).
     *
     * \par
     * Execution consists of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Threads in warps other than the first warp place
     *    their partial reductions into shared memory.
     * -# Upsweep sequential reduction in shared memory.  Threads within the first
     *    warp continue to accumulate by raking across segments of shared partial reductions.
     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
     *
     * \par
     * \image html block_reduce.png
     * <div class="centercaption">\p BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - This variant performs less communication than BLOCK_REDUCE_RAKING
     *   and is preferable when the reduction operator is commutative.  This variant
     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
     *   higher than that of BLOCK_REDUCE_WARP_REDUCTIONS and thus less desirable
     *   when the GPU is under-occupied.
     */
    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
    /**
     * \par Overview
     * An efficient "raking" reduction algorithm that supports commutative
     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
     * operators. \blocked.
     *
     * \par
     * Execution consists of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Each thread then places the partial reduction
     *    of its item(s) into shared memory.
     * -# Upsweep sequential reduction in shared memory.  Threads within a
     *    single warp rake across segments of shared partial reductions.
     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
     *
     * \par
     * \image html block_reduce.png
     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - This variant performs more communication than BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
     *   and is only preferable when the reduction operator is non-commutative.  This variant
     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
     *   higher than that of BLOCK_REDUCE_WARP_REDUCTIONS and thus less desirable
     *   when the GPU is under-occupied.
     */
    BLOCK_REDUCE_RAKING,
    /**
     * \par Overview
     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
     * operators.
     *
     * \par
     * Execution consists of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Each thread then places the partial reduction
     *    of its item(s) into shared memory.
     * -# A shallow, but inefficient warp-synchronous Kogge-Stone style
     *    reduction within each warp.
     * -# A propagation phase in which the warp reduction outputs in each warp are
     *    updated with the aggregate from each preceding warp.
     *
     * \par
     * \image html block_scan_warpscans.png
     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread warps.</div>
     *
     * \par Performance Considerations
     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
     *   or BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, which may result in lower overall
     *   throughput across the GPU.  However, turn-around latency may be lower and
     *   thus useful when the GPU is under-occupied.
     */
    BLOCK_REDUCE_WARP_REDUCTIONS,
};
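
// A minimal selection sketch (illustrative kernel fragment only; thread_data is
// assumed to be defined by the surrounding kernel): the variant is chosen at
// compile time via the third template parameter of cub::BlockReduce, e.g., when
// the reduction operator is known to be commutative:
//
//     typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduceT;
//     __shared__ typename BlockReduceT::TempStorage temp_storage;
//     int block_sum = BlockReduceT(temp_storage).Sum(thread_data);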
/******************************************************************************
 * Block reduce
 ******************************************************************************/

/**
 * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
 * \ingroup BlockModule
 *
 * \tparam T            Data type being reduced
 * \tparam BLOCK_DIM_X  The thread block length in threads along the X dimension
 * \tparam ALGORITHM    <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
 * \tparam BLOCK_DIM_Y  <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
 * \tparam BLOCK_DIM_Z  <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
 * \tparam PTX_ARCH     <b>[optional]</b> \ptxversion
 *
 * \par Overview
 * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
 *   uses a binary combining operator to compute a single aggregate from a list of input elements.
 * - \rowmajor
 * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
 *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators.  [More...](\ref cub::BlockReduceAlgorithm)
 *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators.  [More...](\ref cub::BlockReduceAlgorithm)
 *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators.  [More...](\ref cub::BlockReduceAlgorithm)
 *
 * \par Performance Considerations
 * - \granularity
 * - Very efficient (only one synchronization barrier).
 * - Incurs zero bank conflicts for most types.
 * - Computation is slightly more efficient (i.e., has lower instruction overhead) for:
 *   - Summation (<b><em>vs.</em></b> generic reduction)
 *   - \p BLOCK_THREADS being a multiple of the architecture's warp size
 *   - Every thread having a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
 * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
 *
 * \par A Simple Example
 * \blockcollective{BlockReduce}
 * \par
 * The code snippet below illustrates a sum reduction of 512 integer items that
 * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
 * where each thread owns 4 consecutive items.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockReduce for a 1D block of 128 threads on type int
 *     typedef cub::BlockReduce<int, 128> BlockReduce;
 *
 *     // Allocate shared memory for BlockReduce
 *     __shared__ typename BlockReduce::TempStorage temp_storage;
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     ...
 *
 *     // Compute the block-wide sum for thread0
 *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
 * }
 * \endcode
 *
 */
template <
    typename                T,
    int                     BLOCK_DIM_X,
    BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
    int                     BLOCK_DIM_Y     = 1,
    int                     BLOCK_DIM_Z     = 1,
    int                     PTX_ARCH        = CUB_PTX_ARCH>
class BlockReduce
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>          WarpReductions;
    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>   RakingCommutativeOnly;
    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                  Raking;

    /// Internal specialization type (compile-time selection among the algorithmic variants, defaulting to raking)
    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
        WarpReductions,
        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
            RakingCommutativeOnly,
            Raking>::Type>::Type InternalBlockReduce;

    /// Shared memory storage layout type for BlockReduce
    typedef typename InternalBlockReduce::TempStorage _TempStorage;
    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
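        // Note: a function-scope __shared__ variable is statically allocated
        // once per thread block, so every thread obtains a reference to the
        // same block-wide shared memory storage here.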
        __shared__ _TempStorage private_storage;
        return private_storage;
    }
    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id
    unsigned int linear_tid;

public:

    /// \smemstorage{BlockReduce}
    struct TempStorage : Uninitialized<_TempStorage> {};
    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockReduce()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockReduce(
        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    //@}  end member group
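
    // A brief usage sketch (hypothetical kernel fragment; thread_data is assumed)
    // contrasting the two constructors: the default constructor allocates its own
    // private shared memory, while the one-argument form lets the caller provide
    // (and potentially reuse) a TempStorage allocation:
    //
    //     typedef cub::BlockReduce<int, 128> BlockReduceT;
    //     __shared__ typename BlockReduceT::TempStorage temp_storage;
    //     int a = BlockReduceT().Sum(thread_data);              // private storage
    //     int b = BlockReduceT(temp_storage).Sum(thread_data);  // caller-provided storage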
    /******************************************************************//**
     * \name Generic reductions
     *********************************************************************/
    //@{
    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a max reduction of 128 integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Each thread obtains an input item
     *     int thread_data;
     *     ...
     *
     *     // Compute the block-wide max for thread0
     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
     * }
     * \endcode
     *
     * \tparam ReductionOp  <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
     */
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,          ///< [in] Calling thread's input
        ReductionOp     reduction_op)   ///< [in] Binary reduction functor
    {
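        // The <true> template argument indicates a full tile (all BLOCK_THREADS
        // inputs are valid), allowing the internal specialization to skip
        // bounds checking.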
        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
    }
    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a max reduction of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Compute the block-wide max for thread0
     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
     * }
     * \endcode
     *
     * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam ReductionOp      <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
     */
    template <
        int             ITEMS_PER_THREAD,
        typename        ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor
    {
        // Reduce each thread's items to a single partial, then reduce the partials block-wide
        T partial = internal::ThreadReduce(inputs, reduction_op);
        return Reduce(partial, reduction_op);
    }
    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(int num_valid, ...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Each thread obtains an input item
     *     int thread_data;
     *     if (threadIdx.x < num_valid) thread_data = ...
     *
     *     // Compute the block-wide max for thread0
     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
     * }
     * \endcode
     *
     * \tparam ReductionOp  <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
     */
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,          ///< [in] Calling thread's input
        ReductionOp     reduction_op,   ///< [in] Binary reduction functor
        int             num_valid)      ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
    {
        // Determine if we can skip bounds checking
        if (num_valid >= BLOCK_THREADS)
        {
            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
        }
        else
        {
            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
        }
    }
    //@}  end member group
    /******************************************************************//**
     * \name Summation reductions
     *********************************************************************/
    //@{
    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a sum reduction of 128 integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Each thread obtains an input item
     *     int thread_data;
     *     ...
     *
     *     // Compute the block-wide sum for thread0
     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
     * }
     * \endcode
     *
     */
    __device__ __forceinline__ T Sum(
        T   input)                      ///< [in] Calling thread's input
    {
        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
    }
    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a sum reduction of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Compute the block-wide sum for thread0
     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
     * }
     * \endcode
     *
     * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     */
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ T Sum(
        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
    {
        // Reduce each thread's items to a single partial, then sum the partials block-wide
        T partial = internal::ThreadReduce(inputs, cub::Sum());
        return Sum(partial);
    }
    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(int num_valid, ...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Each thread obtains an input item (up to num_valid)
     *     int thread_data;
     *     if (threadIdx.x < num_valid)
     *         thread_data = ...
     *
     *     // Compute the block-wide sum for thread0
     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
     * }
     * \endcode
     *
     */
    __device__ __forceinline__ T Sum(
        T   input,                  ///< [in] Calling thread's input
        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
    {
        // Determine if we can skip bounds checking
        if (num_valid >= BLOCK_THREADS)
        {
            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
        }
        else
        {
            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
        }
    }
    //@}  end member group
};

/**
 * \example example_block_reduce.cu
 */
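
// A self-contained usage sketch (hypothetical names ExampleKernel, d_in, and
// d_out; assumes the grid exactly covers the input and omits error checking):
//
//     __global__ void ExampleKernel(const int *d_in, int *d_out)
//     {
//         typedef cub::BlockReduce<int, 128> BlockReduceT;
//         __shared__ typename BlockReduceT::TempStorage temp_storage;
//
//         // Each thread loads one element
//         int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];
//
//         // Block-wide sum; only thread0 holds a defined result
//         int aggregate = BlockReduceT(temp_storage).Sum(thread_data);
//         if (threadIdx.x == 0) d_out[blockIdx.x] = aggregate;
//     }
//
//     // Host side:  ExampleKernel<<<num_blocks, 128>>>(d_in, d_out);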
}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)