/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Separate out the assembly kernels for readability. Eventually this will
// become an ifdef switch on the architecture type.
// If there is no architecture-specific implementation, then always use the
// generic kernels (see the illustrative specialization sketched below).
template <typename WeightType, typename RhsType, typename OutType>
struct ShouldEnableGenericSpMV_4x4 : std::true_type {};
template <typename WeightType, typename RhsType, typename OutType>
struct ShouldEnableGenericSpMM5_4x4 : std::true_type {};
template <typename WeightType, typename RhsType, typename OutType>
struct ShouldEnableGenericSpMV_1x1 : std::true_type {};
template <typename WeightType, typename RhsType, typename OutType>
struct ShouldEnableGenericSpMM5_1x1 : std::true_type {};
template <typename Type>
struct ShouldEnableGenericAdd : std::true_type {};
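
// As a hypothetical sketch (no such specialization exists in this file): an
// architecture-specific build that provides its own fixed16 SpMV_4x4 kernel
// would turn off the generic version for exactly that type combination, e.g.:
//
//   template <>
//   struct ShouldEnableGenericSpMV_4x4<int16_t, int16_t, int16_t>
//       : std::false_type {};
//
// The std::enable_if guards on the functions below then remove the generic
// overload from the overload set for that combination.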

namespace csrblocksparse {
namespace detail {

// The computational routines do NO error checking for speed. It is assumed
// that this has been handled by CSRBlockSparseMatrix.

// Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
// blocked pattern, x is a vector, and b is a vector. Weights are stored for
// this routine by making each 4x4 block contiguous. Blocks are ordered in
// standard row-major format. Column indices are converted to deltas and then
// multiplied by the size of RhsType in bytes (2 for the 16-bit types used by
// the assembly kernels), so that each value can be used directly to offset
// the pointer into the rhs vector.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function. This is automatically taken care of in SparseLinearLayer.
// The bias is reconstructed through horizontal additions, which leads to a
// small speedup by reducing latencies at the end of the loop.
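// For example (illustrative values only): blocks at block-column indices
// {0, 2, 3} in a reduced row start at element columns {0, 8, 12}, giving
// column deltas of {0, 8, 4} elements, stored as {0, 16, 8} bytes for a
// 16-bit RhsType. Likewise a true bias of 1.f would be stored as .25f and
// scaled back by the 4.f factor below.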
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMV_4x4<WeightType, RhsType, OutType>::value>::type
SpMV_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
         const int32_t* nnz_per_row, const RhsType* rhs_ptr,
         const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
         OutType* out_ptr, int64_t assigned_rows,
         int64_t rows /* only used in SpMM variants */,
         int64_t cols /* only used in SpMM variants */, int relu) {
  for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
    float accumulators[4];
    // Undo the division by 4 that happens in the assembly version.
    for (int i = 0; i < 4; ++i)
      accumulators[i] = 4.f * static_cast<float>(*bias_ptr++);

    int reduced_col_count = *nnz_per_row++;
    for (int c = 0; c < reduced_col_count; ++c) {
      int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
      rhs_ptr += col_delta;

      // Multiply this 4x4 block.
      for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
          accumulators[i] += static_cast<float>(*weights_ptr++) *
                             static_cast<float>(rhs_ptr[j]);
        }
      }
    }

    for (int i = 0; i < 4; ++i)
      *out_ptr++ = static_cast<OutType>(relu ? std::max(accumulators[i], 0.f)
                                             : accumulators[i]);
  }
}
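
// A minimal usage sketch, not part of the library: one reduced row (4 output
// rows) holding a single 4x4 identity block at column 0, with float data
// throughout, so TypeOfProduct<float, float>::type is assumed to be float.
// The names and values below are hypothetical.
inline void ExampleSpMV_4x4_Sketch() {
  const float weights[16] = {1.f, 0.f, 0.f, 0.f,   // Block row 0.
                             0.f, 1.f, 0.f, 0.f,   // Block row 1.
                             0.f, 0.f, 1.f, 0.f,   // Block row 2.
                             0.f, 0.f, 0.f, 1.f};  // Block row 3.
  const int16_t col_deltas_bytes[1] = {0};  // The block starts at column 0.
  const int32_t nnz_per_row[1] = {1};       // One 4x4 block in this row.
  const float rhs[4] = {1.f, 2.f, 3.f, 4.f};
  // True bias of 1.f stored pre-scaled by .25f, as SparseLinearLayer does.
  const float bias[4] = {.25f, .25f, .25f, .25f};
  float out[4];
  SpMV_4x4(weights, col_deltas_bytes, nnz_per_row, rhs, bias, out,
           /*assigned_rows=*/1, /*rows=*/4, /*cols=*/4, /*relu=*/0);
  // out now holds {2.f, 3.f, 4.f, 5.f}: identity * rhs plus the unit bias.
}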

// Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
// blocked pattern, x is a fat vector with 5 columns, and b is a vector that is
// broadcast across all 5 columns. Weights are stored for this routine by
// making each 4x4 block contiguous. Blocks are ordered in standard row-major
// format. Column indices are converted to deltas and then multiplied by the
// size of RhsType in bytes, so that each value can be used directly to offset
// the pointer into the rhs vector.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function. This is automatically taken care of in SparseLinearLayer.
// The bias is reconstructed through horizontal additions, which leads to a
// small speedup by reducing latencies at the end of the loop.
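// Layout note (restating the pointer setup below): the 5 rhs columns are
// stored contiguously, column k starting at rhs_ptr + k * cols, and the 5
// output columns likewise start at out_ptr + k * rows; all five column
// pointers advance in lockstep by each column delta.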
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMM5_4x4<WeightType, RhsType, OutType>::value>::type
SpMM5_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
          const int32_t* nnz_per_row, const RhsType* rhs_ptr,
          const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
          OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
          int relu) {
  const RhsType* rhs_ptrs[5];
  for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
  OutType* out_ptrs[5];
  for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;

  for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
    float accumulators[4][5];
    // Undo the division by 4 that happens in the assembly version.
    for (int i = 0; i < 4; ++i) {
      for (int k = 0; k < 5; ++k) {
        accumulators[i][k] = 4.f * static_cast<float>(*bias_ptr);
      }
      ++bias_ptr;
    }

    int reduced_col_count = *nnz_per_row++;
    for (int c = 0; c < reduced_col_count; ++c) {
      int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
      for (int k = 0; k < 5; ++k) rhs_ptrs[k] += col_delta;

      // Multiply this 4x4 block against all 5 rhs columns.
      for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
          for (int k = 0; k < 5; ++k) {
            accumulators[i][k] += static_cast<float>(*weights_ptr) *
                                  static_cast<float>(rhs_ptrs[k][j]);
          }
          weights_ptr++;
        }
      }
    }

    for (int k = 0; k < 5; ++k) {
      for (int i = 0; i < 4; ++i) {
        out_ptrs[k][0] = static_cast<OutType>(
            relu ? std::max(accumulators[i][k], 0.f) : accumulators[i][k]);
        out_ptrs[k]++;
      }
    }
  }
}

// Performs the calculation y = A * x + b where A is a sparse matrix with a 1x1
// blocked pattern (i.e. unstructured), x is a vector, and b is a vector.
// Weights are stored for this routine in standard CSR format. Each row must
// have a number of non-zero columns that is a multiple of 8.
// Column indices are converted to deltas and then multiplied by the size of
// RhsType in bytes, so that each value can be used directly to offset the
// pointer into the rhs vector.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function. This is automatically taken care of in SparseLinearLayer.
// The bias is reconstructed through horizontal additions, which leads to a
// small speedup by reducing latencies at the end of the loop.
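// For example (illustrative values only): non-zero columns {1, 3, 7} in a row
// become deltas {1, 2, 4} elements, stored as {2, 4, 8} bytes for a 16-bit
// RhsType; the first delta is measured from wherever rhs_ptr points on entry
// to the row.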
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMV_1x1<WeightType, RhsType, OutType>::value>::type
SpMV_1x1(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
         const int32_t* nnz_per_row, const RhsType* rhs_ptr,
         const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
         OutType* out_ptr, int64_t assigned_rows,
         int64_t rows /* only used in SpMM variants */,
         int64_t cols /* only used in SpMM variants */, int relu) {
  for (int row = 0; row < assigned_rows; ++row) {
    // Undo the division by 4 that happens in the assembly version.
    float accumulator = 4.f * static_cast<float>(*bias_ptr++);

    int col_count = *nnz_per_row++;
    for (int c = 0; c < col_count; ++c) {
      int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
      rhs_ptr += col_delta;
      accumulator +=
          static_cast<float>(*weights_ptr++) * static_cast<float>(*rhs_ptr);
    }

    *out_ptr++ =
        static_cast<OutType>(relu ? std::max(accumulator, 0.f) : accumulator);
  }
}
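
// A minimal usage sketch, not part of the library, with hypothetical names
// and float data (so TypeOfProduct<float, float>::type is assumed to be
// float). For brevity this ignores the multiple-of-8 padding requirement
// noted above, which the generic loop itself does not depend on.
inline void ExampleSpMV_1x1_Sketch() {
  const float weights[2] = {2.f, 3.f};         // Non-zeros at columns 0 and 2.
  const int16_t col_deltas_bytes[2] = {0, 8};  // Deltas {0, 2} * 4 bytes.
  const int32_t nnz_per_row[1] = {2};
  const float rhs[3] = {1.f, 0.f, 5.f};
  const float bias[1] = {.25f};  // True bias of 1.f, pre-scaled by .25f.
  float out[1];
  SpMV_1x1(weights, col_deltas_bytes, nnz_per_row, rhs, bias, out,
           /*assigned_rows=*/1, /*rows=*/1, /*cols=*/3, /*relu=*/0);
  // out[0] == 2 * 1 + 3 * 5 + 1 == 18.f.
}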

// Performs the calculation y = A * x + b where A is a sparse matrix with a 1x1
// blocked pattern (i.e. unstructured), x is a fat vector with 5 columns, and
// b is a vector that is broadcast across all 5 columns.
// Weights are stored for this routine in standard CSR format. Each row must
// have a number of non-zero columns that is a multiple of 8.
// Column indices are converted to deltas and then multiplied by the size of
// RhsType in bytes, so that each value can be used directly to offset the
// pointer into the rhs vector.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function. This is automatically taken care of in SparseLinearLayer.
// The bias is reconstructed through horizontal additions, which leads to a
// small speedup by reducing latencies at the end of the loop.
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMM5_1x1<WeightType, RhsType, OutType>::value>::type
SpMM5_1x1(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
          const int32_t* nnz_per_row, const RhsType* rhs_ptr,
          const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
          OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
          int relu) {
  const RhsType* rhs_ptrs[5];
  for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
  OutType* out_ptrs[5];
  for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;

  for (int row = 0; row < assigned_rows; ++row) {
    // Undo the division by 4 that happens in the assembly version.
    float accumulator[5];
    for (int i = 0; i < 5; ++i)
      accumulator[i] = 4.f * static_cast<float>(*bias_ptr);
    ++bias_ptr;

    int col_count = *nnz_per_row++;
    for (int c = 0; c < col_count; ++c) {
      int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
      for (int i = 0; i < 5; ++i) {
        rhs_ptrs[i] += col_delta;
        accumulator[i] += static_cast<float>(*weights_ptr) *
                          static_cast<float>(rhs_ptrs[i][0]);
      }
      weights_ptr++;
    }

    for (int i = 0; i < 5; ++i) {
      out_ptrs[i][0] = static_cast<OutType>(
          relu ? std::max(accumulator[i], 0.f) : accumulator[i]);
      out_ptrs[i]++;
    }
  }
}
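
// Element-wise sum of two vectors over the index range [start, end):
// result[i] = add1[i] + add2[i], with the addition performed in float so that
// fixed-point Types are converted, added, and converted back.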
template <typename Type>
typename std::enable_if<ShouldEnableGenericAdd<Type>::value>::type SumVectors(
    int start, int end, const Type* add1, const Type* add2, Type* result) {
  LOG_FIRST_N(WARNING, 1) << "SumVectors: using generic kernel!";
  for (int i = start; i < end; ++i) {
    Type sum = static_cast<Type>(static_cast<float>(add1[i]) +
                                 static_cast<float>(add2[i]));
    result[i] = sum;
  }
}

}  // namespace detail
}  // namespace csrblocksparse