/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// NOTE: This include list is reconstructed from the names used below
// (std::unique_ptr, std::vector, std::minstd_rand, absl::make_unique,
// LOG/CHECK/DCHECK_EQ). The project-local headers declaring
// CsrBlockSparseMatrix, CacheAlignedVector, MaskedSparseMatrix, SpinBarrier,
// ProducerConsumer, TypeOfProduct and IsCustomFloatType are also required,
// but their paths are project-specific and omitted here.
#include <cstdint>
#include <cstdio>
#include <memory>
#include <random>
#include <type_traits>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "glog/logging.h"

namespace csrblocksparse {

template <typename WeightType, typename RhsType,
          typename BiasType =
              typename TypeOfProduct<WeightType, RhsType>::type,
          typename DeltaType = int16_t>
class SparseLinearLayer {
 public:
  SparseLinearLayer() {}
  SparseLinearLayer(CsrBlockSparseMatrix<WeightType, RhsType>&& sparse_matrix,
                    CacheAlignedVector<BiasType>&& bias)
      : sparse_matrix_(std::move(sparse_matrix)), full_bias_(std::move(bias)) {
    CHECK_EQ(sparse_matrix_.rows(), full_bias_.size());
    // Some kernels expect that the bias is divided by 4, so we store a second
    // copy of a quarter of the bias.
    // TODO(b/189958858): Remove the quartered bias if it can be done without
    // loss of speed, and rename the |full_bias_| member back to |bias_|.
    bias_ = full_bias_;
    for (int i = 0; i < bias_.size(); ++i) {
      bias_[i] = static_cast<BiasType>(.25f * static_cast<float>(bias_[i]));
    }
  }
  SparseLinearLayer(
      const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
    *this = src;
  }
  SparseLinearLayer& operator=(
      const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
    sparse_matrix_ = src.sparse_matrix_;
    bias_ = src.bias_;
    full_bias_ = src.full_bias_;
    mid_output_ = src.mid_output_;
    thread_layers_ = src.thread_layers_;
    num_threads_ = src.num_threads_;
    if (src.split_pc_) {
      split_pc_ = absl::make_unique<ProducerConsumer>(
          src.split_pc_->num_producers(), src.split_pc_->num_consumers());
    }
    return *this;
  }

  // Does Ax + b, where A is a block-sparse compressed-sparse-row matrix and
  // x is a COLUMN MAJOR dense vector or matrix. The bias is a vector that is
  // broadcast if |rhs| has more than one column.
  template <typename RhsClassType, typename OutType>
  void SpMM_bias(const RhsClassType& rhs, OutType* out, bool relu = false,
                 int tid = 0, SpinBarrier* barrier = nullptr) const {
    static_assert(
        std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
    sparse_matrix_.SpMM_bias(rhs, bias_, out, relu, tid, barrier);
  }
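  //
  // Illustrative usage of SpMM_bias above (comment-only sketch; the float
  // instantiation, the sizes and the use of CreateRandomLayer are assumptions
  // chosen purely for illustration):
  //
  //   auto layer = CreateRandomLayer<float, float>(/*rows=*/256, /*cols=*/64,
  //                                                /*sparsity=*/0.9f);
  //   CacheAlignedVector<float> rhs(layer.cols());
  //   rhs.FillRandom();
  //   CacheAlignedVector<float> out(layer.rows());
  //   layer.SpMM_bias(rhs, &out, /*relu=*/true);  // out = relu(A * rhs + b).
  //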
  // Multiplies a sparse matrix by a possibly dense matrix, as SpMM_bias above,
  // and then samples from the output (softmax distribution) layer.
  template <typename RhsClassType, typename OutType>
  int SpMM_bias_Sample(const RhsClassType& rhs, OutType* out,
                       float temperature, int tid, SpinBarrier* barrier,
                       std::minstd_rand* gen,
                       CacheAlignedVector<float>* scratch) const {
    static_assert(
        std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
    return sparse_matrix_.SpMM_bias_Sample(rhs, bias_, out, temperature, tid,
                                           barrier, gen, scratch);
  }
  template <typename RhsClassType, typename OutType>
  void MatVec(const RhsClassType& rhs, bool relu, int tid, int replicas,
              int output_stride, OutType* output,
              SpinBarrier* barrier = nullptr) {
    static_assert(
        std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
    if (block_width() == 4 && (block_height() == 4 || block_height() == 8) &&
        !IsCustomFloatType<WeightType>::value) {
      if (!IsSplit()) {
        sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu,
                              tid, replicas, output_stride, output->data());
        if (barrier != nullptr) barrier->barrier();
        return;
      }
      // NOTE: Until the quartered bias is removed it is a bad idea to split
      // for ARM in the same way, as we would have to quarter the output of
      // the first part of the split before running the second part.
      // Signal completion of the previous MatVec.
      split_pc_->produce();
      PartLinearLayer& thread_part = thread_layers_[tid];
      auto offset_output =
          sparse_matrix_.thread_bounds().OffsetOutput(output->data(), tid);
      auto mid_output =
          sparse_matrix_.thread_bounds().OffsetOutput(mid_output_.data(), tid);
      auto offset_bias = sparse_matrix_.thread_bounds().OffsetOutput(
          mid_output_.cast_data(), tid);
      // We can continue to consume the data that this thread produced and
      // compute just the |self_matrix| part.
      // No |relu| or |replicas|, as this is only a partial matmul.
      // |tid| is always zero because the matrix has been split by tid.
      thread_part.self_matrix.MatVec(
          rhs.cast_data(), thread_part.full_bias.cast_data(), /*relu=*/false,
          /*tid=*/0, /*replicas=*/1, output_stride, mid_output);
      // We have to wait for the other threads to finish working on the
      // previous MatMul before consuming the rest of |rhs|.
      split_pc_->consume();
      thread_part.other_matrix.MatVec(rhs.cast_data(), offset_bias, relu,
                                      /*tid=*/0, replicas, output_stride,
                                      offset_output);
      return;
    }
    DCHECK_EQ(replicas, 1) << "Must have single replica for SpMM API";
    if (IsSplit()) {
      // Generics aren't set up to use a split matrix. This will be
      // inefficient.
      split_pc_->produce();
      split_pc_->consume();
    }
    if (block_height() == 8) {
      // We are currently forced to use MatVec generics for this case.
      LOG(WARNING)
          << "Need to implement MatVec for 8x4 for non-AVX2 targets!!";
      sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu, tid,
                            replicas, output_stride, output->data());
      if (barrier != nullptr) barrier->barrier();
    } else {
      sparse_matrix_.SpMM_bias(rhs, bias_, output, relu, tid, barrier);
    }
  }
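  //
  // Illustrative threaded call sketch for MatVec above (comment only;
  // |kNumThreads|, the worker loop, and the reading of |output_stride| as the
  // element stride between replicated output copies are assumptions, not
  // defined by this file):
  //
  //   layer.PrepareForThreads(kNumThreads);
  //   // Then, inside worker thread |tid|:
  //   layer.MatVec(rhs, /*relu=*/true, tid, /*replicas=*/1,
  //                /*output_stride=*/0, &out, &barrier);
  //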
  int rows() const { return sparse_matrix_.rows(); }
  int cols() const { return sparse_matrix_.cols(); }
  float sparsity() const { return sparse_matrix_.sparsity(); }
  int block_width() const { return sparse_matrix_.block_width(); }
  int block_height() const { return sparse_matrix_.block_height(); }
  int num_threads() const { return sparse_matrix_.num_threads(); }
  const CacheAlignedVector<BiasType>& bias() const { return bias_; }
  const std::vector<int>& split_points() const {
    return sparse_matrix_.split_points();
  }
  bool IsSplit() const {
    return !thread_layers_.empty() && split_pc_ != nullptr;
  }
  std::size_t bytes() const { return sparse_matrix_.bytes() + bias_.bytes(); }
  void Print() const {
    printf("Matrix\n");
    sparse_matrix_.Print();
    printf("Bias\n");
    bias_.Print();
  }

  // Combines adjacent row blocks, doubling the block height.
  // This necessarily involves adding zero weights where the blocks don't
  // align across adjacent pairs of rows, so use with caution, as the
  // resulting matrix is most likely to run slower if very sparse to begin
  // with. In the few cases where the blocks do mostly align, the resulting
  // matmul could be much faster, as the number of reads of the rhs will be
  // halved.
  void DoubleBlockHeight() { sparse_matrix_.DoubleBlockHeight(); }
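  //
  // Illustration of the trade-off (hypothetical block layout, not taken from
  // this file): with 4x4 blocks, suppose rows 0-3 have a block in block-column
  // 2 and rows 4-7 have one in block-column 5. After DoubleBlockHeight() the
  // merged 8-row group needs 8x4 blocks at BOTH columns 2 and 5, half of whose
  // weights are explicit zeros. Only where the two row groups already share
  // block-columns is there no fill-in, and each shared rhs block is then read
  // once instead of twice.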
  // |cache_line_size| is provided only for testing. Normally a value
  // appropriate for the current architecture is used.
  int PrepareForThreads(int num_threads, int cache_line_size = -1) {
    num_threads_ = num_threads;
    if (num_threads_ > 1) {
      split_pc_ =
          absl::make_unique<ProducerConsumer>(num_threads_, num_threads_);
    } else {
      split_pc_.reset(nullptr);
    }
    return sparse_matrix_.PrepareForThreads(num_threads, cache_line_size);
  }

  // Partitions the matrix into pieces by thread.
  // Each thread can immediately calculate the part of its output that depends
  // only on rhs inputs that it generated itself in the previous matvec,
  // without any thread synchronization; only after that does it have to wait
  // for the other threads to finish the previous matvec.
  // So we split the matrix, using the |split_points| from the previous
  // matrix, into 2 * |num_threads_| pieces: a self and an other part for each
  // thread, being the parts that can be calculated before and after the other
  // threads have completed their calculation of the previous matvec.
  // We then have to use a ProducerConsumer lock instead of a SpinBarrier to
  // synchronize on the data produced by the other threads.
  void SliceForThreads(const std::vector<int>& split_points) {
    thread_layers_.clear();
    thread_layers_.reserve(num_threads_);
    LOG(INFO) << "Slicing " << rows() << "x" << cols() << " matrix for "
              << num_threads_ << " threads";
    for (int tid = 0; tid < num_threads_; ++tid) {
      thread_layers_.emplace_back(
          sparse_matrix_, full_bias_, bias_, tid,
          split_points[tid] * sparse_matrix_.block_height(),
          split_points[tid + 1] * sparse_matrix_.block_height());
    }
    mid_output_ =
        std::move(csrblocksparse::CacheAlignedVector<BiasType>(rows()));
    mid_output_.FillZero();
  }
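  //
  // Intended call order for SliceForThreads above (comment-only sketch;
  // |prev_layer| and the two-layer recurrence are illustrative assumptions,
  // not defined in this file):
  //
  //   layer.PrepareForThreads(kNumThreads);
  //   prev_layer.PrepareForThreads(kNumThreads);
  //   // Split this layer's columns along the output rows that each thread
  //   // produced in the previous layer's matvec.
  //   layer.SliceForThreads(prev_layer.split_points());
  //   // Then, inside each worker thread, call layer.MatVec(..., tid, ...).
  //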
  // Splits the layer by inputs into 2 equal pieces. Each of the resulting
  // layers should be computed independently on the first and second halves of
  // the inputs respectively, and the results added to achieve the same effect
  // as the original layer.
  void SplitInputs(
      SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
      SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
    CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
        sparse_matrix_.SplitByColumn(0, sparse_matrix_.cols() / 2));
    CsrBlockSparseMatrix<WeightType, RhsType> matrix2(
        sparse_matrix_.SplitByColumn(sparse_matrix_.cols() / 2,
                                     sparse_matrix_.cols()));
    *part1 =
        std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
            std::move(matrix1),
            std::move(CacheAlignedVector<BiasType>(full_bias_))));
    CacheAlignedVector<BiasType> bias2(sparse_matrix_.rows());
    bias2.FillZero();
    *part2 =
        std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
            std::move(matrix2), std::move(bias2)));
  }
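  //
  // Usage sketch for SplitInputs above (comment only; the names and the
  // splitting of |rhs| into halves are illustrative assumptions):
  //
  //   SparseLinearLayer<W, R> left, right;
  //   layer.SplitInputs(&left, &right);
  //   left.SpMM_bias(rhs_first_half, &out);        // A1 * x1 + b (full bias).
  //   right.SpMM_bias(rhs_second_half, &partial);  // A2 * x2 (zero bias).
  //   // Adding |out| and |partial| element-wise reproduces the original
  //   // layer's output.
  //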
  // Splits the layer by outputs into 2 equal pieces. Each of the resulting
  // layers should be computed independently on the full inputs, and the
  // results concatenated to achieve the same effect as the original layer.
  void SplitOutputs(
      SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
      SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
    LOG(INFO) << "input rows=" << sparse_matrix_.rows()
              << ", cols=" << sparse_matrix_.cols();
    CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
        sparse_matrix_.SplitByRow(0, sparse_matrix_.rows() / 2));
    CsrBlockSparseMatrix<WeightType, RhsType> matrix2(
        sparse_matrix_.SplitByRow(sparse_matrix_.rows() / 2,
                                  sparse_matrix_.rows()));
    CacheAlignedVector<BiasType> bias1(full_bias_, 0, full_bias_.size() / 2);
    *part1 =
        std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
            std::move(matrix1), std::move(bias1)));
    CacheAlignedVector<BiasType> bias2(full_bias_, full_bias_.size() / 2,
                                       full_bias_.size());
    *part2 =
        std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
            std::move(matrix2), std::move(bias2)));
  }
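  //
  // Usage sketch for SplitOutputs above (comment only; names are illustrative
  // assumptions):
  //
  //   SparseLinearLayer<W, R> top, bottom;
  //   layer.SplitOutputs(&top, &bottom);
  //   top.SpMM_bias(rhs, &out_top);        // First rows()/2 outputs.
  //   bottom.SpMM_bias(rhs, &out_bottom);  // Last rows()/2 outputs.
  //   // Concatenating |out_top| and |out_bottom| reproduces the original
  //   // layer's output.
  //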

 private:
  // Simple struct to hold a partitioned layer.
  struct PartLinearLayer {
    // The original matrix is first split by row to generate only the outputs
    // for the given tid. The |row_sub_matrix| is then split by column into
    // two partitions:
    // |self| is the part for which the rhs elements in
    // [|start_col|, |end_col|) were generated by this thread in some previous
    // matmul.
    // |other| is the rest of the columns, which require rhs elements from
    // other threads.
    // NOTE that |start_col|, |end_col| are in raw columns, not blocks.
    PartLinearLayer(const CsrBlockSparseMatrix<WeightType, RhsType>& matrix,
                    const CacheAlignedVector<BiasType>& bias,
                    const CacheAlignedVector<BiasType>& bias_4, int tid,
                    int start_col, int end_col) {
      int block_height = matrix.block_height();
      // Split the input matrix by row, selecting only the rows relevant to
      // thread |tid|.
      int start_row = matrix.split_points()[tid] * block_height;
      int end_row = matrix.split_points()[tid + 1] * block_height;
      LOG(INFO) << "input cols [" << start_col << "," << end_col << ") rows ["
                << start_row << "," << end_row << ")";
      CsrBlockSparseMatrix<WeightType, RhsType> row_sub_matrix =
          matrix.SplitByRow(start_row, end_row);
      // Partition into the columns that use rhs elements that thread |tid|
      // produced in a previous matmul, and the other rhs elements.
      // NOTE that we pass |keep_rhs_size|=true so that each matrix can
      // operate on the same rhs input vector. The self matrix just guarantees
      // not to access any of the elements that are generated by another
      // thread.
      self_matrix = std::move(row_sub_matrix.SplitByColumn(
          start_col, end_col, /*keep_rhs_size=*/true));
      self_matrix.PrepareForThreads(1);
      // The reversed start and end slice out the complement of [start, end).
      other_matrix = std::move(row_sub_matrix.SplitByColumn(
          end_col, start_col, /*keep_rhs_size=*/true));
      other_matrix.PrepareForThreads(1);
      full_bias =
          std::move(CacheAlignedVector<BiasType>(bias, start_row, end_row));
      // TODO(b/189958858): Eliminate the quartered bias from all the code.
      quarter_bias =
          std::move(CacheAlignedVector<BiasType>(bias_4, start_row, end_row));
    }
    // The part of the matrix that only depends on this thread for rhs inputs.
    CsrBlockSparseMatrix<WeightType, RhsType> self_matrix;
    CacheAlignedVector<BiasType> full_bias;
    CacheAlignedVector<BiasType> quarter_bias;
    // The part of the matrix that uses rhs inputs from other threads.
    CsrBlockSparseMatrix<WeightType, RhsType> other_matrix;
  };
  CsrBlockSparseMatrix<WeightType, RhsType, DeltaType> sparse_matrix_;
  CacheAlignedVector<BiasType> bias_;
  CacheAlignedVector<BiasType> full_bias_;
  // Output from the |self_matrix| that will be given to |other_matrix| as
  // bias.
  CacheAlignedVector<BiasType> mid_output_;
  // One partitioned pair of matrices for each thread.
  std::vector<PartLinearLayer> thread_layers_;
  // Producer-consumer lock used to wait, between computing |self_matrix| and
  // |other_matrix|, for the other threads to finish the *previous* matvec.
  std::unique_ptr<ProducerConsumer> split_pc_;
  int num_threads_ = 0;
};

template <typename WeightType, typename RhsType>
SparseLinearLayer<WeightType, RhsType> CreateRandomLayer(int rows, int cols,
                                                         float sparsity,
                                                         int block_height = 1,
                                                         int block_width = 1) {
  typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
  CacheAlignedVector<BiasType> bias(rows);
  bias.FillRandom();
  auto masked_matrix = MaskedSparseMatrix<float>(rows, cols, sparsity,
                                                 block_height, block_width);
  auto sparse_matrix =
      CsrBlockSparseMatrix<WeightType, RhsType>(masked_matrix);
  return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
                                                std::move(bias));
}

template <typename WeightType, typename RhsType>
SparseLinearLayer<WeightType, RhsType> CreateConstantLayer(
    int rows, int cols, float sparsity, float constant = 1.f) {
  typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
  CacheAlignedVector<BiasType> bias(rows);
  bias.FillOnes();
  MaskedSparseMatrix<float> masked_matrix(rows, cols, sparsity,
                                          /*block_height=*/1,
                                          /*block_width=*/1, constant,
                                          /*random=*/false);
  CsrBlockSparseMatrix<WeightType, RhsType> sparse_matrix(masked_matrix);
  return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
                                                std::move(bias));
}
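
// Illustrative example of the factory helpers above (comment only; the float
// instantiation and the dimensions are assumptions used purely for
// illustration):
//
//   auto layer = CreateConstantLayer<float, float>(
//       /*rows=*/8, /*cols=*/8, /*sparsity=*/0.5f, /*constant=*/1.f);
//   CacheAlignedVector<float> rhs(layer.cols());
//   rhs.FillOnes();
//   CacheAlignedVector<float> out(layer.rows());
//   layer.SpMM_bias(rhs, &out);
//   // Each output is (number of surviving weights in that row) * 1.f + 1.f,
//   // since all weights, inputs and biases are ones.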

}  // namespace csrblocksparse