/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Separate out the assembly kernels for readability. Eventually this will
// become an ifdef switch on the architecture type.
// If there is no architecture-specific implementation, then always use the
// generic kernels (see the illustrative specialization sketched below).
template <typename WeightType, typename RhsType, typename OutType>
struct ShouldEnableGenericSpMV_4x4 : std::true_type {};
template <typename WeightType, typename RhsType, typename OutType>
struct ShouldEnableGenericSpMM5_4x4 : std::true_type {};
template <typename WeightType, typename RhsType, typename OutType>
struct ShouldEnableGenericSpMV_1x1 : std::true_type {};
template <typename WeightType, typename RhsType, typename OutType>
struct ShouldEnableGenericSpMM5_1x1 : std::true_type {};
template <typename Type>
struct ShouldEnableGenericAdd : std::true_type {};
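
// As a hypothetical sketch (no such specialization exists in this file): an
// architecture-specific build that provides its own fixed16 SpMV_4x4 kernel
// would turn off the generic version for exactly that type combination, e.g.:
//
//   template <>
//   struct ShouldEnableGenericSpMV_4x4<int16_t, int16_t, int16_t>
//       : std::false_type {};
//
// The std::enable_if guards on the functions below then remove the generic
// overload from the overload set for that combination.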

namespace csrblocksparse {
namespace detail {

// The computational routines do NO error checking for speed. It is assumed
// that this has been handled by CSRBlockSparseMatrix.

// Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
// blocked pattern, x is a vector, and b is a vector. Weights are stored for
// this routine by making each 4x4 block contiguous. Blocks are ordered in
// standard row-major format. Column indices are converted to deltas and then
// multiplied by the size of RhsType in bytes (2 for the 16-bit types used by
// the assembly kernels), so that each value can be used directly to offset
// the pointer into the rhs vector.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function. This is automatically taken care of in SparseLinearLayer.
// The bias is reconstructed through horizontal additions, which leads to a
// small speedup by reducing latencies at the end of the loop.
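// For example (illustrative values only): blocks at block-column indices
// {0, 2, 3} in a reduced row start at element columns {0, 8, 12}, giving
// column deltas of {0, 8, 4} elements, stored as {0, 16, 8} bytes for a
// 16-bit RhsType. Likewise a true bias of 1.f would be stored as .25f and
// scaled back by the 4.f factor below.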
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMV_4x4<WeightType, RhsType, OutType>::value>::type
SpMV_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
         const int32_t* nnz_per_row, const RhsType* rhs_ptr,
         const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
         OutType* out_ptr, int64_t assigned_rows,
         int64_t rows /* only used in SpMM variants */,
         int64_t cols /* only used in SpMM variants */, int relu) {
  for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
    float accumulators[4];
    // Undo the division by 4 that happens in the assembly version.
    for (int i = 0; i < 4; ++i)
      accumulators[i] = 4.f * static_cast<float>(*bias_ptr++);

    int reduced_col_count = *nnz_per_row++;
    for (int c = 0; c < reduced_col_count; ++c) {
      int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
      rhs_ptr += col_delta;

      // Multiply this 4x4 block.
      for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
          accumulators[i] += static_cast<float>(*weights_ptr++) *
                             static_cast<float>(rhs_ptr[j]);
        }
      }
    }

    for (int i = 0; i < 4; ++i)
      *out_ptr++ = static_cast<OutType>(relu ? std::max(accumulators[i], 0.f)
                                             : accumulators[i]);
  }
}
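
// A minimal usage sketch, not part of the library: one reduced row (4 output
// rows) holding a single 4x4 identity block at column 0, with float data
// throughout, so TypeOfProduct<float, float>::type is assumed to be float.
// The names and values below are hypothetical.
inline void ExampleSpMV_4x4_Sketch() {
  const float weights[16] = {1.f, 0.f, 0.f, 0.f,   // Block row 0.
                             0.f, 1.f, 0.f, 0.f,   // Block row 1.
                             0.f, 0.f, 1.f, 0.f,   // Block row 2.
                             0.f, 0.f, 0.f, 1.f};  // Block row 3.
  const int16_t col_deltas_bytes[1] = {0};  // The block starts at column 0.
  const int32_t nnz_per_row[1] = {1};       // One 4x4 block in this row.
  const float rhs[4] = {1.f, 2.f, 3.f, 4.f};
  // True bias of 1.f stored pre-scaled by .25f, as SparseLinearLayer does.
  const float bias[4] = {.25f, .25f, .25f, .25f};
  float out[4];
  SpMV_4x4(weights, col_deltas_bytes, nnz_per_row, rhs, bias, out,
           /*assigned_rows=*/1, /*rows=*/4, /*cols=*/4, /*relu=*/0);
  // out now holds {2.f, 3.f, 4.f, 5.f}: identity * rhs plus the unit bias.
}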

// Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
// blocked pattern, x is a fat vector with 5 columns, and b is a vector that is
// broadcast across all 5 columns. Weights are stored for this routine by
// making each 4x4 block contiguous. Blocks are ordered in standard row-major
// format. Column indices are converted to deltas and then multiplied by the
// size of RhsType in bytes, so that each value can be used directly to offset
// the pointer into the rhs vector.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function. This is automatically taken care of in SparseLinearLayer.
// The bias is reconstructed through horizontal additions, which leads to a
// small speedup by reducing latencies at the end of the loop.
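// Layout note (restating the pointer setup below): the 5 rhs columns are
// stored contiguously, column k starting at rhs_ptr + k * cols, and the 5
// output columns likewise start at out_ptr + k * rows; all five column
// pointers advance in lockstep by each column delta.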
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMM5_4x4<WeightType, RhsType, OutType>::value>::type
SpMM5_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
          const int32_t* nnz_per_row, const RhsType* rhs_ptr,
          const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
          OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
          int relu) {
  const RhsType* rhs_ptrs[5];
  for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
  OutType* out_ptrs[5];
  for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;

  for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
    float accumulators[4][5];
    // Undo the division by 4 that happens in the assembly version.
    for (int i = 0; i < 4; ++i) {
      for (int k = 0; k < 5; ++k) {
        accumulators[i][k] = 4.f * static_cast<float>(*bias_ptr);
      }
      ++bias_ptr;
    }

    int reduced_col_count = *nnz_per_row++;
    for (int c = 0; c < reduced_col_count; ++c) {
      int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
      for (int k = 0; k < 5; ++k) rhs_ptrs[k] += col_delta;

      // Multiply this 4x4 block against all 5 rhs columns.
      for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
          for (int k = 0; k < 5; ++k) {
            accumulators[i][k] += static_cast<float>(*weights_ptr) *
                                  static_cast<float>(rhs_ptrs[k][j]);
          }
          weights_ptr++;
        }
      }
    }

    for (int k = 0; k < 5; ++k) {
      for (int i = 0; i < 4; ++i) {
        out_ptrs[k][0] = static_cast<OutType>(
            relu ? std::max(accumulators[i][k], 0.f) : accumulators[i][k]);
        out_ptrs[k]++;
      }
    }
  }
}

// Performs the calculation y = A * x + b where A is a sparse matrix with a 1x1
// blocked pattern (i.e. unstructured), x is a vector, and b is a vector.
// Weights are stored for this routine in standard CSR format. Each row must
// have a number of non-zero columns that is a multiple of 8.
// Column indices are converted to deltas and then multiplied by the size of
// RhsType in bytes, so that each value can be used directly to offset the
// pointer into the rhs vector.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function. This is automatically taken care of in SparseLinearLayer.
// The bias is reconstructed through horizontal additions, which leads to a
// small speedup by reducing latencies at the end of the loop.
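// For example (illustrative values only): non-zero columns {1, 3, 7} in a row
// become deltas {1, 2, 4} elements, stored as {2, 4, 8} bytes for a 16-bit
// RhsType; the first delta is measured from wherever rhs_ptr points on entry
// to the row.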
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMV_1x1<WeightType, RhsType, OutType>::value>::type
SpMV_1x1(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
         const int32_t* nnz_per_row, const RhsType* rhs_ptr,
         const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
         OutType* out_ptr, int64_t assigned_rows,
         int64_t rows /* only used in SpMM variants */,
         int64_t cols /* only used in SpMM variants */, int relu) {
  for (int row = 0; row < assigned_rows; ++row) {
    // Undo the division by 4 that happens in the assembly version.
    float accumulator = 4.f * static_cast<float>(*bias_ptr++);

    int col_count = *nnz_per_row++;
    for (int c = 0; c < col_count; ++c) {
      int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
      rhs_ptr += col_delta;
      accumulator +=
          static_cast<float>(*weights_ptr++) * static_cast<float>(*rhs_ptr);
    }

    *out_ptr++ =
        static_cast<OutType>(relu ? std::max(accumulator, 0.f) : accumulator);
  }
}
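
// A minimal usage sketch, not part of the library, with hypothetical names
// and float data (so TypeOfProduct<float, float>::type is assumed to be
// float). For brevity this ignores the multiple-of-8 padding requirement
// noted above, which the generic loop itself does not depend on.
inline void ExampleSpMV_1x1_Sketch() {
  const float weights[2] = {2.f, 3.f};         // Non-zeros at columns 0 and 2.
  const int16_t col_deltas_bytes[2] = {0, 8};  // Deltas {0, 2} * 4 bytes.
  const int32_t nnz_per_row[1] = {2};
  const float rhs[3] = {1.f, 0.f, 5.f};
  const float bias[1] = {.25f};  // True bias of 1.f, pre-scaled by .25f.
  float out[1];
  SpMV_1x1(weights, col_deltas_bytes, nnz_per_row, rhs, bias, out,
           /*assigned_rows=*/1, /*rows=*/1, /*cols=*/3, /*relu=*/0);
  // out[0] == 2 * 1 + 3 * 5 + 1 == 18.f.
}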

// Performs the calculation y = A * x + b where A is a sparse matrix with a 1x1
// blocked pattern (i.e. unstructured), x is a fat vector with 5 columns, and
// b is a vector that is broadcast across all 5 columns.
// Weights are stored for this routine in standard CSR format. Each row must
// have a number of non-zero columns that is a multiple of 8.
// Column indices are converted to deltas and then multiplied by the size of
// RhsType in bytes, so that each value can be used directly to offset the
// pointer into the rhs vector.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function. This is automatically taken care of in SparseLinearLayer.
// The bias is reconstructed through horizontal additions, which leads to a
// small speedup by reducing latencies at the end of the loop.
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMM5_1x1<WeightType, RhsType, OutType>::value>::type
SpMM5_1x1(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
          const int32_t* nnz_per_row, const RhsType* rhs_ptr,
          const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
          OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
          int relu) {
  const RhsType* rhs_ptrs[5];
  for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
  OutType* out_ptrs[5];
  for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;

  for (int row = 0; row < assigned_rows; ++row) {
    // Undo the division by 4 that happens in the assembly version.
    float accumulator[5];
    for (int i = 0; i < 5; ++i)
      accumulator[i] = 4.f * static_cast<float>(*bias_ptr);
    ++bias_ptr;

    int col_count = *nnz_per_row++;
    for (int c = 0; c < col_count; ++c) {
      int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
      for (int i = 0; i < 5; ++i) {
        rhs_ptrs[i] += col_delta;
        accumulator[i] += static_cast<float>(*weights_ptr) *
                          static_cast<float>(rhs_ptrs[i][0]);
      }
      weights_ptr++;
    }

    for (int i = 0; i < 5; ++i) {
      out_ptrs[i][0] = static_cast<OutType>(
          relu ? std::max(accumulator[i], 0.f) : accumulator[i]);
      out_ptrs[i]++;
    }
  }
}
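
// Element-wise sum of two vectors over the index range [start, end):
// result[i] = add1[i] + add2[i], with the addition performed in float so that
// fixed-point Types are converted, added, and converted back.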
template <typename Type>
typename std::enable_if<ShouldEnableGenericAdd<Type>::value>::type SumVectors(
    int start, int end, const Type* add1, const Type* add2, Type* result) {
  LOG_FIRST_N(WARNING, 1) << "SumVectors: using generic kernel!";
  for (int i = start; i < end; ++i) {
    Type sum = static_cast<Type>(static_cast<float>(add1[i]) +
                                 static_cast<float>(add2[i]));
    result[i] = sum;
  }
}

}  // namespace detail
}  // namespace csrblocksparse