Spaces:

ntt123
/

vietnam-male-voice-wavegru-tts

Running

File size: 35,003 Bytes

21f3d42

/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_CSR_BLOCKSPARSE_MATRIX_H_
#define LYRA_CODEC_SPARSE_MATMUL_LAYERS_CSR_BLOCKSPARSE_MATRIX_H_

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

#include "glog/logging.h"
// IWYU pragma: begin_exports
#include "sparse_matmul/compute/kernels_generic.h"
#include "sparse_matmul/compute/matmul.h"
#include "sparse_matmul/compute/thread_bounds.h"
#include "sparse_matmul/layers/masked_sparse_matrix.h"
#include "sparse_matmul/numerics/fixed_types.h"
#include "sparse_matmul/numerics/float16_types.h"
#include "sparse_matmul/os/coop_threads.h"
#include "sparse_matmul/vector/cache_aligned_vector.h"
// IWYU pragma: end_exports
#include "absl/memory/memory.h"

namespace csrblocksparse {
// CsrBlockSparseMatrix stores a modified block compressed sparse row
// representation of a sparse matrix.  The ordering of the weights is modified
// in the 16x1 and 1x1 cases so that a certain number (4 and 8 respectively)
// of columns of weights are stored contiguously before moving on to the next
// row.  The 4x4 case stores each block contiguously.
//
// Currently it is constructed from a MaskedSparseMatrix which uses a dense
// binary mask representation.  The construction generates the compressed
// representation.  Further iterations will support a direct serialization
// of the compressed representation.
//
// MaskedSparseMatrix masked_matrix(rows, cols, existing_mask, existing_values)
// CsrBlockSparseMatrix matrix(masked_matrix)
//
// matrix.SpMV_bias(rhs, bias, &out);
//
// This class is thread compatible.
template <typename WeightType, typename RhsType, typename DeltaType = int16_t>
class CsrBlockSparseMatrix {
 public:
  CsrBlockSparseMatrix() {}

  // Deserializes a matrix from |buffer|, which must hold exactly |len| bytes in
  // the layout consumed by ReadFromFlatBuffer(), then rebuilds |rhs_indices_|
  // from the loaded data.
  // Reference used to indicate that this is an input and not an output.
  CsrBlockSparseMatrix(const uint8_t* const& buffer, const std::size_t& len) {
    ReadFromFlatBuffer(buffer, len);
    // Must run after the read: it consumes |col_deltas_|, |nnz_per_row_|,
    // |reduced_rows_| and |block_width_|, all of which the read fills in.
    ComputeRHSIndices();
  }

  template <typename InputType>
  CsrBlockSparseMatrix(const MaskedSparseMatrix<InputType>& masked_matrix) {
    sparsity_ = masked_matrix.sparsity();
    rows_ = masked_matrix.rows();
    cols_ = masked_matrix.cols();

    DetermineBlockSize(masked_matrix);

    if (block_width_ == 1 && block_height_ == 1)
      col_multiple_ = 8;
    else
      col_multiple_ = 1;

    std::vector<InputType> weights(masked_matrix.values().begin(),
                                   masked_matrix.values().end());

    reduced_rows_ = (rows_ + block_height_ - 1) / block_height_;
    rows_ = reduced_rows_ * block_height_;
    reduced_cols_ = cols_ / block_width_;

    // Calculate the reduced CSR representation of the matrix.
    std::vector<int> reduced_mask(reduced_rows_ * reduced_cols_);
    std::vector<int> row_offsets = {0};
    int nnz = 0;
    const auto& mask = masked_matrix.mask();
    for (int r = 0; r < reduced_rows_; ++r) {
      for (int c = 0; c < reduced_cols_; ++c) {
        int mask_val = mask[r * block_height_ * cols_ + c * block_width_];
        reduced_mask[r * reduced_cols_ + c] = mask_val;
        nnz += mask_val;
      }
      row_offsets.push_back(nnz);
    }

    // Make sure the reduced representation has the correct number of columns.
    MakeColumnsMultiple(row_offsets, &reduced_mask, &weights);

    std::vector<int> col_indices;
    std::vector<WeightType> weights_csr;
    std::vector<int> nnz_per_row;
    MaskAndWeightsToCsr(reduced_mask, weights, &nnz_per_row, &col_indices,
                        &weights_csr);

    // Generate column deltas from |col_indices|.
    std::vector<DeltaType> col_deltas;
    for (int i = 0; i < col_indices.size(); ++i) {
      // |col_indices| are used to index the RHS vector which is always float.
      int64_t diff = sizeof(RhsType);
      if (i == 0)
        diff *= block_width_ * (col_indices[i]);
      else
        diff *= block_width_ * (col_indices[i] - col_indices[i - 1]);

      CHECK(diff < std::numeric_limits<DeltaType>::max())
          << "delta between column indices in bytes " << diff
          << " exceeded the maximum size of the DeltaType "
          << std::numeric_limits<DeltaType>::max();
      col_deltas.push_back(static_cast<DeltaType>(diff));
    }

    // Because of pre-fetching we need some extra values at the end.
    col_deltas.insert(col_deltas.end(), std::max(2, col_multiple_ + 1), 0);
    nnz_per_row.insert(nnz_per_row.end(), 2, nnz_per_row.back());

    weights_ = CacheAlignedVector<WeightType>(weights_csr);
    col_deltas_ = CacheAlignedVector<DeltaType>(col_deltas);
    nnz_per_row_ = CacheAlignedVector<int>(nnz_per_row);
    ComputeRHSIndices();

    num_threads_ = 0;
    PrepareForThreads(1);
  }

  // Assembles a matrix from already-compressed data. The block layout, column
  // multiple and name are copied from |src_matrix|; the weights, byte deltas
  // and per-block-row non-zero counts are taken from the arguments. The number
  // of block rows is |new_nnz.size()|, and |cols| is the number of raw columns
  // (NOT blocks) of the new matrix. The result is prepared for one thread.
  CsrBlockSparseMatrix(
      const CsrBlockSparseMatrix<WeightType, RhsType, DeltaType>& src_matrix,
      const std::vector<WeightType>& new_weights,
      const std::vector<DeltaType>& new_deltas, const std::vector<int>& new_nnz,
      int cols) {
    // Inherited from the source matrix.
    name_ = src_matrix.name_;
    col_multiple_ = src_matrix.col_multiple_;
    block_width_ = src_matrix.block_width_;
    block_height_ = src_matrix.block_height_;

    // Geometry of the new matrix.
    cols_ = cols;
    reduced_cols_ = cols_ / block_width_;
    reduced_rows_ = new_nnz.size();
    rows_ = reduced_rows_ * block_height_;

    // Compressed payload.
    nnz_per_row_ = CacheAlignedVector<int>(new_nnz);
    col_deltas_ = CacheAlignedVector<DeltaType>(new_deltas);
    weights_ = CacheAlignedVector<WeightType>(new_weights);

    const float stored_weights = static_cast<float>(new_weights.size());
    sparsity_ = 1.0f - stored_weights / (rows_ * cols_);

    ComputeRHSIndices();

    // Zero the thread count so PrepareForThreads() does not early-out.
    num_threads_ = 0;
    PrepareForThreads(1);
  }

  // Factory method takes a column slice out of *this and returns a sparse
  // matrix that takes as inputs [|start_col|, |end_col|) of *this, and
  // returns the same number of outputs, but only a partial result.
  // If |keep_rhs_size|, then the new matrix takes the same rhs as the current
  // matrix, but uses a subset of it, instead of expecting just the reduced rhs.
  // If |start_col| > |end_col|, then we slice out the complement of the defined
  // interval, ie [0, |end_col|) + [|start_col|, current end).
  // NOTE That |start_col| and |end_col| are in raw column coordinates, NOT
  // block units.
  // NOTE(review): with |start_col| > |end_col| and |keep_rhs_size| false, the
  // column count handed to the new matrix is |end_col| - |start_col|, which is
  // negative; the complement form looks intended for |keep_rhs_size| == true
  // only -- confirm against callers.
  CsrBlockSparseMatrix SplitByColumn(int start_col, int end_col,
                                     bool keep_rhs_size = false) const {
    // Deltas are negative whenever the position moves back towards column 0
    // (e.g. at the start of a new row), so all arithmetic on them must stay
    // signed: dividing by the unsigned sizeof() would first convert the delta
    // to size_t and yield a wrong column when size_t is 32 bits wide.
    const int rhs_bytes = static_cast<int>(sizeof(RhsType));
    const int block_size = block_width_ * block_height_;

    int weight_index = 0;
    int delta_index = 0;
    std::vector<DeltaType> new_deltas;
    std::vector<WeightType> new_weights;
    std::vector<int> new_nnz(reduced_rows_);
    // |col| tracks the absolute raw column of the current block in *this;
    // |prev_col| is the column of the last block kept, in the coordinates of
    // the rhs that the new matrix will consume.
    int col = 0;
    int prev_col = keep_rhs_size ? 0 : start_col;
    for (int r = 0; r < reduced_rows_; ++r) {
      const int reduced_col_count = nnz_per_row_[r];
      for (int c = 0; c < reduced_col_count; ++c, ++delta_index) {
        col += col_deltas_[delta_index] / rhs_bytes;
        const bool in_interval =
            start_col < end_col && start_col <= col && col < end_col;
        const bool in_complement =
            start_col > end_col && (col < end_col || col >= start_col);
        if (in_interval || in_complement) {
          ++new_nnz[r];
          new_deltas.push_back(
              static_cast<DeltaType>((col - prev_col) * rhs_bytes));
          prev_col = col;
          new_weights.insert(new_weights.end(),
                             weights_.data() + weight_index,
                             weights_.data() + weight_index + block_size);
        }
        // Kept or not, step past this block's weights.
        weight_index += block_size;
      }
    }
    int new_cols = keep_rhs_size ? cols_ : end_col - start_col;
    return CsrBlockSparseMatrix(*this, new_weights, new_deltas, new_nnz,
                                new_cols);
  }

  // Factory method takes a row slice out of *this and returns a sparse
  // matrix that takes the same inputs as *this, and returns the outputs for
  // the range [|start_row|, |end_row|).
  // NOTE That |start_row| and |end_row| are in raw row coordinates, NOT
  // block units. Both are converted to block rows by integer division by the
  // block height, so values that are not block-aligned are rounded down.
  CsrBlockSparseMatrix SplitByRow(int start_row, int end_row) const {
    const int start_reduced = start_row / block_height_;
    const int end_reduced = end_row / block_height_;
    std::vector<int> new_nnz(nnz_per_row_.data() + start_reduced,
                             nnz_per_row_.data() + end_reduced);

    // [weight_start, weight_end) is the range of stored blocks that belong to
    // the selected block rows.
    int weight_start = 0;
    for (int r = 0; r < start_reduced; ++r) {
      weight_start += nnz_per_row_[r];
    }
    int weight_end = weight_start;
    for (int r = start_reduced; r < end_reduced; ++r) {
      weight_end += nnz_per_row_[r];
    }

    // Sum of all byte deltas before the slice, i.e. the absolute rhs byte
    // position of the last block that precedes it.
    int delta_start = 0;
    for (int i = 0; i < weight_start; ++i) {
      delta_start += col_deltas_[i];
    }
    std::vector<DeltaType> new_deltas(col_deltas_.data() + weight_start,
                                      col_deltas_.data() + weight_end);
    // The first kept delta was relative to the preceding block; make it
    // absolute. A slice without any stored block has no delta to adjust.
    if (!new_deltas.empty()) {
      new_deltas[0] = static_cast<DeltaType>(new_deltas[0] + delta_start);
    }

    const int block_size = block_height_ * block_width_;
    std::vector<WeightType> new_weights(
        weights_.data() + weight_start * block_size,
        weights_.data() + weight_end * block_size);
    return CsrBlockSparseMatrix(*this, new_weights, new_deltas, new_nnz, cols_);
  }

  // Combines adjacent row blocks, doubling the block height.
  // This necessarily involves adding zero weights where the blocks don't align
  // across adjacent pairs of rows, so use with caution, as the resulting matrix
  // is most likely to run slower if very sparse to begin with.
  // In the few cases where the blocks do mostly align, the resulting matmul
  // could be much faster, as the number of reads of the rhs will be halved.
  void DoubleBlockHeight() {
    int new_rows = reduced_rows_ / 2;
    std::vector<int> new_nnz(new_rows);
    std::vector<DeltaType> new_rhs_indices;
    std::vector<WeightType> new_weights;
    int rhs_index1 = 0;
    int rhs_index2 = 0;
    int block_size = block_height_ * block_width_;
    for (int r = 0; r < new_rows; ++r) {
      int start_nnz = new_rhs_indices.size();
      rhs_index2 += nnz_per_row_[r * 2];
      int end1 = rhs_index1 + nnz_per_row_[r * 2];
      int end2 = rhs_index2 + nnz_per_row_[r * 2 + 1];
      // Run over a pair of rows with 2 iterators, combining blocks as we go, or
      // padding with zeros where the block positions don't match.
      while (rhs_index1 < end1 || rhs_index2 < end2) {
        int col1 = rhs_index1 < end1 ? rhs_indices_[rhs_index1] : reduced_cols_;
        int col2 = rhs_index2 < end2 ? rhs_indices_[rhs_index2] : reduced_cols_;
        if (col1 < col2) {
          // Need zero weights for row2 to pad out weights block.
          new_rhs_indices.push_back(col1);
          new_weights.insert(new_weights.end(),
                             weights_.data() + rhs_index1 * block_size,
                             weights_.data() + (rhs_index1 + 1) * block_size);
          new_weights.insert(new_weights.end(), block_size,
                             static_cast<WeightType>(0.0f));
          ++rhs_index1;
        } else if (col1 > col2) {
          // Need zero weights for row1 to pad out weights block.
          new_rhs_indices.push_back(col2);
          new_weights.insert(new_weights.end(), block_size,
                             static_cast<WeightType>(0.0f));
          new_weights.insert(new_weights.end(),
                             weights_.data() + rhs_index2 * block_size,
                             weights_.data() + (rhs_index2 + 1) * block_size);
          ++rhs_index2;
        } else {
          // Combine weights for both row1 and row2.
          new_rhs_indices.push_back(col1);
          new_weights.insert(new_weights.end(),
                             weights_.data() + rhs_index1 * block_size,
                             weights_.data() + (rhs_index1 + 1) * block_size);
          new_weights.insert(new_weights.end(),
                             weights_.data() + rhs_index2 * block_size,
                             weights_.data() + (rhs_index2 + 1) * block_size);
          ++rhs_index1;
          ++rhs_index2;
        }
      }
      rhs_index1 = rhs_index2;
      new_nnz[r] = new_rhs_indices.size() - start_nnz;
    }
    block_height_ *= 2;
    reduced_rows_ /= 2;
    weights_ = CacheAlignedVector<WeightType>(new_weights);
    rhs_indices_ = CacheAlignedVector<DeltaType>(new_rhs_indices);
    nnz_per_row_ = CacheAlignedVector<int>(new_nnz);
    sparsity_ = 1.0f - static_cast<float>(new_weights.size()) / (rows_ * cols_);
    ComputeColDeltas();
    if (num_threads_ > 0) {
      int num_threads = num_threads_;
      num_threads_ = 0;
      PrepareForThreads(num_threads);
    }
  }

  // Allocates memory and fills buffer.
  // Caller is responsible for the memory de-allocation.
  // TODO(b/189958858): Both Read and Write need to eventually handle the
  // different possible HalfType and DeltaType values, but punting for now as
  // there is only one supported combination.
  std::size_t WriteToFlatBuffer(std::string* csr_flatbuffer) {
    std::size_t bytes = 0;
    bytes += FixedParameterSize();
    bytes += weights_.size() * sizeof(WeightType);
    bytes += col_deltas_.size() * sizeof(DeltaType);
    bytes += nnz_per_row_.size() * sizeof(int);

    uint8_t* bytes_ptr_ptr =
        reinterpret_cast<uint8_t*>(CHECK_NOTNULL(malloc(bytes)));

    int* int_bytes_ptr = reinterpret_cast<int*>(bytes_ptr_ptr);

    *int_bytes_ptr++ = rows_;
    *int_bytes_ptr++ = cols_;
    *int_bytes_ptr++ = reduced_rows_;
    *int_bytes_ptr++ = reduced_cols_;
    *int_bytes_ptr++ = block_width_;
    *int_bytes_ptr++ = block_height_;
    *int_bytes_ptr++ = col_multiple_;
    *int_bytes_ptr++ = num_threads_;
    *int_bytes_ptr++ = weights_.size();
    *int_bytes_ptr++ = col_deltas_.size();
    *int_bytes_ptr++ = nnz_per_row_.size();

    float* float_bytes_ptr = reinterpret_cast<float*>(int_bytes_ptr);
    *float_bytes_ptr++ = sparsity_;

    uint8_t* bytes_ptr = reinterpret_cast<uint8_t*>(float_bytes_ptr);

    memcpy(bytes_ptr, weights_.data(), weights_.size() * sizeof(WeightType));
    bytes_ptr += weights_.size() * sizeof(WeightType);

    memcpy(bytes_ptr, col_deltas_.data(),
           col_deltas_.size() * sizeof(DeltaType));
    bytes_ptr += col_deltas_.size() * sizeof(DeltaType);

    memcpy(bytes_ptr, nnz_per_row_.data(), nnz_per_row_.size() * sizeof(int));
    bytes_ptr += nnz_per_row_.size() * sizeof(int);

    csr_flatbuffer->resize(bytes);
    csr_flatbuffer->assign(reinterpret_cast<char*>(bytes_ptr_ptr), bytes);
    free(bytes_ptr_ptr);

    return bytes;
  }

  void ReadFromFlatBuffer(const uint8_t* const& bytes, const std::size_t& len) {
    CHECK_GE(len, FixedParameterSize());

    const int* int_bytes_ptr = reinterpret_cast<const int*>(bytes);
    rows_ = *int_bytes_ptr++;
    cols_ = *int_bytes_ptr++;
    reduced_rows_ = *int_bytes_ptr++;
    reduced_cols_ = *int_bytes_ptr++;
    block_width_ = *int_bytes_ptr++;
    block_height_ = *int_bytes_ptr++;
    col_multiple_ = *int_bytes_ptr++;
    int num_threads = *int_bytes_ptr++;
    int32_t weights_size = *int_bytes_ptr++;
    int32_t col_deltas_size = *int_bytes_ptr++;
    int32_t nnz_per_row_size = *int_bytes_ptr++;

    // Make sure negative sizes don't mess things up.
    weights_size = std::max(0, weights_size);
    col_deltas_size = std::max(0, col_deltas_size);
    nnz_per_row_size = std::max(0, nnz_per_row_size);

    const float* float_bytes_ptr =
        reinterpret_cast<const float*>(int_bytes_ptr);
    sparsity_ = *float_bytes_ptr++;

    std::size_t total_bytes =
        FixedParameterSize() + weights_size * sizeof(WeightType) +
        col_deltas_size * sizeof(DeltaType) + nnz_per_row_size * sizeof(int);

    CHECK_EQ(total_bytes, len)
        << "total bytes: " << total_bytes << ", actual len given: " << len;

    const uint8_t* bytes_ptr =
        reinterpret_cast<const uint8_t*>(float_bytes_ptr);
    std::vector<WeightType> weights_raw(weights_size);
    memcpy(weights_raw.data(), bytes_ptr, weights_size * sizeof(WeightType));
    weights_ = CacheAlignedVector<WeightType>(weights_raw);
    bytes_ptr += weights_size * sizeof(WeightType);

    std::vector<DeltaType> deltas_raw(col_deltas_size);
    memcpy(deltas_raw.data(), bytes_ptr, col_deltas_size * sizeof(DeltaType));
    col_deltas_ = CacheAlignedVector<DeltaType>(deltas_raw);
    bytes_ptr += col_deltas_size * sizeof(DeltaType);

    std::vector<int> nnz_raw(nnz_per_row_size);
    memcpy(nnz_raw.data(), bytes_ptr, nnz_per_row_size * sizeof(int));
    nnz_per_row_ = CacheAlignedVector<int>(nnz_raw);
    num_threads_ = 0;
    PrepareForThreads(num_threads);
  }

  // Multiply a Sparse matrix by a possibly dense matrix.  Often the matrix is
  // a vector with a small number of columns, hence the term "fat vector".
  // 1x1 and 4x4 have specializations for output columns (ie fatness) > 5,
  // and often achieve twice as many GFlops when multiplying a right hand side
  // that has 5 or more columns.  (Best is a multiple of 5).
  // 16x1 doesn't have enough registers and just loops over the width 1 kernel.
  //
  // |rhs| and |out| are COLUMN MAJOR.
  //
  // Fast Tuples WeightType, BiasType, RhsType, OutType are:
  // (float, float, float, float)
  // (bfloat16, float, float, float)
  // and only on ARM64.  All other cases use a slow generic implementation.
  //
  // Thread |tid| computes only the rows assigned to it by |thread_bounds_|.
  // The rhs columns are consumed five at a time while at least five remain,
  // then one at a time. If |barrier| is non-null it is waited on after every
  // such step.
  // CHECK-fails if |tid| is not below the prepared thread count, if |rhs| and
  // |out| disagree on the number of columns, if |rhs| does not have |cols_|
  // rows, or if |out| has fewer than |rows_| rows.
  template <typename RhsClass, typename BiasClass, typename OutClass,
            typename BiasType = typename BiasClass::value_type,
            typename OutType = typename OutClass::value_type>
  void SpMM_bias(const RhsClass& rhs, const BiasClass& bias, OutClass* out,
                 bool relu = false, int tid = 0,
                 SpinBarrier* barrier = nullptr) const {
    static_assert(std::is_same<typename RhsClass::value_type, RhsType>::value,
                  "Rhs types must match");
    CHECK_LT(tid, num_threads_);
    CHECK_EQ(rhs.cols(), out->cols());
    CHECK_EQ(rhs.rows(), cols_);
    CHECK_GE(out->rows(), rows_);
    int cols_to_go = out->cols();

    // |rhs_indices_| holds block-column indices, so the element offset into
    // the rhs is the index scaled by the block WIDTH. (For the square blocks
    // handled below this equals scaling by the block height.)
    const int rhs_index =
        *thread_bounds_.OffsetRhsIndices(rhs_indices_.data(), tid);
    const RhsType* rhs_ptr = rhs.data() + rhs_index * block_width_;
    OutType* out_ptr = thread_bounds_.OffsetOutput(out->data(), tid);
    const WeightType* weights_ptr =
        thread_bounds_.OffsetWeights(weights_.data(), tid);
    const DeltaType* delta_ptr =
        thread_bounds_.OffsetRhsIndices(col_deltas_.data(), tid);
    // Back the rhs pointer off by this thread's first delta -- presumably
    // because the kernels add each delta before reading; confirm against the
    // kernels. The delta can be negative, so the division must be signed:
    // dividing by the unsigned sizeof() would convert it to size_t first.
    const int offset = *delta_ptr / static_cast<int>(sizeof(RhsType));
    rhs_ptr -= offset;
    const int* nnz_ptr = nnz_per_row_.data() + thread_bounds_.StartRow(tid);
    int assigned_rows =
        thread_bounds_.StartRow(tid + 1) - thread_bounds_.StartRow(tid);
    const BiasType* bias_ptr = thread_bounds_.OffsetBias(bias.data(), tid);

    while (cols_to_go > 0) {
      if (block_width_ == 4 && block_height_ == 4) {
        if (cols_to_go >= 5) {
          detail::SpMM5_4x4<WeightType, RhsType, OutType>(
              weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
              assigned_rows, out->col_stride(), rhs.col_stride(), relu);
        } else {
          detail::SpMV_4x4<WeightType, RhsType, OutType>(
              weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
              assigned_rows, out->col_stride(), rhs.col_stride(), relu);
        }
      } else {
        // Every other block shape goes through the 1x1 kernels.
        if (cols_to_go >= 5) {
          detail::SpMM5_1x1<WeightType, RhsType, OutType>(
              weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
              assigned_rows, out->col_stride(), rhs.col_stride(), relu);
        } else {
          detail::SpMV_1x1<WeightType, RhsType, OutType>(
              weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
              assigned_rows, out->col_stride(), rhs.col_stride(), relu);
        }
      }

      // Advance past the rhs/output columns that were just processed.
      if (cols_to_go >= 5) {
        cols_to_go -= 5;
        rhs_ptr += rhs.col_stride() * 5;
        out_ptr += out->col_stride() * 5;
      } else {
        cols_to_go--;
        rhs_ptr += rhs.col_stride();
        out_ptr += out->col_stride();
      }
      if (barrier) barrier->barrier();
    }
  }
  // Matrix-vector product for thread |tid| through |matmul_|, restricted to
  // the block rows [StartRow(tid), StartRow(tid + 1)) of |thread_bounds_|.
  // Requires blocks that are 4 wide and either 4 or 8 high; CHECK-fails
  // otherwise, or if |tid| is not below the prepared thread count.
  // |rhs|, |relu|, |replicas| and |output_stride| are forwarded unchanged;
  // |bias| and |output| are first offset to this thread's portion.
  template <typename MVRhsType, typename MVBiasType, typename OutType>
  void MatVec(const MVRhsType* rhs, const MVBiasType* bias, bool relu, int tid,
              int replicas, int output_stride, OutType* output) {
    CHECK_LT(tid, num_threads_);
    CHECK_EQ(block_width_, 4) << "Block width must be 4!";
    const bool use_8x4 = (block_height_ == 8);
    if (!use_8x4) {
      CHECK_EQ(block_height_, 4) << "Block height must be 4 or 8!";
    }

    // Per-thread views; both kernels take exactly the same arguments.
    auto thread_weights =
        thread_bounds_.OffsetWeights(weights_.cast_data(), tid);
    auto thread_bias = thread_bounds_.OffsetBias(bias, tid);
    auto thread_rhs_indices =
        thread_bounds_.OffsetRhsIndices(rhs_indices_.data(), tid);
    auto thread_output = thread_bounds_.OffsetOutput(output, tid);
    const int first_row = thread_bounds_.StartRow(tid);
    const int end_row = thread_bounds_.StartRow(tid + 1);

    if (use_8x4) {
      matmul_.MatVec8x4(thread_weights, rhs, thread_bias, nnz_per_row_.data(),
                        thread_rhs_indices, first_row, end_row, relu, replicas,
                        output_stride, thread_output);
    } else {
      matmul_.MatVec4x4(thread_weights, rhs, thread_bias, nnz_per_row_.data(),
                        thread_rhs_indices, first_row, end_row, relu, replicas,
                        output_stride, thread_output);
    }
  }

  // Number of rows. The MaskedSparseMatrix constructor rounds this up to a
  // whole number of blocks.
  int rows() const { return rows_; }
  // Number of raw (not block) columns.
  int cols() const { return cols_; }
  // Height and width of a weight block, in matrix elements.
  int block_height() const { return block_height_; }
  int block_width() const { return block_width_; }
  // Sparsity as recorded at construction or deserialization.
  float sparsity() const { return sparsity_; }
  // Thread count passed to the most recent effective PrepareForThreads().
  int num_threads() const { return num_threads_; }
  // Per-thread split of the rows.
  const ThreadBounds& thread_bounds() const { return thread_bounds_; }
  // Block-column index of every stored block, derived from the column deltas.
  const CacheAlignedVector<DeltaType>& rhs_indices() const {
    return rhs_indices_;
  }
  // Free-form name of the matrix; empty unless set or inherited from a source
  // matrix.
  const std::string& name() const { return name_; }
  void set_name(const std::string& name) { name_ = name; }
  // Row starts of |thread_bounds_|.
  const std::vector<int>& split_points() const {
    return thread_bounds_.row_starts();
  }

  std::size_t bytes() const {
    return weights_.size() * sizeof(WeightType) +
           col_deltas_.size() * sizeof(DeltaType) +
           nnz_per_row_.size() * sizeof(int);
  }

  // Multiplies a sparse matrix by a possibly dense matrix, as SpMM_bias above,
  // and then samples from the output (softmax distribution) layer.
  // This overload is selected when OutType is not a Fixed32 type. The product
  // is computed with relu disabled and with |barrier| handed to SpMM_bias;
  // the returned value is whatever out->Sample(temperature, gen, scratch)
  // returns.
  template <typename RhsClass, typename BiasClass, typename OutClass,
            typename BiasType = typename BiasClass::value_type,
            typename OutType = typename OutClass::value_type>
  typename std::enable_if<!IsFixed32Type<OutType>::value, int>::type
  SpMM_bias_Sample(const RhsClass& rhs, const BiasClass& bias, OutClass* out,
                   float temperature, int tid, SpinBarrier* barrier,
                   std::minstd_rand* gen,
                   CacheAlignedVector<float>* scratch) const {
    SpMM_bias(rhs, bias, out, /*relu=*/false, tid, barrier);
    return out->Sample(temperature, gen, scratch);
  }
  // Fixed32 version: selected when OutType is a Fixed32 type. The product is
  // computed with relu disabled and WITHOUT a barrier; |barrier| is instead
  // handed to out->ReducingSample(), whose return value is returned.
  template <typename RhsClass, typename BiasClass, typename OutClass,
            typename BiasType = typename BiasClass::value_type,
            typename OutType = typename OutClass::value_type>
  typename std::enable_if<IsFixed32Type<OutType>::value, int>::type
  SpMM_bias_Sample(const RhsClass& rhs, const BiasClass& bias, OutClass* out,
                   float temperature, int tid, SpinBarrier* barrier,
                   std::minstd_rand* gen,
                   CacheAlignedVector<float>* scratch) const {
    // We don't pass the barrier on, as we have more work to do.
    SpMM_bias(rhs, bias, out, /*relu=*/false, tid);
    return out->ReducingSample(gen, scratch, tid, temperature, barrier);
  }

  // Debugging aid: dumps the weights, the column deltas and the per-row
  // non-zero counts, each under its own heading, flushing std::cout after
  // every section.
  void Print() const {
    std::cout << "Weights\n";
    weights_.Print();
    std::cout << std::endl << "Deltas\n";
    col_deltas_.Print();
    std::cout << std::endl << "nnz\n";
    nnz_per_row_.Print();
    std::cout << std::endl;
  }

  // Prepares the matrix to be multiplied by |num_threads| cooperating threads.
  // The actual row split is delegated to |thread_bounds_|, which receives the
  // block shape, the thread count, the number of block rows that share one
  // cache line of OutType outputs, the number of block rows and the per-row
  // non-zero counts.
  //
  // As described for the split: rows are divided by amount of non-zero work so
  // that each thread gets a contiguous row range and writes only whole cache
  // lines; thread t does rows [ split_points[t], split_points[t + 1] ). Corner
  // cases where the non-zeros don't split nicely are not handled and are only
  // expected in tests. (These properties are implemented in ThreadBounds --
  // verify there.)
  //
  // |cache_line_size| overrides the built-in cache line size when >= 1 and is
  // meant for testing only.
  // Nothing is recomputed if |num_threads| equals the count already prepared.
  // CHECK-fails unless |num_threads| > 0. Returns the stored thread count.
  template <typename OutType = int32_t>
  int PrepareForThreads(int num_threads, int cache_line_size = -1) {
    CHECK_GT(num_threads, 0);
    if (num_threads != num_threads_) {
      num_threads_ = num_threads;
      const int rows_per_cache_line =
          ReducedRowsPerCacheLine<OutType>(cache_line_size);
      thread_bounds_.PrepareForThreads(block_width_, block_height_,
                                       num_threads_, rows_per_cache_line,
                                       reduced_rows_, nnz_per_row_.data());
    }
    return num_threads_;
  }

  // Computes and stores the |rhs_indices_| from the |col_deltas_|.
  void ComputeRHSIndices() {
    std::vector<int> cumulative_deltas = CumulativeColDeltas();
    std::vector<DeltaType> rhs_indices(cumulative_deltas.size() +
                                       reduced_rows_);
    int total_indices = 0;
    int delta_index = 0;
    for (int r = 0; r < reduced_rows_; ++r) {
      for (int n = 0; n < nnz_per_row_[r]; ++n, ++delta_index) {
        rhs_indices[total_indices++] =
            cumulative_deltas[delta_index] / block_width_;
      }
    }
    rhs_indices_ = CacheAlignedVector<DeltaType>(rhs_indices);
  }

  // Computes and stores the |col_deltas_| from the |rhs_indices_|.
  void ComputeColDeltas() {
    std::vector<int> col_deltas(rhs_indices_.size());
    int prev_index = 0;
    for (int i = 0; i < rhs_indices_.size(); ++i) {
      int offset = rhs_indices_[i] - prev_index;
      prev_index = rhs_indices_[i];
      col_deltas[i] = offset * block_width_ * sizeof(RhsType);
    }
    col_deltas_ = CacheAlignedVector<DeltaType>(col_deltas);
  }

  // Computes and returns the inclusive prefix sum of the deltas, ie absolute
  // positions.
  std::vector<int> CumulativeColDeltas() const {
    std::vector<int> cum_col_deltas(col_deltas_.size());
    for (int i = 0; i < col_deltas_.size(); ++i) {
      cum_col_deltas[i] = col_deltas_[i] / sizeof(RhsType);
      if (i > 0) cum_col_deltas[i] += cum_col_deltas[i - 1];
    }
    return cum_col_deltas;
  }

 private:
  // Size in bytes of the fixed-length header of the serialized form: eleven
  // ints (rows, cols, reduced_rows, reduced_cols, block_width, block_height,
  // col_multiple, num_threads, weights_.size(), col_deltas_.size(),
  // nnz_per_row_.size()) and one float (sparsity).
  constexpr std::size_t FixedParameterSize() const {
    return 11 * sizeof(int) + sizeof(float);
  }
  // Chooses |block_height_| x |block_width_| for |masked_matrix|. Candidate
  // shapes are tried in order of preference (currently only 4x4). A candidate
  // is accepted if the column count is a multiple of its width and every block
  // of the mask is uniform, i.e. all of its in-range elements equal the
  // block's top-left element. If no candidate fits, 1x1 is used.
  template <typename InputType>
  void DetermineBlockSize(const MaskedSparseMatrix<InputType>& masked_matrix) {
    const std::vector<std::pair<int, int>> kPreferredOrder = {{4, 4}};
    const int rows = masked_matrix.rows();
    const int cols = masked_matrix.cols();
    const auto& mask = masked_matrix.mask();

    for (const auto& candidate : kPreferredOrder) {
      const int block_height = candidate.first;
      const int block_width = candidate.second;
      if (cols % block_width != 0) continue;

      const int reduced_rows = (rows + block_height - 1) / block_height;
      const int reduced_cols = cols / block_width;

      // True iff every block is either all 0s or all 1s. Elements of the last
      // block row that lie past the end of the mask are ignored.
      const auto all_blocks_uniform = [&]() {
        for (int r = 0; r < reduced_rows; ++r) {
          for (int c = 0; c < reduced_cols; ++c) {
            const int corner = mask[r * block_height * cols + c * block_width];
            for (int i = 0; i < block_height; ++i) {
              for (int j = 0; j < block_width; ++j) {
                const int index =
                    (r * block_height + i) * cols + c * block_width + j;
                if (index < mask.size() && !(mask[index] == corner)) {
                  return false;
                }
              }
            }
          }
        }
        return true;
      };

      // If this block configuration is possible, accept it.
      if (all_blocks_uniform()) {
        block_height_ = block_height;
        block_width_ = block_width;
        return;
      }
    }

    // No larger block shape fits; fall back to single elements.
    block_height_ = 1;
    block_width_ = 1;
  }

  // Pads every block row so that its number of stored blocks becomes a
  // multiple of |col_multiple_|. Padding is done by switching on the first
  // unset positions of the row in |reduced_mask| (lowest column first) and
  // zeroing the corresponding weights, so the padded blocks contribute
  // nothing. A row with too few unset positions is padded as far as possible.
  // |row_offsets| and |reduced_mask| describe the reduced (block) matrix,
  // while |weights| is the full dense matrix.
  template <typename InputType>
  void MakeColumnsMultiple(const std::vector<int>& row_offsets,
                           std::vector<int>* reduced_mask,
                           std::vector<InputType>* weights) {
    if (col_multiple_ <= 0) return;

    const int num_rows = static_cast<int>(row_offsets.size()) - 1;
    for (int row = 0; row < num_rows; ++row) {
      const int stored = row_offsets[row + 1] - row_offsets[row];
      const int num_needed = col_multiple_ - stored % col_multiple_;
      // num_needed == col_multiple_ means the row is already a multiple.
      if (num_needed >= col_multiple_) continue;

      int num_added = 0;
      for (int c = 0; c < reduced_cols_ && num_added < num_needed; ++c) {
        int& mask_entry = (*reduced_mask)[row * reduced_cols_ + c];
        if (mask_entry != 0) continue;
        mask_entry = 1;

        // Zero out the weights that correspond to this block.
        for (int i = 0; i < block_height_; ++i) {
          const int dense_row_start = (row * block_height_ + i) * cols_;
          for (int j = 0; j < block_width_; ++j) {
            (*weights)[dense_row_start + block_width_ * c + j] =
                InputType(0.f);
          }
        }
        ++num_added;
      }
    }
  }

  // Converts the final mask and the dense |weights| into block CSR form.
  // For every block row one count is appended to |nnz_per_row|; for every set
  // mask entry the block column is appended to |col_indices| and the block's
  // weights (converted to WeightType) to |weights_csr|.
  // In the 1x1 case |mask| is indexed per element of the rows_ x cols_ matrix;
  // in the 4x4 case it is indexed per block, and each block's weights are
  // stored contiguously, row by row, with zeros for elements that lie past the
  // end of |weights|. Any other block shape produces no output.
  template <typename InputType>
  void MaskAndWeightsToCsr(const std::vector<int>& mask,
                           const std::vector<InputType>& weights,
                           std::vector<int>* nnz_per_row,
                           std::vector<int>* col_indices,
                           std::vector<WeightType>* weights_csr) {
    const bool is_1x1 = (block_width_ == 1 && block_height_ == 1);
    const bool is_4x4 = (block_width_ == 4 && block_height_ == 4);

    if (is_1x1) {
      // Standard CSR format.
      for (int r = 0; r < rows_; ++r) {
        int row_nnz = 0;
        for (int c = 0; c < cols_; ++c) {
          const int element = r * cols_ + c;
          if (mask[element] != 1) continue;
          ++row_nnz;
          col_indices->push_back(c);
          weights_csr->push_back(WeightType(weights[element]));
        }
        nnz_per_row->push_back(row_nnz);
      }
    } else if (is_4x4) {
      for (int r = 0; r < reduced_rows_; ++r) {
        int row_nnz = 0;
        for (int c = 0; c < reduced_cols_; ++c) {
          if (mask[r * reduced_cols_ + c] != 1) continue;
          col_indices->push_back(c);
          ++row_nnz;
          for (int i = 0; i < block_height_; ++i) {
            const int row_index = (block_height_ * r + i) * cols_;
            for (int j = 0; j < block_width_; ++j) {
              const int w_index = row_index + block_width_ * c + j;
              if (w_index < weights.size()) {
                weights_csr->push_back(WeightType(weights[w_index]));
              } else {
                weights_csr->push_back(WeightType(0.0f));
              }
            }
          }
        }
        nnz_per_row->push_back(row_nnz);
      }
    }
  }

  // Returns the number of block rows per cache line. This is the minimum unit
  // into which the calculation is broken for threads.
  template <typename OutType>
  int ReducedRowsPerCacheLine(int override_cache_line_size = -1) const {
    int line_size = kCacheLineSize;
    if (override_cache_line_size >= 1) line_size = override_cache_line_size;
    return std::max<int>(line_size / (block_height_ * sizeof(OutType)), 1);
  }

  // Each row's count of stored columns is padded to a multiple of this value
  // (8 for 1x1 blocks, 1 otherwise, when built from a MaskedSparseMatrix).
  int col_multiple_;
  // Matrix dimensions in elements.
  int rows_;
  int cols_;
  // Matrix dimensions in blocks.
  int reduced_rows_;
  int reduced_cols_;
  float sparsity_;
  // Block shape in elements.
  int block_width_;
  int block_height_;
  // Thread count of the last PrepareForThreads(); 0 forces a recomputation.
  int num_threads_;
  std::string name_;

  // Weights of the stored blocks.
  CacheAlignedVector<WeightType> weights_;
  // Byte distance in the rhs from each stored block to the next one.
  CacheAlignedVector<DeltaType> col_deltas_;
  // Number of stored blocks in each block row.
  CacheAlignedVector<int> nnz_per_row_;
  // |thread_bounds_| and |rhs_indices_| don't need to be serialized as they are
  // always recalculated from serialized data.
  CacheAlignedVector<DeltaType> rhs_indices_;
  Matmul<WeightType, RhsType> matmul_;
  ThreadBounds thread_bounds_;
  // Cache line size in bytes used when no override is given.
  static constexpr int kCacheLineSize = 64;
};

// Converts a sparse matrix represented with (|mask|, |weights|, |size|) into
// the CSR format, and returns that as a serialized string.
template <typename MaskType>
std::string ConvertDenseToSparseRepresentation_Int16Deltas(
    const std::vector<MaskType>& mask, const std::vector<float>& weights,
    const int rows, const int cols) {
  MaskedSparseMatrix<float> masked_weights(rows, cols, mask.data(),
                                           weights.data());
  CsrBlockSparseMatrix<csrblocksparse::bfloat16, float, int16_t>
      sparse_masked_weights(masked_weights);
  std::string buffer;
  sparse_masked_weights.WriteToFlatBuffer(&buffer);
  return buffer;
}

}  // namespace csrblocksparse
#endif  // LYRA_CODEC_SPARSE_MATMUL_LAYERS_CSR_BLOCKSPARSE_MATRIX_H_