File size: 82,881 Bytes

"""
    Author: Eric Lin (xihlin)
"""
"""
    ... note(bapatra)::
        This is written as one big file, instead of splitting into logical components because I was running into issues with transformers auto module
        imports when splitting into different files. I've tried keeping the logical partitions demarkated with comment blocks, but it is not ideal.
        In the future, would be really good to revisit this and refactor into a more readable file structure.

"""
from typing import TypeVar
from functools import lru_cache
import math
import pytest
import torch
import numpy as np

import triton
import triton.language as tl

import os

import dataclasses

Phi3SmallConfig = TypeVar('Phi3SmallConfig')

# triton 2.0.0: fail at backward on A100, for the examples, if h_dim=128.

# Done
#  1. strided of qkv
#  2. seq len not power of 2
#  3. bf16 with Triton May, 2023

# TODO:
#  1. wip: support non-contiguous backward, also help reduce memory allocation in training (q, k, v split)
#  2. block sparse with different BLOCK_M, BLOCK_N?
#  3. for Lq not divided by BLOCK_M, BLOCK_N, only apply mask to K/V on last batch, still need to apply mask on Q.
#     Attempt, fail to compile
#  4. For 2nd iter of inference,  BLOCK_M=1, how to make things work?  K/V maynot divided by BLOCK_N.
#  5. The inner loop can also be paralled via bigger num_stage(better) or on different thread-block (via m/L and atomic update, but this no-comm/sync between blocks)


###########################################################
################### Kernel Parameters #####################
###########################################################

@dataclasses.dataclass
class BlockSparseParams(object):
    block_size: int
    kernel_block_size: int
    num_local_blocks: int
    vert_stride: int
    homo_head_pattern: bool = False

    @classmethod
    def from_config(cls, config: Phi3SmallConfig) -> "BlockSparseParams":
        return cls(
            block_size=config.blocksparse_block_size,
            kernel_block_size=config.blocksparse_triton_kernel_block_size,
            num_local_blocks=config.blocksparse_num_local_blocks,
            vert_stride=config.blocksparse_vert_stride,
            homo_head_pattern=config.blocksparse_homo_head_pattern,
        )


###########################################################
###########################################################

###########################################################
################### Utility Functions #####################
###########################################################

# helper functions for 3D sparse pattern
# these function are not optimized and very inefficient. Avoid calling them too frequent.
# currently, it is only called within `get_local_strided_sparse_attention_op`, which is cached.
def dense_to_crow_col(x):
    ''' Turning a 2D/3D torch tensor (x) to CSR rows/cols indexing.
    param:
    TODO:
        1. improve efficiency, is it faster if done in CPU, or customize a cuda kernel for it?
    NOTE: col_indices padded -1
    '''
    pad = -1
    dim = x.dim()
    assert x.dim() in (2, 3)
    if x.dim() == 2:
        x = x[None]
    x = [xi.to_sparse_csr() for xi in x]
    crows = torch.vstack([xi.crow_indices() for xi in x])
    cols = [xi.col_indices() for xi in x]
    max_cols = max(len(xi) for xi in cols)
    cols = [torch.cat([xi, pad + xi.new_zeros(max_cols - xi.shape[0])]) for xi in cols]
    cols = torch.vstack(cols)
    if dim == 2:
        crows = crows[0]
        cols = cols[0]
    return crows, cols


def crow_col_to_dense(crows, cols, dtype=torch.float16):
    dim = crows.dim()
    if dim == 1:
        crows = crows[None]
        cols = cols[None]
    device = crows.device
    crows, cols = crows.cpu(), cols.cpu()  # faster in cpu
    shape = (crows.shape[0], crows.shape[1] - 1, cols.max() + 1)
    x = torch.zeros(shape, dtype=dtype)
    for i in range(shape[0]):
        for j in range(shape[1]):
            x[i, j, cols[i, crows[i, j]:crows[i, j+1]]] = 1
    if dim == 1:
        x = x[0]
    return x.to(device)


def dense_to_ccol_row(x):
    '''Similar, but to CSC format
    '''
    x = x.transpose(-2, -1)
    return dense_to_crow_col(x)


def ccol_row_to_dense(ccol, rows, dtype=torch.float16):
    return crow_col_to_dense(ccol, rows, dtype).permute(0, 2, 1).contiguous()


def _get_sparse_attn_mask_homo_head(q_len, N_CTX, dtype, device, BLOCK=128, local_blocks=4, vert_stride=4, return_dense=False):
    '''
    :return: a tuple of 3:
        - tuple of crow_indices, col_indices representation of CSR format.
        - block dense mask
        - all token dense mask (be aware that it can be OOM if it is too big) if `return_dense==True`, otherwise, None
    '''
    with torch.no_grad():
        N_BLOCK = triton.cdiv(N_CTX, BLOCK)
        q_pos = torch.arange(N_BLOCK)[:, None]
        k_pos = torch.arange(N_BLOCK)[None]
        mask_vert_strided = (torch.arange(N_BLOCK) + 1) % vert_stride == 0
        block_mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)).to(device).to(dtype)
        N_BLOCK_Q = triton.cdiv(q_len, BLOCK)
        block_mask_dense_output = block_mask_dense[-N_BLOCK_Q:].contiguous().to_sparse_csr()
    if return_dense:
        mask_dense = torch.kron(block_mask_dense, block_mask_dense.new_ones((BLOCK, BLOCK)))
        causal_mask = torch.tril(torch.ones(N_CTX, N_CTX)).type_as(mask_dense)[-q_len:]
        mask_dense = mask_dense[-q_len:, :N_CTX] * causal_mask
        return (block_mask_dense_output.crow_indices(), block_mask_dense_output.col_indices()), block_mask_dense, mask_dense
    else:
        return (block_mask_dense_output.crow_indices(), block_mask_dense_output.col_indices()), block_mask_dense, None


def _get_sparse_attn_mask(n_heads, q_len, N_CTX, dtype, device, BLOCK=128, local_blocks=4, vert_stride=4, homo_head=True, return_dense=False):
    '''
    :return: a tuple of 3:
        - tuple of crow_indices, col_indices representation of CSR format.
        - block dense mask
        - all token dense mask (be aware that it can be OOM if it is too big) if `return_dense==True`, otherwise, None
    '''
    if homo_head:
        with torch.no_grad():
            (crow, col), block_mask_dense, mask_dense = _get_sparse_attn_mask_homo_head(q_len, N_CTX, dtype, device, BLOCK, local_blocks, vert_stride, return_dense)
            crow = crow[None].expand(n_heads, crow.shape[0])
            col = col[None].expand(n_heads, col.shape[0])
            if return_dense:
                mask_dense = mask_dense[None].expand(n_heads, *mask_dense.shape)
            return (crow, col), block_mask_dense, mask_dense

    with torch.no_grad():
        N_BLOCK = triton.cdiv(N_CTX, BLOCK)
        q_pos = torch.arange(N_BLOCK)[None, :, None]
        k_pos = torch.arange(N_BLOCK)[None, None]
        head_sliding_step = max(1, int(vert_stride / n_heads))  # if vert_stride <= n_heads, rotating the heads
        mask_vert_strided = [(torch.arange(N_BLOCK) + h * head_sliding_step + 1) % vert_stride == 0 for h in range(n_heads)]
        mask_vert_strided = torch.vstack(mask_vert_strided).unsqueeze(1)
        block_mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)).to(device).to(dtype)
        N_BLOCK_Q = triton.cdiv(q_len, BLOCK)
        block_mask_dense_output = block_mask_dense[:, -N_BLOCK_Q:]
    if return_dense:
        mask_dense = torch.kron(block_mask_dense, block_mask_dense.new_ones((BLOCK, BLOCK)))
        causal_mask = torch.tril(torch.ones(N_CTX, N_CTX)).type_as(mask_dense)[-q_len:]
        mask_dense = mask_dense[..., -q_len:, :N_CTX] * causal_mask[None]
        return dense_to_crow_col(block_mask_dense_output), block_mask_dense, mask_dense
    else:
        return dense_to_crow_col(block_mask_dense_output), block_mask_dense, None


def get_sparse_attn_mask(q, N_CTX, *args, **kwargs):
    return _get_sparse_attn_mask(q.size(1), q.size(2), N_CTX, q.dtype, q.device, *args, **kwargs)

###########################################################
###########################################################

###########################################################
###################### Training Kernels ###################
###########################################################

# TODO: only apply loading/saving mask on the last iteration for EVEN_N_BLOCK, useful for 1st iteration of inference.
#    Experiment failed inside loop.
#    Another idea: only on saving? load even out of boundary(will it causes illegal access error)?
@triton.jit
def _fwd_kernel(
    Q, K, V, sm_scale,
    layout_crow_ptr,
    layout_col_ptr,
    layout_crow_stride_h, layout_crow_stride_m,
    layout_col_stride_h, layout_col_stride_m,
    TMP, L, M,  # NOTE: TMP is a scratchpad buffer to workaround a compiler bug. TMP, L, M are assumed to have contiguous layouts
    Out,
    stride_qz, stride_qh, stride_qm, stride_qd,
    stride_kz, stride_kh, stride_kn, stride_kd,
    stride_vz, stride_vh, stride_vn, stride_vd,
    stride_oz, stride_oh, stride_om, stride_od,
    Z, H, N_CTX,
    PAST_LEN,
    Q_ROUNDED_LEN,
    BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
    EVEN_M_BLOCK: tl.constexpr,
    EVEN_N_BLOCK: tl.constexpr,
    INFERENCE: tl.constexpr,
    NUM_DBLOCKS: tl.constexpr,
):
    Q_LEN = N_CTX - PAST_LEN
    start_m = tl.program_id(0)
    off_hz = tl.program_id(1)
    off_h = off_hz % H
    off_z = off_hz // H
    Q += off_z * stride_qz + off_h * stride_qh
    K += off_z * stride_kz + off_h * stride_kh
    V += off_z * stride_vz + off_h * stride_vh
    # initialize offsets
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_DMODEL)
    off_q = offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qd
    # off_k = offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd
    off_k = offs_n[None, :] * stride_kn + offs_d[:, None] * stride_kd
    off_v = offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd
    # Initialize pointers to Q, K, V
    q_ptrs = Q + off_q
    k_ptrs = K + off_k
    v_ptrs = V + off_v
    # initialize pointer to m and l
    t_ptrs = TMP + off_hz * Q_ROUNDED_LEN + offs_m
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
    if NUM_DBLOCKS >= 2:
        acc2 = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)

    # load q: it will stay in SRAM throughout
    if EVEN_M_BLOCK:
        q = tl.load(q_ptrs)
        if NUM_DBLOCKS >= 2:
            q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd)
    else:
        q = tl.load(q_ptrs, mask=offs_m[:, None] < Q_LEN)
        if NUM_DBLOCKS >= 2:
            q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd, mask=offs_m[:, None] < Q_LEN)

    layout_ptr = layout_crow_ptr + off_h * layout_crow_stride_h + start_m * layout_crow_stride_m
    start_l = tl.load(layout_ptr).to(tl.int32)
    end_l = tl.load(layout_ptr + layout_crow_stride_m).to(tl.int32)

    # loop over k, v and update accumulator
    for col_idx_idx in range(start_l, end_l):
        col_idx = tl.load(layout_col_ptr +  off_h * layout_col_stride_h + col_idx_idx * layout_col_stride_m).to(tl.int32)
        start_n = col_idx * BLOCK_N
        # -- compute qk ----
        if EVEN_N_BLOCK:
            k = tl.load(k_ptrs + start_n * stride_kn)
        else:
            k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_n[None, :] + start_n < N_CTX)
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, k)

        if NUM_DBLOCKS >= 2:
            if EVEN_N_BLOCK:
                k = tl.load(k_ptrs + start_n * stride_kn + BLOCK_DMODEL * stride_kd)
            else:
                k = tl.load(k_ptrs + start_n * stride_kn + BLOCK_DMODEL * stride_kd, mask=offs_n[None, :] + start_n < N_CTX)
            qk += tl.dot(q2, k)

        qk *= sm_scale
        qk += tl.where(offs_m[:, None] + PAST_LEN >= (start_n + offs_n[None, :]), 0, float('-inf'))
        # -- compute m_ij, p, l_ij
        m_ij = tl.max(qk, 1)
        p = tl.exp(qk - m_ij[:, None])
        l_ij = tl.sum(p, 1)
        # -- update m_i and l_i
        m_i_new = tl.maximum(m_i, m_ij)
        alpha = tl.exp(m_i - m_i_new)
        beta = tl.exp(m_ij - m_i_new)
        l_i_new = alpha * l_i + beta * l_ij
        # -- update output accumulator --
        # scale p
        p_scale = beta / l_i_new
        p = p * p_scale[:, None]
        # scale acc
        acc_scale = l_i / l_i_new * alpha
        # tl.store(t_ptrs, acc_scale)
        # acc_scale = tl.load(t_ptrs)  # BUG: have to store and immediately load
        acc = acc * acc_scale[:, None]
        if NUM_DBLOCKS >= 2:
            acc2 = acc2 * acc_scale[:, None]
        p = p.to(Q.dtype.element_ty)
        # update acc
        if EVEN_N_BLOCK:
            v = tl.load(v_ptrs + start_n * stride_vn)
        else:
            v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_n[:, None] + start_n < N_CTX)
        acc += tl.dot(p, v)

        if NUM_DBLOCKS >= 2:
            if EVEN_N_BLOCK:
                v = tl.load(v_ptrs + start_n * stride_vn + BLOCK_DMODEL * stride_vd)
            else:
                v = tl.load(v_ptrs + start_n * stride_vn + BLOCK_DMODEL * stride_vd, mask=offs_n[:, None] + start_n < N_CTX)
            acc2 += tl.dot(p, v)

        # update m_i and l_i
        l_i = l_i_new
        m_i = m_i_new

    # rematerialize offsets to save registers
    # start_m = tl.program_id(0)
    # offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    # write back l and m
    if not INFERENCE:
        l_ptrs = L + off_hz * N_CTX + offs_m
        m_ptrs = M + off_hz * N_CTX + offs_m
        if EVEN_M_BLOCK:
            tl.store(l_ptrs, l_i)
            tl.store(m_ptrs, m_i)
        else:
            tl.store(l_ptrs, l_i,  mask=offs_m < Q_LEN)
            tl.store(m_ptrs, m_i,  mask=offs_m < Q_LEN)
    # initialize pointers to output
    # offs_n = tl.arange(0, BLOCK_DMODEL)
    off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od
    out_ptrs = Out + off_o
    tl.store(out_ptrs, acc,  mask=offs_m[:, None] < Q_LEN)
    if NUM_DBLOCKS >= 2:
        tl.store(out_ptrs + BLOCK_DMODEL * stride_od, acc2,  mask=offs_m[:, None] < Q_LEN)


## backward
@triton.heuristics(
    {
        'EVEN_M_BLOCK': lambda kwargs: kwargs['N_CTX'] % kwargs['BLOCK_M'] == 0,
    }
)
@triton.jit
def _bwd_preprocess(
    Out, DO, L, # assume contiguous for Out, DO, L, NewDO, Delta layout.
    NewDO, Delta,
    N_CTX,
    BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr,
    EVEN_M_BLOCK: tl.constexpr,
):
    off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    off_d = tl.arange(0, D_HEAD)
    # load
    if EVEN_M_BLOCK:
        o = tl.load(Out + off_m[:, None] * D_HEAD + off_d[None, :]).to(tl.float32)
        do = tl.load(DO + off_m[:, None] * D_HEAD + off_d[None, :]).to(tl.float32)
    else:
        o = tl.load(Out + off_m[:, None] * D_HEAD + off_d[None, :], mask=off_m[:, None] < N_CTX).to(tl.float32)
        do = tl.load(DO + off_m[:, None] * D_HEAD + off_d[None, :], mask=off_m[:, None] < N_CTX).to(tl.float32)
    denom = tl.load(L + off_m).to(tl.float32)
    # compute
    do = do / denom[:, None]
    delta = tl.sum(o * do, axis=1)
    # write-back
    if EVEN_M_BLOCK:
        tl.store(NewDO + off_m[:, None] * D_HEAD + off_d[None, :], do)
    else:
        tl.store(NewDO + off_m[:, None] * D_HEAD + off_d[None, :], do,  mask=off_m[:, None] < N_CTX)
    tl.store(Delta + off_m, delta)


# Does not suuport unequal seqlen(q) and seqlen(k)
@triton.heuristics(
    {
        'EVEN_M_BLOCK': lambda kwargs: kwargs['N_CTX'] % kwargs['BLOCK_M'] == 0,
        'EVEN_N_BLOCK': lambda kwargs: kwargs['N_CTX'] % kwargs['BLOCK_N'] == 0,
    }
)
@triton.jit
def _bwd_kernel(
    Q, K, V, sm_scale,
    layout_ccol_ptr,
    layout_row_ptr,
    layout_ccol_stride_h, layout_ccol_stride_m,
    layout_row_stride_h, layout_row_stride_m,
    Out, DO,  # assume contigous: Out, Do, DQ, DK, DV, L, M, D, seq(q) == seq(k), with stride_oz, stride_oh, stride_om, stride_od,
    DQ, DK, DV,
    L, M,
    D,
    stride_qz, stride_qh, stride_qm, stride_qd,
    stride_kz, stride_kh, stride_kn, stride_kd,
    stride_vz, stride_vh, stride_vn, stride_vd,
    stride_oz, stride_oh, stride_om, stride_od,
    # stride_dz, stride_dh, stride_dm, stride_dd,
    Z, H, N_CTX,
    num_block,
    BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
    EVEN_M_BLOCK: tl.constexpr,
    EVEN_N_BLOCK: tl.constexpr,
    NUM_DBLOCKS: tl.constexpr,
):
    start_n = tl.program_id(0)
    off_hz = tl.program_id(1)
    off_z = off_hz // H
    off_h = off_hz % H
    # offset pointers for batch/head
    Q += off_z * stride_qz + off_h * stride_qh
    K += off_z * stride_kz + off_h * stride_kh
    V += off_z * stride_vz + off_h * stride_vh
    DO += off_z * stride_oz + off_h * stride_oh
    DQ += off_z * stride_oz + off_h * stride_oh
    DK += off_z * stride_oz + off_h * stride_oh
    DV += off_z * stride_oz + off_h * stride_oh
    # Look like this loop can be parallelled
    # for start_n in range(0, num_block):

    offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_m = tl.arange(0, BLOCK_M)
    offs_d = tl.arange(0, BLOCK_DMODEL)
    # initialize pointers to value-like data
    k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd)
    v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd)

    # pointer to row-wise quantities in value-like data
    D_ptrs = D + off_hz * N_CTX
    m_ptrs = M + off_hz * N_CTX
    # initialize dv amd dk
    dv = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
    dk = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
    # k and v stay in SRAM throughout
    if EVEN_N_BLOCK:
        k = tl.load(k_ptrs)
        v = tl.load(v_ptrs)
    else:
        k = tl.load(k_ptrs, mask=offs_n[:, None] < N_CTX)
        v = tl.load(v_ptrs, mask=offs_n[:, None] < N_CTX)

    if NUM_DBLOCKS >= 2:
        dv2 = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
        dk2 = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
        if EVEN_N_BLOCK:
            k2 = tl.load(k_ptrs + BLOCK_DMODEL * stride_kd)
            v2 = tl.load(v_ptrs + BLOCK_DMODEL * stride_vd)
        else:
            k2 = tl.load(k_ptrs + BLOCK_DMODEL * stride_kd, mask=offs_n[:, None] < N_CTX)
            v2 = tl.load(v_ptrs + BLOCK_DMODEL * stride_vd, mask=offs_n[:, None] < N_CTX)

    # loop over rows

    layout_ptr = layout_ccol_ptr + off_h * layout_ccol_stride_h + start_n * layout_ccol_stride_m
    start_l = tl.load(layout_ptr).to(tl.int32)
    end_l = tl.load(layout_ptr + layout_ccol_stride_m).to(tl.int32)

    for row_idx_idx in range(start_l, end_l):
        row_idx = tl.load(layout_row_ptr + off_h * layout_row_stride_h + row_idx_idx * layout_row_stride_m).to(tl.int32)
        start_m = row_idx * BLOCK_M

        # offs_qm = start_m + tl.arange(0, BLOCK_M)
        offs_m_curr = start_m + offs_m
        q_ptrs =   Q + (offs_m_curr[:, None] * stride_qm + offs_d[None, :] * stride_qd)
        do_ptrs = DO + (offs_m_curr[:, None] * stride_om + offs_d[None, :] * stride_od)
        dq_ptrs = DQ + (offs_m_curr[:, None] * stride_om + offs_d[None, :] * stride_od)

        # load q, k, v, do on-chip
        if EVEN_M_BLOCK:
            q = tl.load(q_ptrs)
        else:
            q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < N_CTX)
        # re-compute p = softmax(qk, dim=-1).T
        # NOTE: `do` is pre-divided by `l`; no normalization here
        qk = tl.dot(q, tl.trans(k))

        if NUM_DBLOCKS >= 2:
            if EVEN_M_BLOCK:
                q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd)
            else:
                q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd, mask=offs_m_curr[:, None] < N_CTX)
            qk += tl.dot(q2, tl.trans(k2))

        qk += tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), 0, float('-inf'))

        if EVEN_M_BLOCK:
            m = tl.load(m_ptrs + offs_m_curr)
        else:
            m = tl.load(m_ptrs + offs_m_curr, mask=offs_m_curr < N_CTX)
        p = tl.exp(qk * sm_scale - m[:, None])

        # compute dv
        if EVEN_M_BLOCK:
            do = tl.load(do_ptrs)
        else:
            do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < N_CTX)

        if NUM_DBLOCKS >= 2:
            if EVEN_M_BLOCK:
                do2 = tl.load(do_ptrs + BLOCK_DMODEL * stride_od)
            else:
                do2 = tl.load(do_ptrs + BLOCK_DMODEL * stride_od, mask=offs_m_curr[:, None] < N_CTX)

        dv += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do)

        if NUM_DBLOCKS >= 2:
            dv2 += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do2)

        # compute dp = dot(v, do)
        if EVEN_M_BLOCK:
            Di = tl.load(D_ptrs + offs_m_curr)
        else:
            Di = tl.load(D_ptrs + offs_m_curr, mask=offs_m_curr < N_CTX)
        dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
        dp += tl.dot(do, tl.trans(v))

        if NUM_DBLOCKS >= 2:
            dp += tl.dot(do2, tl.trans(v2))

        # compute ds = p * (dp - delta[:, None])
        ds = p * dp * sm_scale
        # compute dk = dot(ds.T, q)
        dk += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q)
        if NUM_DBLOCKS >= 2:
            dk2 += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q2)

        # # compute dq
        dq = tl.dot(ds.to(Q.dtype.element_ty), k)
        if EVEN_M_BLOCK:
            tl.atomic_add(dq_ptrs, dq)
        else:
            tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < N_CTX)

        if NUM_DBLOCKS >= 2:
            dq2 = tl.dot(ds.to(Q.dtype.element_ty), k2)
            dq_ptrs2 = dq_ptrs + BLOCK_DMODEL * stride_od
            if EVEN_M_BLOCK:
                tl.atomic_add(dq_ptrs2, dq2)
            else:
                tl.atomic_add(dq_ptrs2, dq2, mask=offs_m_curr[:, None] < N_CTX)

    # write-back
    dv_ptrs = DV + (offs_n[:, None] * stride_om + offs_d[None, :] * stride_od)
    dk_ptrs = DK + (offs_n[:, None] * stride_om + offs_d[None, :] * stride_od)
    if EVEN_N_BLOCK:
        tl.store(dv_ptrs, dv)
        tl.store(dk_ptrs, dk)
    else:
        tl.store(dv_ptrs, dv, mask=offs_n[:, None] < N_CTX)
        tl.store(dk_ptrs, dk, mask=offs_n[:, None] < N_CTX)

    if NUM_DBLOCKS >= 2:
        dv_ptrs2 = dv_ptrs + BLOCK_DMODEL * stride_od
        dk_ptrs2 = dk_ptrs + BLOCK_DMODEL * stride_od
        if EVEN_N_BLOCK:
            tl.store(dv_ptrs2, dv2)
            tl.store(dk_ptrs2, dk2)
        else:
            tl.store(dv_ptrs2, dv2, mask=offs_n[:, None] < N_CTX)
            tl.store(dk_ptrs2, dk2, mask=offs_n[:, None] < N_CTX)



def _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BLOCK_M, BLOCK_N, num_warps=None, num_stages=1, inference=None, out=None):
    '''
    :param q, k, v: [batch, n_heads, seq_len, model_dim]. len of q is allowed to be different than k/v.
    :param layout_crow_indices, layout_col_indices: same as CSR.crow_indices, and CSR.col_indices used to preresent a sparse tensor.
        Each element represent a block, i.e, all elements in a block to be attentdd, or not attended at all..
    '''
    assert q.shape[-1] == k.shape[-1] == v.shape[-1]
    assert k.shape[2] == v.shape[2]
    o = out if out is not None else torch.empty_like(q).contiguous()
    grid = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1])

    q_rounded_len = grid[0] * BLOCK_M
    tmp = torch.empty((q.shape[0] * q.shape[1], q_rounded_len), device=q.device, dtype=torch.float32)

    if inference is None:
        inference = (not q.requires_grad) and (not k.requires_grad)  and (not v.requires_grad)

    if inference:
        L, m = tmp, tmp  # no need to use create new tensor
    else:
        L = torch.empty((q.shape[0] * q.shape[1], q_rounded_len), device=q.device, dtype=torch.float32)
        m = torch.empty((q.shape[0] * q.shape[1], q_rounded_len), device=q.device, dtype=torch.float32)

    if layout_col_indices.dim() == 1:
        layout_crow_indices = layout_crow_indices[None].expand(q.shape[1] , -1)
        layout_col_indices = layout_col_indices[None].expand(q.shape[1] , -1)

    assert q.shape[-1] in [64, 128]
    BLOCK_DMODEL = 64

    if num_warps is None:
        MIN_D = min(BLOCK_M, BLOCK_N, BLOCK_DMODEL)
        num_warps = max(1, 2 ** int(math.log2(MIN_D / 16)))
        # print(f'> {BLOCK_M=}, {BLOCK_N=}, {BLOCK_DMODEL=}, {num_warps=}, {num_stages=}')
    else:
        assert math.log2(num_warps) % 1 == 0, f'''"num_warps" should be power of 2, but got {num_warps}.'''

    ## For debugging:
    # print(f'>> {q.shape=}, {k.shape=}, {BLOCK_M=}, {BLOCK_N=}, {num_warps=}, {BLOCK_DMODEL=}, {q.stride()=}, {k.stride()=}')
    # print(f'>> {layout_crow_indices=}\n{layout_col_indices=}\n {layout_crow_indices.stride()=}, {layout_crow_indices.stride()=}')
    # print(f'> {q.shape=}, {k.shape=}, {layout_crow_indices.shape}, {layout_col_indices.shape}, {layout_crow_indices.stride()}, \
    #   {layout_col_indices.stride()}, {layout_crow_indices=}, {layout_col_indices=}')

    with torch.cuda.device(q.device.index):
        _fwd_kernel[grid](
            q, k, v, sm_scale,
            layout_crow_indices,
            layout_col_indices,
            layout_crow_indices.stride(0), layout_crow_indices.stride(1),
            layout_col_indices.stride(0), layout_col_indices.stride(1),
            tmp, L, m,
            o,
            q.stride(0), q.stride(1), q.stride(2), q.stride(3),
            k.stride(0), k.stride(1), k.stride(2), k.stride(3),
            v.stride(0), v.stride(1), v.stride(2), v.stride(3),
            o.stride(0), o.stride(1), o.stride(2), o.stride(3),
            q.shape[0], q.shape[1], k.shape[2],
            k.shape[2] - q.shape[2],
            q_rounded_len,
            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
            BLOCK_DMODEL=BLOCK_DMODEL,
            EVEN_M_BLOCK=q.shape[2] % BLOCK_M == 0,
            EVEN_N_BLOCK=k.shape[2] % BLOCK_N == 0 ,
            INFERENCE=inference,
            NUM_DBLOCKS=q.shape[-1] // BLOCK_DMODEL,
            num_warps=num_warps,
            num_stages=num_stages,
        )
    if inference:
        L, m = None, None

    ctx.save_for_backward(q, k, v, o, L, m, layout_crow_indices,  layout_col_indices)
    ctx.BLOCK_M = BLOCK_M
    ctx.BLOCK_N = BLOCK_N
    ctx.BLOCK_DMODEL = BLOCK_DMODEL
    # ctx.BLOCK = BLOCK
    ctx.grid = grid
    ctx.sm_scale = sm_scale
    ctx.num_warps = num_warps
    ctx.num_stages = num_stages
    return o


def _backward(ctx, do, layout_ccol_indices, layout_row_indices, dq=None, dk=None, dv=None):
    # q, k, v, o, l, m = ctx.saved_tensors
    q, k, v, o, l, m, layout_crow_indices, layout_col_indices = ctx.saved_tensors

    ## this following too slow to do online, so get it from inputs, which is cached.
    # layout_ccol_indices, layout_row_indices = dense_to_ccol_row(crow_col_to_dense(ctx.layout_crow_indices, ctx.layout_col_indices))
    # layout_ccol_indices, layout_row_indices = dense_to_ccol_row(crow_col_to_dense(layout_crow_indices, layout_col_indices))

    if not do.is_contiguous():
        do = do.contiguous()
        ## for debugging
        # print(f'----> do is not contiguous: {do.stride()=}')
        # raise ValueError(f'>>>> output grad is not contiguous: {do.stride()=}')

    if not o.is_contiguous():
        # TODO: currently only work with contiguous q/k/v.
        raise ValueError(f'--> output is not contiguous: {o.stride()=}. This is maybe caused by q/k/v not being contiguous.')


    if layout_ccol_indices.dim() == 1:
        layout_ccol_indices = layout_ccol_indices[None].expand(q.shape[1], -1)
        layout_row_indices = layout_row_indices[None].expand(q.shape[1], -1)

    # do = do.contiguous()
    dq = dq if dq is not None else torch.zeros_like(q, dtype=torch.float32)
    dk = dk if dk is not None else torch.empty_like(k)
    dv =dv if dv is not None else  torch.empty_like(v)
    do_scaled = torch.empty_like(do)
    delta = torch.empty_like(l)

    assert o.stride() == dq.stride() == dk.stride() == dv.stride() == do_scaled.stride()

    _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )](
        o, do, l,
        do_scaled, delta,
        k.shape[2],
        BLOCK_M=ctx.BLOCK_M, D_HEAD=q.shape[-1],
    )

    grid = (triton.cdiv(q.shape[2], ctx.BLOCK_N), ctx.grid[1])

    _bwd_kernel[grid](
        q, k, v, ctx.sm_scale,
        layout_ccol_indices,
        layout_row_indices,
        layout_ccol_indices.stride(0), layout_ccol_indices.stride(1),
        layout_row_indices.stride(0), layout_row_indices.stride(1),
        o, do_scaled,
        dq, dk, dv,
        l, m,
        delta,
        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
        k.stride(0), k.stride(1), k.stride(2), k.stride(3),
        v.stride(0), v.stride(1), v.stride(2), v.stride(3),
        o.stride(0), o.stride(1), o.stride(2), o.stride(3),
        q.shape[0], q.shape[1], q.shape[2],
        ctx.grid[0],
        BLOCK_M=ctx.BLOCK_M,
        BLOCK_N=ctx.BLOCK_N,
        BLOCK_DMODEL=ctx.BLOCK_DMODEL,
        NUM_DBLOCKS=q.shape[-1] // ctx.BLOCK_DMODEL,
        num_warps=ctx.num_warps,
        num_stages=1,
    )
    return dq, dk, dv, None, None, None


class _sparse_attention(torch.autograd.Function):

    @staticmethod
    def forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale):
        BLOCK = 128
        # shape constraints
        return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BLOCK, BLOCK)

    @staticmethod
    def backward(ctx, do):
        # q, k, v, o, l, m = ctx.saved_tensors
        q, k, v, o, l, m, layout_crow_indices, layout_col_indices = ctx.saved_tensors
        # TODO: the following is very inefficient.
        # layout_ccol_indices, layout_row_indices = dense_to_ccol_row(crow_col_to_dense(ctx.layout_crow_indices, ctx.layout_col_indices))
        layout_ccol_indices, layout_row_indices = dense_to_ccol_row(crow_col_to_dense(layout_crow_indices, layout_col_indices))
        return _backward(ctx, do, layout_ccol_indices, layout_row_indices)



# suppressed
class _sparse_attention_inference(_sparse_attention):
    # TODO: does not work now, as BLOCK_M cannot be <1, as shape for tl.dot cannot be smaller than 16.
    @staticmethod
    def forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale):
        BLOCK = 128
        return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, 1, BLOCK)



def sparse_attention_factory(BLOCK_M=128, BLOCK_N=128, **kwargs):
    class _sparse_attention_config(_sparse_attention):
        @staticmethod
        def forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale):
            # shape constraints
            return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BLOCK_M, BLOCK_N,
                            **kwargs
                        )
    return _sparse_attention_config.apply


@lru_cache(maxsize=8)
def get_local_strided_sparse_attention_op(
        n_heads: int,
        max_seq_len:int,
        sparse_block_size: int=128,
        local_blocks: int=4,
        vert_stride: int=4,
        homo_head: bool=False,
        dtype=torch.bfloat16,
        device='cuda',
        active_head_range=None,
        verbose=True,
        **kwargs):
    '''
    :param n_heads: total number of attention heads (regardless of tensor/model parallel)
    :param max_seq_len: max sequence length. Need to be bigger or equal to the length of sequences.
    :param sparse_block_size: sparse block size. Default to 128
    :param local_blocks: number of nearest block to attend to. Default to 4, i.e., attention to previous 4xblock_size tokens.
    :param vert_stride: Default to 4. Meaning
    :param homo_head: if all head shared the same pattern.
    :param active_head_range: tuple of start & end of the heads, e..g, (8, 16). Default to use all heads.
                              Mainly for tensor/model parallelization where heads are splitted to different GPUs.
    '''

    if verbose:
        print((f'> new block_sparse_attn op constructed with config: '
            f'{n_heads=}, {max_seq_len=}, {sparse_block_size=}, {local_blocks=}, '
            f'{vert_stride=}, {homo_head=}, {active_head_range=}, {kwargs=}'))
    # assert math.log2(max_seq_len) % 2 == 0, f"max_seq_len should be power of 2 to be more efficient"
    _, block_sparse_pattern, _ = _get_sparse_attn_mask(n_heads, max_seq_len, max_seq_len, dtype, device,
                                                       BLOCK=sparse_block_size, local_blocks=local_blocks,
                                                       vert_stride=vert_stride, homo_head=homo_head,
                                                       return_dense=False)
    if (not homo_head) and (active_head_range is not None):
        assert isinstance(active_head_range, tuple)
        assert len(active_head_range) == 2, '"active_head_range" should be a tuple of start/end index of the heads.'
        h_start, h_end = active_head_range
        block_sparse_pattern = block_sparse_pattern[h_start:h_end]
    # print(block_sparse_pattern)
    return get_sparse_attn_op(block_sparse_pattern, sparse_block_size, **kwargs)


def get_sparse_attn_op(
        sparse_pattern: torch.tensor,
        sparse_block_size: int=128,
        kernel_block_size=128,
        qkv_format='q,k,v',
          **kwargs):
    '''
    Ccreate a block-sparse op with fixed layout. This is to avoid the need to of create CSR layout and convert it to CSC layout everytime,
        which is very inefficient (use python loops on CPU.  PyTorch 1.13 supports CSR->CSC, may help.)

    :param sparse_pattern: sparse pattern of the blocks. Should be `num_blocks(q) x num_blocks(k)` or `n_heads x num_blocks x num_blocks`.
        This tensor should have lower-triangular matrices on the last 2 dimensions for causal attention
    :param sparse_block_size: sparse block size. Default to 128
    :param kernel_block_size: the tile/block size to launch a triton instance. Default to None, i.e., same as `sparse_block_size`
    :param qkv_format: Choices=['q,k,v', 'q, kv', 'qkv'], i.e., separated q,k,v, or kv packed, or qkv packed. Currently, only 'q,k,v' is supported.

    :param kwargs: keyward arguments passed to `_forward`
    '''
    # assert qkv_format in ('q,k,v', 'q, kv', 'qkv')  # to save from running `concat` at forward/backward

    assert qkv_format == 'q,k,v'

    if kernel_block_size is None:
        kernel_block_size = sparse_block_size
    else:
        assert sparse_block_size % kernel_block_size == 0, f"The sparse block size must be a multiple of {kernel_block_size}."
        assert kernel_block_size >=16 and math.log2(kernel_block_size) % 1 == 0, f"block_size must be power of 2 and at least 16, but {kernel_block_size} is given"


        # print(f'>> {sparse_pattern.shape=}')
        # print(f'{sparse_pattern=}')
        if sparse_block_size // kernel_block_size > 1:
            _mul = sparse_block_size // kernel_block_size
            # need to consider if block_m and block_n are different
            sparse_pattern = torch.kron(sparse_pattern, sparse_pattern.new_ones(_mul, _mul))
            num_sparse_blocks = sparse_pattern.size(-1)
            block_causal_mask = torch.arange(0, num_sparse_blocks)[:, None] >= torch.arange(0, num_sparse_blocks)[None]
            sparse_pattern *= block_causal_mask.type_as(sparse_pattern)
            # print(f'>> after: {sparse_pattern.shape=}')
            # print(f'{sparse_pattern=}')

    BLOCK_N = kernel_block_size
    NUM_BLOCK =  sparse_pattern.size(-1)
    MAX_SEQ_LEN = kernel_block_size * NUM_BLOCK

    grand_layout_crow_indices, grand_layout_col_indices = dense_to_crow_col(sparse_pattern)
    # sparse csc layout for backward
    grand_layout_ccol_indices, grand_layout_row_indices = dense_to_ccol_row(sparse_pattern)


    # cache GPU backward layout. limit the size to avoid OOM as time goes.
    # For inference, one only needs to cache one block as sequence length always increases
    # Therefore, this cache needs to be reconstructed per every `block_size`-steps.
    # For training/finetune, set to 8 to increase cache hit.
    # Given an input, the block_len will be the same for all layers, so cache is very helpful.

    max_cache_size = 1 if kwargs.get('inference', False) else 8

    @lru_cache(maxsize=max_cache_size)
    def get_backward_layout_by_block_len(block_len):
        assert block_len <= NUM_BLOCK
        if block_len == NUM_BLOCK:
            return (grand_layout_ccol_indices, grand_layout_row_indices)
        return dense_to_ccol_row(sparse_pattern[..., :block_len, :block_len])

    # for debugging
    # if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
    #     print(f'> {sparse_pattern.cpu().tolist()=}')
    #     print('----')
    #     print(f'> {grand_layout_crow_indices.cpu().tolist()=}\n{grand_layout_col_indices.cpu().tolist()=}')


     # q, k, v separated
    class _q_k_v_sparse_attention(torch.autograd.Function):
        @staticmethod
        def forward(ctx, q, k, v, sm_scale):
            # assert q.shape[2] == 1 or q.shape[2] == k.shape[2]
            # shape constraints
            MIN_BLOCK_SIZE = 16
            assert BLOCK_N >= MIN_BLOCK_SIZE
            BLOCK_M = 16 if q.shape[2] <= 16 else BLOCK_N  # BLOCK_M has to be power of 2

            # this following code only works for causal attention
            K_BLOCKS = triton.cdiv(k.shape[2],  kernel_block_size)
            # Q_START_BLOCKS = K_BLOCKS - 1 if q.shape[2] == 1 else 0
            Q_START_BLOCKS = K_BLOCKS - triton.cdiv(q.shape[2], BLOCK_N)
            # print(Q_START_BLOCKS, K_BLOCKS)

            layout_crow_indices = grand_layout_crow_indices[..., Q_START_BLOCKS:K_BLOCKS+1]
            layout_col_indices = grand_layout_col_indices
            # print(BLOCK_M, BLOCK_N, Q_START_BLOCKS, K_BLOCKS+1, layout_crow_indices, layout_col_indices)

            return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BLOCK_M, BLOCK_N,
                            **kwargs
                        )
        @staticmethod
        def backward(ctx, do):
            q, k = ctx.saved_tensors[:2]
            assert q.shape[2] == k.shape[2], '> currently backward can only be done if q, k have same length. Contact @EricLin if you need it.'
            # assume q, k have same length
            block_len = triton.cdiv(do.shape[2], kernel_block_size)
            backward_layout = get_backward_layout_by_block_len(block_len)
            return _backward(ctx, do, *backward_layout)[:4]


    def _q_k_v_sparse_attention_fn(*args):
        return _q_k_v_sparse_attention.apply(*args)

    _q_k_v_sparse_attention_fn.sparse_pattern = sparse_pattern
    _q_k_v_sparse_attention_fn.grand_layout_crow_indices = grand_layout_crow_indices
    _q_k_v_sparse_attention_fn.grand_layout_col_indices = grand_layout_col_indices
    _q_k_v_sparse_attention_fn.grand_layout_ccol_indices = grand_layout_ccol_indices
    _q_k_v_sparse_attention_fn.grand_layout_row_indices = grand_layout_row_indices

    return _q_k_v_sparse_attention_fn

###########################################################
###########################################################

###########################################################
################ Inference Kernels ########################
###########################################################

def blocksparse_flash_attn_padded_fwd(
    q, k, v, # (batch, tokens, n_heads, head_size)
    sm_scale,
    sparse_layout,
    *,
    left_paddings = None,
    seqlens = None,
    block_size = 64,
    max_seqlen = None
):
    '''
    q, k, v: (batch, tokens, n_heads/n_kv_heads, head_size)
    left_paddings: (batch, ), number of left paddings for each sample.
    seqlens: can be used to specify right padding. No need to specify if left_paddings is used.
    '''
    batches, q_len, n_heads, head_size = q.shape
    _, k_len, n_kv_heads, _ = k.shape


    assert q.dim() == k.dim() == v.dim() == 4
    assert q.size(2) % k.size(2) == 0
    assert q.size(0) == k.size(0) and q.size(3) == k.size(3)
    assert k.shape == v.shape # TODO: allow diff head_size for k, v
    assert q_len == 1 or q_len == k_len, \
    f'q length can only 1 for decoding for same as k length for prefilling.'

    q_k_ratio = q.size(2) // k.size(2)

    if max_seqlen:
        assert k.size(1) <= max_seqlen, f'k has seqlen {k.size(1)} while max sequence length is set to {max_seqlen}.'

    # paddings always has zero output, a little slower than using empty
    out = q.new_zeros(q.shape)

    layout_crow_indices, layout_col_indices = sparse_layout
    block_d = triton.next_power_of_2(head_size)

    if left_paddings is not None:
        assert left_paddings.shape == (batches,)
        k_batch_starts = left_paddings.to(q.device, dtype=torch.int32).contiguous()
    else:
        k_batch_starts = torch.zeros((batches,), dtype=torch.int32, device=q.device)

    if seqlens is not None:
        k_batch_ends = k_batch_starts + seqlens.type_as(k_batch_starts)
        assert k_batch_ends.max() <= k_len, f'seqlens (+left_paddings if any) exceeds seqlen.'
    else:
        k_batch_ends = torch.zeros_like(k_batch_starts) + k_len

    if q_len == 1:
        q_batch_starts = torch.zeros_like(k_batch_starts)
        q_batch_ends = q_batch_starts + 1
    else:
        q_batch_starts = k_batch_starts
        q_batch_ends = k_batch_ends

    # switch to use cpu to avoid too many kernel lauch when iterate over
    q_lens = (q_batch_ends - q_batch_starts).cpu()
    n_blocks = (q_lens + block_size - 1) // block_size

    q_batch_ids = torch.tensor([i for i, n in enumerate(n_blocks) for _ in range(n)],
                                dtype=q_batch_starts.dtype,
                                device=q_batch_starts.device)
    q_start_sids = torch.tensor([i * block_size for n in n_blocks for i in range(n)],
                               dtype=q_batch_starts.dtype,
                               device=q_batch_starts.device)

    grid = (len(q_start_sids), n_heads)

    with torch.cuda.device(q.device.index):
        _fwd_kernel_batch_inference[grid](
            q, k, v, out,
            sm_scale,
            q_batch_starts,
            q_batch_ends,
            k_batch_starts,
            k_batch_ends,
            q_batch_ids,
            q_start_sids,

            *q.stride(),
            *k.stride(),
            *v.stride(),
            *out.stride(),

            layout_crow_indices,
            layout_col_indices,
            *layout_crow_indices.stride(),
            *layout_col_indices.stride(),

            q_k_ratio,
            HAS_BATCH_DIM = True,
            D_HEAD = head_size,
            BLOCK_M = block_size,
            BLOCK_N = block_size,
            BLOCK_D = block_d,
            BLOCK_M_LOADING = 16 if q_len == 1 else block_size, # smaller for decoding
            EVEN_D = block_d == head_size,
            num_warps = 1 if q_len == 1 else 4,
            num_stages = 3
        )

    return out


def blocksparse_flash_attn_varlen_fwd(
    q, k, v, # (#tokens, n_heads, head_size)
    cu_seqlens_k,
    cu_seqlens_q,
    sm_scale,
    sparse_layout,
    *,
    block_size=64,
    max_seqlen = None
):
    # split q to blocks
    _, n_heads, head_size = q.shape
    batch_size = cu_seqlens_k.size(0) - 1


    # print(f'> {q.shape=}, {k.shape=}')
    assert q.dim() == k.dim() == v.dim() == 3
    assert q.size(1) % k.size(1) == 0
    assert q.size(2) == k.size(2)
    assert k.shape == v.shape # TODO: allow diff head_size for k, v
    assert cu_seqlens_k.dim() == 1

    q_k_ratio = q.size(1) // k.size(1)

    if cu_seqlens_q is None:
        if q.size(0) == batch_size: # decoding only
            cu_seqlens_q = torch.arange(0, batch_size + 1,
                                        dtype=cu_seqlens_k.dtype,
                                        device=cu_seqlens_k.device)
        elif q.size(0) == k.size(0):
            cu_seqlens_q = cu_seqlens_k
        else:
            raise ValueError('cu_seqlens_q must be specified if it is mix of prefilling and decoding.')
    else:
        assert cu_seqlens_k.size(0) == cu_seqlens_q.size(0)

    # switch to use cpu to avoid too many kernel lauch when iterate over
    q_lens = (cu_seqlens_q[1:] - cu_seqlens_q[:-1]).cpu()
    k_lens = (cu_seqlens_k[1:] - cu_seqlens_k[:-1]).cpu()

    assert torch.logical_or(q_lens == 1, k_lens == q_lens).all(), \
        'length of q should either be 1 (decoding) or same as k (prefilling).'

    if max_seqlen:
        assert k_lens.max() <= max_seqlen

    n_blocks = (q_lens + block_size - 1) // block_size

    q_batch_ids = torch.tensor([i for i, n in enumerate(n_blocks) for _ in range(n)],
                                dtype=cu_seqlens_q.dtype,
                                device=cu_seqlens_q.device)
    q_start_sids = torch.tensor([i * block_size for n in n_blocks for i in range(n)],
                               dtype=cu_seqlens_q.dtype,
                               device=cu_seqlens_q.device)


    out = q.new_empty(q.shape)
    cu_seqlens_q = cu_seqlens_q.contiguous()
    cu_seqlens_k = cu_seqlens_k.contiguous()

    layout_crow_indices, layout_col_indices = sparse_layout
    block_d = triton.next_power_of_2(head_size)

    decoding_only =  (q_lens == 1).all()

    grid = (len(q_start_sids), n_heads)

    _fwd_kernel_batch_inference[grid](
    q, k, v, out,
    sm_scale,
    cu_seqlens_q[:-1],
    cu_seqlens_q[1:],
    cu_seqlens_k[:-1],
    cu_seqlens_k[1:],
    q_batch_ids,
    q_start_sids,

    0, *q.stride(),
    0, *k.stride(),
    0, *v.stride(),
    0, *out.stride(),

    layout_crow_indices,
    layout_col_indices,
    *layout_crow_indices.stride(),
    *layout_col_indices.stride(),

    q_k_ratio,
    HAS_BATCH_DIM = False,
    D_HEAD = head_size,
    BLOCK_M = block_size,
    BLOCK_N = block_size,
    BLOCK_D = block_d,
    BLOCK_M_LOADING = 16 if decoding_only else block_size, # smaller for decoding
    EVEN_D = block_d == head_size,
    num_warps = 1 if decoding_only else 4,
    num_stages = 3
    )

    return out


@triton.jit
def _fwd_kernel_inner(
    acc, l_i, m_i,
    q, Q,
    k_block_col_idx,
    layout_col_ptr,
    layout_col_stride_h, layout_col_stride_m,
    k_ptrs,
    v_ptrs,
    off_h, offs_m, offs_n, offs_d,
    stride_kt, stride_vt,
    sm_scale,
    k_seqlen,
    past_len,
    LAST_K_BLOCK: tl.constexpr,
    BLOCK_M_LOADING: tl.constexpr,
    BLOCK_N: tl.constexpr,
    D_HEAD: tl.constexpr,
    EVEN_D: tl.constexpr,
    M_LT_N: tl.constexpr
):
    k_block_id = tl.load(layout_col_ptr +  off_h * layout_col_stride_h + k_block_col_idx * layout_col_stride_m).to(tl.int32)
    start_n = k_block_id * BLOCK_N
    # -- compute qk ----
    if LAST_K_BLOCK:
        if EVEN_D:
            k = tl.load(k_ptrs + start_n * stride_kt,
                        mask=offs_n[None, :] + start_n < k_seqlen)
        else:
            # mask = mask & (offs_d[:, ])
            k = tl.load(k_ptrs + start_n * stride_kt,
                        mask=(offs_n[None, :] + start_n < k_seqlen) & (offs_d[:, None] < D_HEAD))
    else:
        if EVEN_D:
            k = tl.load(k_ptrs + start_n * stride_kt)
        else:
            k = tl.load(k_ptrs + start_n * stride_kt,
                        mask=offs_d[:, None] < D_HEAD)


    qk = tl.zeros([BLOCK_M_LOADING, BLOCK_N], dtype=tl.float32)
    qk += tl.dot(q, k)

    qk *= sm_scale

    # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N
    if LAST_K_BLOCK | M_LT_N:
        qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float('-inf'))

    # -- compute m_ij, p, l_ij
    m_ij = tl.max(qk, 1)
    p = tl.exp(qk - m_ij[:, None])

    l_ij = tl.sum(p, 1)
    # -- update m_i and l_i
    m_i_new = tl.maximum(m_i, m_ij)
    alpha = tl.exp(m_i - m_i_new)
    beta = tl.exp(m_ij - m_i_new)
    l_i_new = alpha * l_i + beta * l_ij
    # -- update output accumulator --
    # scale p
    p_scale = beta / l_i_new
    p = p * p_scale[:, None]
    # scale acc
    acc_scale = l_i / l_i_new * alpha
    acc = acc * acc_scale[:, None]

    p = p.to(Q.dtype.element_ty)
    # update acc
    if LAST_K_BLOCK:
        if EVEN_D:
            v = tl.load(v_ptrs + start_n * stride_vt,
                        mask=offs_n[:, None] + start_n < k_seqlen)
        else:
            v = tl.load(v_ptrs + start_n * stride_vt,
                        mask=(offs_n[:, None] + start_n < k_seqlen) & (offs_d[None, :] < D_HEAD))
    else:
        if EVEN_D:
            v = tl.load(v_ptrs + start_n * stride_vt)
        else:
            v = tl.load(v_ptrs + start_n * stride_vt,
                        mask=offs_d[None, :] < D_HEAD)

    acc += tl.dot(p, v)
    # update m_i and l_i
    l_i = l_i_new
    m_i = m_i_new
    return acc, l_i, m_i


@triton.heuristics(
    {
        'M_LT_N': lambda kwargs: kwargs['BLOCK_M'] < kwargs['BLOCK_N'],
    }
)
@triton.jit
def _fwd_kernel_batch_inference(
    Q, K, V, Out,

    sm_scale,
    q_batch_starts,
    q_batch_ends,
    k_batch_starts,
    k_batch_ends,
    q_batch_ids,
    q_start_sids,

    stride_qb, stride_qt, stride_qh, stride_qd,
    stride_kb, stride_kt, stride_kh, stride_kd,
    stride_vb, stride_vt, stride_vh, stride_vd,
    stride_ob, stride_ot, stride_oh, stride_od,

    layout_crow_ptr,
    layout_col_ptr,
    layout_crow_stride_h, layout_crow_stride_m,
    layout_col_stride_h, layout_col_stride_m,

    q_k_ratio,

    HAS_BATCH_DIM: tl.constexpr,
    D_HEAD: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_D: tl.constexpr,
    BLOCK_M_LOADING: tl.constexpr,
    EVEN_D: tl.constexpr,
    M_LT_N: tl.constexpr
):
    '''
    NOTATION:
    pid: position id
    sid: storage id
    sbid: storage block id
    pbid: position block id
    offs_m, offs_n: storage offsets of m-dim(q, row) and n-dim(k, col)

    q and blocks in KV needs to be contiguous

    Arguments:
    kv_seq_lens: for compute past_len
    kv_storage_offsets: similar to block_tables in vllm, except it is dynamic.
        TODO: fix this

    TODO:
    Optimize grouped-attn

    CUDA graph support issue
        1. grid is dynamic: vllm set up multiple cuda graph in decoding phase, with diff max token size (16, 32, ...)
            since we mix prompt and decoing phase here, it can be more complex.
            need to set up diff cuda-graph for diff (off_zm, off_z)

            # indeed, q_batch_ids can be padded to maximum number of grid[0], i.e., assume all decoding
            therefore, cu_seqlens_q, kv_seq_lens

    '''
    off_zm = tl.program_id(0)
    off_h = tl.program_id(1)

    off_h_for_kv = off_h // q_k_ratio
    off_z = tl.load(q_batch_ids + off_zm).to(tl.int32)   # [0, 0, 0, 1]
    q_start_sid = tl.load(q_start_sids + off_zm)
    start_m = q_start_sid // BLOCK_M

    if HAS_BATCH_DIM:
        Q += off_z * stride_qb
        K += off_z * stride_kb
        V += off_z * stride_vb
        Out += off_z * stride_ob

    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M_LOADING)
    offs_n = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_D)

    q_cu_start = tl.load(q_batch_starts + off_z).to(tl.int32)
    q_seqlen = tl.load(q_batch_ends + off_z).to(tl.int32) - q_cu_start

    k_cu_start = tl.load(k_batch_starts + off_z).to(tl.int32)
    k_seqlen = tl.load(k_batch_ends + off_z).to(tl.int32) - k_cu_start

    past_len = k_seqlen - q_seqlen

    Q += q_cu_start * stride_qt + off_h * stride_qh
    K += k_cu_start * stride_kt + off_h_for_kv * stride_kh
    V += k_cu_start * stride_vt + off_h_for_kv * stride_vh
    Out += q_cu_start * stride_ot + off_h * stride_oh

    q_pbid = (past_len + q_start_sid) // BLOCK_M

    if EVEN_D:
        q = tl.load(Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd,
                    mask=offs_m[:, None] < q_seqlen)
    else:
        q = tl.load(Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd,
                    mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD),
                    other=0)

    sparse_crow_ptr = layout_crow_ptr + off_h * layout_crow_stride_h + q_pbid * layout_crow_stride_m

    # TODO: load at once, supported in new Triton
    k_block_start = tl.load(sparse_crow_ptr).to(tl.int32)
    k_block_end = tl.load(sparse_crow_ptr + 1).to(tl.int32)

    m_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32) - float('inf')
    l_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M_LOADING, BLOCK_D], dtype=tl.float32)

    k_ptrs = K + offs_n[None, :] * stride_kt + offs_d[:, None] * stride_kd
    v_ptrs = V + offs_n[:, None] * stride_vt + offs_d[None, :] * stride_vd

    for k_block_col_idx in range(k_block_start, k_block_end - 1):
        acc, l_i, m_i = _fwd_kernel_inner(
            acc, l_i, m_i,
            q, Q,
            k_block_col_idx,
            layout_col_ptr,
            layout_col_stride_h, layout_col_stride_m,
            k_ptrs,
            v_ptrs,
            off_h, offs_m, offs_n, offs_d,
            stride_kt, stride_vt,
            sm_scale,
            k_seqlen,
            past_len,
            False,
            BLOCK_M_LOADING,
            BLOCK_N,
            D_HEAD,
            EVEN_D,
            M_LT_N
            )

    acc, l_i, m_i = _fwd_kernel_inner(
        acc, l_i, m_i,
        q, Q,
        k_block_end - 1,
        layout_col_ptr,
        layout_col_stride_h, layout_col_stride_m,
        k_ptrs,
        v_ptrs,
        off_h, offs_m, offs_n, offs_d,
        stride_kt, stride_vt,
        sm_scale,
        k_seqlen,
        past_len,
        True,
        BLOCK_M_LOADING,
        BLOCK_N,
        D_HEAD,
        EVEN_D,
        M_LT_N
        )

    # write output
    if EVEN_D:
        tl.store(Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od, acc,
                mask=offs_m[:, None] < q_seqlen)
    else:
        tl.store(Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od, acc,
                mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD))


###########################################################
###########################################################

###########################################################
################## Testing Utilities ######################
###########################################################


def torch_attention(q, k, v, attn_mask=None, sm_scale=None, block_attn_mask=None, block_size=128, do=None):
    '''
    q, k, v: shape=(batch, n_heads, seq, dim)
    '''
    # for verification
    if sm_scale is None:
        sm_scale = math.sqrt(float(q.size(-1)))

    if block_attn_mask is not None:
        assert attn_mask is None
        outs = []
        for s in range(0, q.size(2), block_size):
            e = min(s + block_size, q.size(2))
            q_block = q[:, :, s:e]
            attn = torch.einsum('bhmd,bhnd->bhmn', q_block, k[:, :, :e]).float() * sm_scale
            mask = block_attn_mask[..., s // block_size, : (s // block_size + 1)]
            mask = torch.kron(mask, torch.ones(block_size, block_size, device=mask.device))
            mask[..., :, s:].masked_fill_(torch.arange(0, block_size)[:, None] <= torch.arange(0, block_size)[None, :], 0)
            attn = attn.masked_fill((1 - mask).bool(), float('-inf'))
            attn = attn.softmax(-1)
            out = torch.einsum('bhmn,bhnd->bhmd', attn.type_as(v), v[:, :, :e])
            outs.append(out)
        torch_output = torch.cat(outs, dim=2)
    else:
        attn = torch.einsum('bhmd,bhnd->bhmn', q, k).float() * sm_scale
        # import ipdb; ipdb.set_trace()
        if attn_mask is not None:
            attn = attn.masked_fill((1 - attn_mask).bool(), float('-inf'))
        # print(f'> torch attn: {attn.exp().sum(-1)=}')

        attn = attn.softmax(-1)
        if do is not None:
            dv = torch.einsum('bhqk,bhqd->bhkd', attn.type_as(do), do)
            print(f'> torch_attn computed dv: {dv=}')
        torch_output = torch.einsum('bhmn,bhnd->bhmd', attn.type_as(v), v)
    return torch_output

###########################################################
###########################################################

###########################################################
#################### Unit Tests ###########################
###########################################################


@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(2, 8, 2048, 128), (1, 4, 4096, 64)])
def test_op(Z, H, N_CTX, D_HEAD, Q_LEN=None, dtype=torch.bfloat16, homo_head=True, kernel_block_size=None, sparse_block_size=128, backward=True,
            sparse_attention_fn=None, local_blocks=4, vert_stride=4, sm_scale=None, max_length=None):
    Q_LEN = Q_LEN or N_CTX
    torch.manual_seed(20)
    q = torch.empty((Z, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5) # .requires_grad_()
    k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5) # .requires_grad_()
    v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5) # .requires_grad_()

    if sm_scale is None:
        sm_scale = 1. / math.sqrt(D_HEAD)

    # for debugging
    # print(f'>> {q.shape=}, {k.shape=}, {v.shape=}, {homo_head=}, {kernel_block_size=}, {sparse_block_size=}, {local_blocks=}, {vert_stride=}')
    sm_scale = 0.0078125
    if backward:
        q.requires_grad_(), k.requires_grad_(), v.requires_grad_()

    # qkv = torch.empty((Z, N_CTX, 3*H*D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5)
    # q = qkv[..., :H*D_HEAD]
    # k = qkv[..., H*D_HEAD:2*H*D_HEAD]
    # v = qkv[..., 2*H*D_HEAD:]
    # q = q.view(Z, N_CTX, H, -1).permute(0, 2, 1, 3)
    # k = k.view(Z, N_CTX, H, -1).permute(0, 2, 1, 3)
    # v = v.view(Z, N_CTX, H, -1).permute(0, 2, 1, 3)

    # if Q_LEN and Q_LEN < N_CTX:
    #     q = q[:, :, -Q_LEN:] # .contiguous()

    # q = q.requires_grad_()
    # k = k.requires_grad_()
    # v = v.requires_grad_()

    dout = torch.randn_like(q).contiguous()

    # dout = torch.eye(N_CTX)[:, :D_HEAD][None, None].expand_as(q).type_as(q).contiguous()
    # print(dout)

    mask_csr, _, mask_dense = get_sparse_attn_mask(q, N_CTX, BLOCK=sparse_block_size,
                            local_blocks=local_blocks, vert_stride=vert_stride, homo_head=homo_head, return_dense=True)

    if sparse_attention_fn is None:
        sparse_attention_fn = get_local_strided_sparse_attention_op(H, N_CTX,
                                                                    sparse_block_size=sparse_block_size,
                                                                    local_blocks=local_blocks,
                                                                    vert_stride=vert_stride,
                                                                    homo_head=homo_head,
                                                                    device=q.device,
                                                                    dtype=q.dtype,
                                                                    kernel_block_size=kernel_block_size)
    # reference implementation
    ref_out = torch_attention(q, k, v, mask_dense, sm_scale)

    # lengths = torch.full((Z,), fill_value=N_CTX, device='cuda')
    # cu_seqlens = torch.zeros((Z + 1,), device='cuda', dtype=torch.int32)
    # cu_seqlens[1:] = lengths.cumsum(0)
    # # qkv = torch.randn((Z * N_CTX, 3, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)

    # qkv_list = list(map(lambda x: x.permute(0, 2, 1, 3).contiguous().view(Z * N_CTX, 1, H, D_HEAD), [q, k, v]))
    # qkv = torch.cat(qkv_list, dim=1)
    # ref_out0 = flash_attn_func(qkv, cu_seqlens, dropout_p=0, max_s=N_CTX, softmax_scale=sm_scale, causal=True)
    # ref_out = ref_out0.view(Z, N_CTX, H, D_HEAD).permute(0, 2, 1, 3).contiguous()


    if backward:
        ref_out.backward(dout)
        ref_dv, v.grad = v.grad.clone(), None
        ref_dk, k.grad = k.grad.clone(), None
        ref_dq, q.grad = q.grad.clone(), None

    tri_out = sparse_attention_fn(q, k, v, sm_scale)

    decimal = 1 if dtype == torch.bfloat16 else 2
    assert torch.allclose(ref_out.cpu(), tri_out.cpu(), atol=1e-2, rtol=0), f'>> {ref_out[0, 0, :, 0].tolist()=}\n\n{tri_out[0, 0, :, 0].tolist()=}'

    if backward:
        tri_out.backward(dout)
        tri_dv, v.grad = v.grad.clone(), None
        tri_dk, k.grad = k.grad.clone(), None
        tri_dq, q.grad = q.grad.clone(), None

    if backward:
        assert torch.allclose(ref_dv, tri_dv, atol=1e-2, rtol=1e-2)
        assert torch.allclose(ref_dk, tri_dk, atol=1e-2, rtol=0)
        assert torch.allclose(ref_dq, tri_dq, atol=1e-2, rtol=0)

    print(f'> test passed: {Z=}, {H=}, {N_CTX=}, {D_HEAD=}, {Q_LEN=}, {dtype=}, {homo_head=}, {sparse_block_size=}')

###########################################################

if __name__ == '__main__':

    GPU_TYPE = os.popen('nvidia-smi --query-gpu=name --format=csv | tail -n 1').read().strip()
    # print(GPU_TYPE)
    support_backward = True # 'A100' in GPU_TYPE. Wasn't supportted in consumer A1000.

    ###############
    # benchmarking

    HAS_DENSE_TRITON_FLASH = False
    # try:
    #     from triton.ops.flash_attention import attention as triton_attention
    #     HAS_DENSE_TRITON_FLASH = True
    # except:
    #     HAS_DENSE_TRITON_FLASH = False
    #     print('> cannot import Trition flash attn')

    try:
        from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_unpadded_func
        HAS_FLASH = True
    except BaseException:
        HAS_FLASH = False
        print('> cannot import flash_attn')


    # BATCH, N_HEADS, N_CTX, D_HEAD = 4, 48, 4096, 64
    BATCH, N_HEADS, N_CTX, D_HEAD = 4, 32, 4096, 128  # 6.7B model, with 4k len
    # BATCH, N_HEADS, N_CTX, D_HEAD = 4, 16, 4096, 128  # 204m model

    BLOCK_SIZE = 64
    LOCAl_BLOCKS = 8 # 4
    VERT_STRIDE = 1 # 16 # 8
    HOMO_HEAD = False
    sparse_type = 'home' if HOMO_HEAD else 'hetero'
    dtype = torch.bfloat16


    modes = ['fwd', 'bwd'] if support_backward else ['fwd']

    configs = [triton.testing.Benchmark(
        x_names=['SEQ_LEN'],
        x_vals=[2**i for i in range(8, 16)],
        line_arg='provider',
        line_vals=(['triton'] if HAS_DENSE_TRITON_FLASH else []) + (['flash'] if HAS_FLASH else []) + ['triton_sparse'],
        line_names=(['Triton-Dense'] if HAS_DENSE_TRITON_FLASH else [])  + (['Flash-Dense'] if HAS_FLASH else []) + ['Triton-Sparse'],
        styles=[('red', '-'), ('blue', '-'), ('green', '-')],
        ylabel='ms',
        plot_name=f'fused-attention-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-sparse-local{LOCAl_BLOCKS}-vert{VERT_STRIDE}-{sparse_type}-{dtype}-{mode}',
        args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'dtype': dtype, 'mode': mode}
    ) for mode in modes]


    @triton.testing.perf_report(configs)
    def bench_flash_attention(BATCH, H, SEQ_LEN, D_HEAD, mode, provider, dtype=torch.bfloat16, device='cuda', sparse_attention_fn=None):
        assert mode in ['fwd', 'bwd']
        warmup = 25
        rep = 100
        N_CTX = SEQ_LEN
        if provider == 'triton':
            q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
            k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
            v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
            sm_scale = 1.3
            fn = lambda: triton_attention(q, k, v, sm_scale)
            if mode == 'bwd':
                o = fn()
                do = torch.randn_like(o)
                fn = lambda: o.backward(do, retain_graph=True)
            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
            return ms
        if provider == 'triton_sparse':
            q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
            k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
            v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
            sm_scale = 1.3
            # q_pos = torch.arange(N_CTX // BLOCK, device='cuda')[:, None]
            # k_pos = torch.arange(N_CTX // BLOCK, device='cuda')[None]
            # local_blocks = 4 # num_block per attn, block_size is tied to BLOCK
            # vert_stride =N_CTX + 1 # 4
            # mask_vert_strided = torch.arange(N_CTX // BLOCK, device='cuda') % vert_stride == vert_stride - 1
            # mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)).type_as(q)
            # mask = mask_dense.to_sparse_csr()
            # mask_csr, _ = get_sparse_attn_mask(q, N_CTX, BLOCK=BLOCK, local_blocks=LOCAl_BLOCKS, vert_stride=VERT_STRIDE, homo_head=HOMO_HEAD)

            if sparse_attention_fn is None:
                # sparse_attention_fn = sparse_attention
                sparse_attention_fn = get_local_strided_sparse_attention_op(H, SEQ_LEN,
                                                                            local_blocks=LOCAl_BLOCKS,
                                                                            vert_stride=VERT_STRIDE,
                                                                            homo_head=HOMO_HEAD,
                                                                            sparse_block_size=BLOCK_SIZE,
                                                                            kernel_block_size=BLOCK_SIZE,
                                                                            device=q.device)
            # sparse_attention_fn = sparse_attention_factory(128, 128, num_warps=8)

            # fn = lambda: sparse_attention_fn(q, k, v, mask_csr[0], mask_csr[1], sm_scale)
            fn = lambda: sparse_attention_fn(q, k, v, sm_scale)
            if mode == 'bwd':
                o = fn()
                do = torch.randn_like(o)
                fn = lambda: o.backward(do, retain_graph=True)
            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
            return ms
        if provider == 'flash':
            lengths = torch.full((BATCH,), fill_value=N_CTX, device=device)
            cu_seqlens = torch.zeros((BATCH + 1,), device=device, dtype=torch.int32)
            cu_seqlens[1:] = lengths.cumsum(0)
            qkv = torch.randn((BATCH * N_CTX, 3, H, D_HEAD), dtype=dtype, device=device, requires_grad=True)
            fn = lambda: flash_attn_func(qkv, cu_seqlens, 0., N_CTX, causal=True)
            if mode == 'bwd':
                o = fn()
                do = torch.randn_like(o)
                fn = lambda: o.backward(do, retain_graph=True)
            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
            return ms

        # if provider == 'torch':
        #     q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
        #     k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
        #     v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
        #     sm_scale = 1.3
        #     causal_mask = torch.tril(torch.ones(N_CTX, N_CTX)).type_as(q)
        #     fn = lambda:  torch_attention(q, k, v, causal_mask, sm_scale)
        #     ms = triton.testing.do_bench(fn, percentiles=None, warmup=warmup, rep=rep)
        #     return ms


    BATCH, N_HEADS, N_CTX, D_HEAD, Q_LEN = 4, 32, 4096, 128, 1  # 6.7B model, with 4k len

    BLOCK_SIZE = 64
    LOCAl_BLOCKS = 8 # 4
    VERT_STRIDE = 16 # 8
    HOMO_HEAD = False
    sparse_type = 'home' if HOMO_HEAD else 'hetero'
    dtype = torch.bfloat16
    MAX_N_CTX = 8192

    configs = [triton.testing.Benchmark(
        x_names=['PAST_LEN'],
        x_vals=[2**i - 1 for i in range(8, 14)],
        line_arg='provider',
        line_vals=['torch'] + (['flash'] if HAS_FLASH else []) + ['triton_sparse', 'triton_dense'],
        line_names=['Torch']  + (['Flash-Dense'] if HAS_FLASH else []) + ['Triton-Sparse', 'Triton-Dense'],
        styles=[('red', '-'), ('blue', '-'), ('green', '-'), ('cyan', '-')],
        ylabel='ms',
        plot_name=f'fused-attention-inference-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-sparse-local{LOCAl_BLOCKS}-vert{VERT_STRIDE}-{sparse_type}',
        args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'Q_LEN': Q_LEN, 'dtype': torch.float16, 'mode': mode}
    ) for mode in ['fwd']]
    @triton.testing.perf_report(configs)
    def bench_flash_attention_inference(BATCH, H, PAST_LEN, D_HEAD, Q_LEN, mode, provider, dtype=torch.bfloat16, device='cuda'):
        assert mode in ['fwd']
        warmup = 25
        rep = 100
        N_CTX = PAST_LEN + Q_LEN
        if provider == 'torch':
            q = torch.randn((BATCH, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            sm_scale = 1.3
            mask_csr, _, mask_dense = get_sparse_attn_mask(q, N_CTX, BLOCK=BLOCK_SIZE,
                                    local_blocks=LOCAl_BLOCKS, vert_stride=VERT_STRIDE, homo_head=VERT_STRIDE, return_dense=True)

            fn = lambda: torch_attention(q, k, v, mask_dense, sm_scale=sm_scale, block_size=2048)
            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
            return ms
        if provider == 'triton_sparse':
            q = torch.randn((BATCH, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            sm_scale = 1.3
            sparse_attention_fn = get_local_strided_sparse_attention_op(H, MAX_N_CTX,
                                                                        local_blocks=LOCAl_BLOCKS,
                                                                        vert_stride=VERT_STRIDE,
                                                                        homo_head=HOMO_HEAD,
                                                                        sparse_block_size=BLOCK_SIZE,
                                                                        kernel_block_size=BLOCK_SIZE,
                                                                        device=q.device,
                                                                        inference=True)

            fn = lambda: sparse_attention_fn(q, k, v, sm_scale)
            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
            return ms
        if provider == 'triton_dense':
            q = torch.randn((BATCH, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            sm_scale = 1.3
            sparse_attention_fn = get_local_strided_sparse_attention_op(H, MAX_N_CTX,
                                                                        local_blocks=1,
                                                                        vert_stride=1,
                                                                        homo_head=True,
                                                                        sparse_block_size=BLOCK_SIZE,
                                                                        kernel_block_size=BLOCK_SIZE,
                                                                        device=q.device,
                                                                        inference=True)

            fn = lambda: sparse_attention_fn(q, k, v, sm_scale)
            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
            return ms
        if provider == 'flash':
            assert Q_LEN == 1
            lengths = torch.full((BATCH,), fill_value=N_CTX, device=device)
            cu_seqlens = torch.zeros((BATCH + 1,), device=device, dtype=torch.int32)
            cu_seqlens[1:] = lengths.cumsum(0)
            cu_seqlens_q = torch.arange(BATCH + 1, device=device, dtype=torch.int32)

            # (total_q, nheads, headdim),
            q = torch.randn((BATCH, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            k = torch.randn((BATCH*N_CTX, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
            v = torch.randn((BATCH*N_CTX, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)

            fn = lambda: flash_attn_unpadded_func(q, k, v, cu_seqlens_q, cu_seqlens, 1, N_CTX, dropout_p=0, softmax_scale=1.3, causal=False)
            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
            return ms


    test_op(1, 4, 512, 128, dtype=torch.float16, homo_head=False, backward=support_backward)
    # bench_flash_attention.run(save_path='.', print_data=True)

    bench_flash_attention_inference.run(save_path='.', print_data=True)
    exit()
    # head_dim=64
    test_op(1, 2, 1024, 64, kernel_block_size=64, sparse_block_size=64,
            dtype=torch.bfloat16, homo_head=False, backward=support_backward)
    # uneven length, bf16
    test_op(1, 16, 224, 128, dtype=torch.bfloat16, homo_head=False, backward=False, sparse_block_size=128,
            kernel_block_size=64, local_blocks=8, vert_stride=8)
    test_op(3, 2, 2047, 128, homo_head=False, backward=False)

    # diff kernel/sparse block size
    test_op(1, 16, 224, 128, dtype=torch.bfloat16, homo_head=False, backward=False, kernel_block_size=64)
    # inference
    # test_op(1, 4, 512 + 256, 128, Q_LEN=1,  dtype=torch.bfloat16, homo_head=False, backward=support_backward)

    # dense flash attn
    test_op(1, 2, 1024, 128, kernel_block_size=128, sparse_block_size=128, dtype=torch.bfloat16, homo_head=False,
            backward=support_backward, local_blocks=1, vert_stride=1)

    # fp16
    test_op(1, 4, 512 + 256, 128, dtype=torch.float16, homo_head=False, backward=support_backward)

    # longer sequence
    test_op(2, 4, 8192, 64, homo_head=False, backward=support_backward)
    test_op(2, 4, 8192, 128, dtype=torch.bfloat16, homo_head=False, backward=support_backward)

    # homo head
    test_op(3, 2, 2048, 64, homo_head=True, dtype=torch.bfloat16, backward=False)
    test_op(3, 2, 2048, 64, homo_head=True, backward=support_backward)

    # sparse_attention_fn = sparse_attention_factory(16, 128, num_warps=1, INFERENCE=True)
    # test_op(8, 1, 2047, 128, 1, backward=False, sparse_attention_fn=None)
    # test_op_inference(3, 2, 2048, 128, 2048)
    # test_op_inference(3, 2, 2047, 64, 2047)
    # test_op_inference(3, 2, 256, 64, 128)
    # test_op_inference(3, 2, 2048, 64, 1)

    bench_flash_attention.run(save_path='.', print_data=True)
    # bench_flash_attention_inference.run(save_path='.', print_data=True)

# ========================
# Some Benchmark Results #
# ========================

# fused-attention-batch4-head48-d64-sparse-local4-vert4-hetero-fwd
#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
# 0    256.0      0.057184     0.069646       0.052567
# 1    512.0      0.131688     0.187658       0.110212
# 2   1024.0      0.391844     0.524990       0.247875
# 3   2048.0      1.305190     1.456685       0.596506
# 4   4096.0      4.623019     4.968653       1.600277
# 5   8192.0     17.513062    18.332262       4.802458
# 6  16384.0     68.453377    70.337540      16.052908
# 7  32768.0    270.655487   276.020233      57.938946
# fused-attention-batch4-head48-d64-sparse-local4-vert4-hetero-bwd (num_warp=8):
# SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
# 0    256.0      0.190120     0.150313       0.181451
# 1    512.0      0.406348     0.391767       0.391177
# 2   1024.0      1.029704     1.182967       0.885741
# 3   2048.0      2.985456     3.843399       2.040469
# 4   4096.0      9.808897    13.073701       5.069609
# 5   8192.0     34.995201    47.863808      13.948782
# 6  16384.0    132.740097   182.579193      42.816513
# 7  32768.0    542.223389   714.820618     147.053574
# fused-attention-inference-batch4-head32-d128-sparse-local4-vert4-hetero:
# PAST_LEN  Torch-Dense  Flash-Dense  Triton-Sparse
# 0     256.0     0.050949     0.032357       0.107513
# 1     512.0     0.073624     0.050651       0.199086
# 2    1024.0     0.107472     0.080379       0.245445
# 3    2048.0     0.178423     0.129448       0.338259
# 4    4096.0     0.327647     0.223106       0.517048
# 5    8192.0     0.588423     0.411263       0.884606
# 6   16384.0     1.098898     0.798941       1.611809
# 7   32768.0     2.094537     1.594726       3.044160


# 6.7B
# fused-attention-batch4-head32-d128-sparse-local4-vert4-hetero-fwd:
#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
# 0    256.0      0.069208     0.082156       0.065097
# 1    512.0      0.138271     0.201393       0.144467
# 2   1024.0      0.391521     0.624614       0.322382
# 3   2048.0      1.268443     2.406325       0.784367
# 4   4096.0      4.455703     9.139097       2.100856
# 5   8192.0     16.764315    35.289600       6.328320
# 6  16384.0     65.221634   138.401794      21.069057
# 7  32768.0    257.251343   548.085754      76.111870
# fused-attention-batch4-head32-d128-sparse-local4-vert4-hetero-bwd:
#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
# 0    256.0      0.297118     0.266469       0.255255
# 1    512.0      0.672826     0.613685       0.552954
# 2   1024.0      1.718434     1.705066       1.251953
# 3   2048.0      4.936755     5.403875       2.927895
# 4   4096.0     15.911594    18.959362       7.436288
# 5   8192.0     55.357441    70.808578      21.140224
# 6  16384.0    208.188416   273.617920      68.018173
# 7  32768.0    806.037476  1081.453613     218.720261
# fused-attention-inference-batch4-head32-d128-sparse-local4-vert4-hetero:
#    PAST_LEN  Torch-Dense  Flash-Dense  Triton-Sparse
# 0     256.0     0.050151     0.032337       0.107593
# 1     512.0     0.073409     0.051737       0.200200
# 2    1024.0     0.107533     0.082099       0.247067
# 3    2048.0     0.177259     0.128891       0.338510
# 4    4096.0     0.325866     0.223621       0.524842
# 5    8192.0     0.586926     0.408913       0.885490
# 6   16384.0     1.100834     0.793277       1.612271
# 7   32768.0     2.098851     1.595831       3.064544

# fused-attention-batch4-head32-d128-sparse-local4-vert8-hetero-fwd:
#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
# 0    256.0      0.066673     0.082037       0.065085
# 1    512.0      0.137379     0.201880       0.143473
# 2   1024.0      0.390675     0.624234       0.312046
# 3   2048.0      1.267739     2.406950       0.696045
# 4   4096.0      4.445138     9.136333       1.665788
# 5   8192.0     16.768614    35.265533       4.380486
# 6  16384.0     65.235970   138.393600      12.997633
# 7  32768.0    257.317902   550.442993      42.821121
# fused-attention-batch4-head32-d128-sparse-local4-vert8-hetero-bwd:
#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
# 0    256.0      0.296461     0.266581       0.254022
# 1    512.0      0.671427     0.613643       0.551283
# 2   1024.0      1.719918     1.704295       1.229982
# 3   2048.0      4.945305     5.403364       2.721906
# 4   4096.0     15.934293    18.960999       6.259371
# 5   8192.0     55.406593    70.832130      15.676929
# 6  16384.0    208.750595   275.004425      44.837891
# 7  32768.0    808.057861  1080.647705     141.856766
# fused-attention-inference-batch4-head32-d128-sparse-local4-vert8-hetero:
#    PAST_LEN  Torch-Dense  Flash-Dense  Triton-Sparse
# 0     256.0     0.050739     0.032886       0.107837
# 1     512.0     0.073507     0.051996       0.200293
# 2    1024.0     0.106394     0.080679       0.240610
# 3    2048.0     0.177659     0.127660       0.287625
# 4    4096.0     0.326326     0.226971       0.377500
# 5    8192.0     0.586339     0.407367       0.559266
# 6   16384.0     1.102279     0.786221       0.920976
# 7   32768.0     2.097370     1.545090       1.644288


################
##### fp16 #####
################

# fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-fwd:
#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
# 0    256.0      0.032518     0.035472       0.029939
# 1    512.0      0.054266     0.087841       0.054320
# 2   1024.0      0.133447     0.263090       0.102045
# 3   2048.0      0.384615     1.023293       0.201763
# 4   4096.0      1.300890     4.023936       0.449555
# 5   8192.0      4.774144    15.816704       1.150854
# 6  16384.0     18.220032    62.771198       3.356001
# 7  32768.0     71.405571   250.273788      10.976142
# fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-bwd:
#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
# 0    256.0      0.083342     0.069742       0.079496
# 1    512.0      0.159894     0.170995       0.151705
# 2   1024.0      0.386071     0.522407       0.331443
# 3   2048.0      1.067715     1.737333       0.715248
# 4   4096.0      3.382731     6.219520       1.597457
# 5   8192.0     11.857793    23.560448       3.879035
# 6  16384.0     44.422142    91.251709      10.626843
# 7  32768.0    175.011841   359.473145      32.340992


################
##### bf16 #####
################

# fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-fwd:
#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
# 0    256.0      0.037636     0.035902       0.031512
# 1    512.0      0.058591     0.087229       0.058125
# 2   1024.0      0.143337     0.263919       0.108443
# 3   2048.0      0.414458     1.025985       0.214114
# 4   4096.0      1.390841     4.020010       0.480550
# 5   8192.0      5.067938    15.808171       1.230874
# 6  16384.0     19.442280    62.765057       3.597274
# 7  32768.0     75.501572   250.443771      11.768959
# fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-bwd:
#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
# 0    256.0      0.084404     0.070663       0.082613
# 1    512.0      0.161510     0.172882       0.157661
# 2   1024.0      0.388954     0.526047       0.339855
# 3   2048.0      1.075814     1.736057       0.732420
# 4   4096.0      3.401622     6.221376       1.636039
# 5   8192.0     11.915136    23.483391       3.968725
# 6  16384.0     44.660225    91.302910      10.857130
# 7  32768.0    175.038467   359.048187      32.778240