Resolve - 196 [rank0]: triton.runtime.autotuner.OutOfResources: out of resource: shared memory, Required: 180224, Hardware limit: 101376. Reducing block sizes or `num_stages` may help. Fix: lower `num_stages` from 3 to 1 in the kernel launch in `blocksparse_flash_attn_padded_fwd`.

Files changed (1) hide show

triton_flash_blocksparse_attn.py CHANGED Viewed

@@ -1020,7 +1020,7 @@ def blocksparse_flash_attn_padded_fwd(
     BLOCK_M_LOADING = 16 if q_len == 1 else block_size, # smaller for decoding
     EVEN_D = block_d == head_size,
     num_warps = 1 if q_len == 1 else 4,
-    num_stages = 3
+    num_stages = 1 # <---- instead of 3
     )
     return out